discovery-indexer 0.11.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 51776b99172c16f9360bd19a8c51406a2e6379b1
4
- data.tar.gz: c52db6afd40ad759784b4a9686fa3b61ffc55698
3
+ metadata.gz: 0da2f698e5d93f7b4c2e2413ec16cf6a96bf0a89
4
+ data.tar.gz: 874859db3a705393f1b801583c58800ee5dc8004
5
5
  SHA512:
6
- metadata.gz: 289af09b8cbf7b4dedb28b69f5579393cece24d94cb98a2202d217fee855996df3cccb0772b88ac6977ea59fa1094709640633f8d53cf341d6b72b6266da6ad0
7
- data.tar.gz: 7379566d52c7f17a8a2ecded2795ec4fc006199108302bf5b9998029bbd54478bb42687a8aa03a270199b039543dcdb67c7349aa0542e745222a63750cd1ce7b
6
+ metadata.gz: 7d56ddc0030ff54c25ad343fbaeef50111cbc874fae5234c7a39680acb0f4f9168b36adb0e7dd6c0e1f79a94ac8c75a3b310d598a98f44879ab987965a60f555
7
+ data.tar.gz: 3f2dc26535ff9cffbf0936d877ec8d0244742b4273b9a615d31e5acc257c1f1f97931481e627522ee4e9da823fdff41521a49f55b804b1d9fb1c426e5dbb78e1
@@ -1,10 +1,9 @@
1
1
  module DiscoveryIndexer
2
2
 
3
- # It caches the collection information such as name and catkey
3
+ # Collection information such as name (title/label) and catkey
4
4
  class Collection
5
5
 
6
6
  attr_reader :druid
7
- delegate :present?, to: :collection_info
8
7
 
9
8
  def initialize(druid)
10
9
  @druid = druid
@@ -20,20 +19,11 @@ module DiscoveryIndexer
20
19
 
21
20
  private
22
21
 
23
- # Returns the collection name from cache, otherwise will fetch it from PURL.
24
- #
25
- # @param collection_druid [String] is the druid for a collection e.g., ab123cd4567
26
- # @return [Array<String>] the collection data or [] if there is no name and catkey or the object
27
- # is not a collection
22
+ # @return [Hash] the collection data as { title: 'coll title', ckey: 'catkey' }
28
23
  def collection_info
29
- from_purl || {}
30
- end
31
-
32
- # @param [String] collection_druid is the druid for a collection e.g., ab123cd4567
33
- # @return [String] return the collection label from purl if available, nil otherwise
34
- def from_purl
35
- return unless purl_model
36
- { title: purl_model.label, ckey: purl_model.catkey }
24
+ return {} unless purl_model
25
+ @info = {}
26
+ @info = { title: purl_model.label, ckey: purl_model.catkey } if @info.empty?
37
27
  end
38
28
 
39
29
  def purl_model
@@ -5,10 +5,6 @@ module DiscoveryIndexer
5
5
 
6
6
  # Initializes an instance from IndexMapper
7
7
  # @param [String] druid e.g. ab123cd4567
8
- # @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
9
- # @param [DiscoveryIndexer::Reader::PurlxmlModel] purlxml represents the purlxml model
10
- # @param [Hash] collection_data represents a hash of collection_druid and catkey
11
- # collection_data = {'aa00bb0001'=>{:name=>'Test Collection Name',:ckey=>'000001'},'nt028fd5773'=>{:name=>'Revs Institute Archive',:ckey=>'000002'}}
12
8
  def initialize(druid)
13
9
  @druid = druid
14
10
  end
@@ -22,21 +18,36 @@ module DiscoveryIndexer
22
18
  solr_doc
23
19
  end
24
20
 
25
- # It converts collection_druids list to a hash with names. If the druid doesn't
26
- # have a collection name, it will be excluded from the hash
27
- # @return [Hash] a hash for collection druid and its name
28
- # !{"ab123cd4567"=>"Collection 1", "ef123gh4567"=>"Collection 2"}
21
+ # @return [Array<DiscoveryIndexer::Collection>] one Collection for each collection druid, or [] if no collection druids
29
22
  def collection_data
30
23
  @collection_data ||= collection_druids.map do |cdruid|
31
24
  DiscoveryIndexer::Collection.new(cdruid)
32
25
  end
33
26
  end
27
+
28
+ # @return [Array<String>] Array of bare druids from rels-ext isMemberOfCollection in public xml (e.g. ['oo000oo0000'])
34
29
  def collection_druids
35
30
  purlxml.collection_druids
36
31
  end
32
+
33
+ # @return [Array<DiscoveryIndexer::Collection>] one Collection for each constituent druid, or [] if no constituent druids
34
+ def constituent_data
35
+ @constituent_data ||= constituent_druids.map do |cdruid|
36
+ DiscoveryIndexer::Collection.new(cdruid)
37
+ end
38
+ end
39
+
40
+ # @return [Array<String>] Array of bare druids from rels-ext isConstituentOf in public xml (e.g. ['oo000oo0000'])
41
+ def constituent_druids
42
+ purlxml.constituent_druids
43
+ end
44
+
45
+ # @return [Stanford::Mods::Record] the MODS xml for the druid
37
46
  def modsxml
38
47
  @modsxml ||= DiscoveryIndexer::InputXml::Modsxml.new(druid).load
39
48
  end
49
+
50
+ # @return [DiscoveryIndexer::Reader::PurlxmlModel] the purlxml model
40
51
  def purlxml
41
52
  @purlxml ||= DiscoveryIndexer::InputXml::Purlxml.new(druid).load
42
53
  end
@@ -1,25 +1,27 @@
1
1
  module DiscoveryIndexer
2
2
  module InputXml
3
- # This class is the main class to access and parse the purl xml
4
- # as retrieved from PURL server
3
+ # Main model class to access the parsed purl xml retrieved from PURL server
5
4
  # @example to run the code
6
- # druid = "aa111aa1111"
7
- # p = DiscoveryIndexer::InputXml::Purlxml.new(druid)
8
- # model = p.load()
5
+ # druid = "aa111aa1111"
6
+ # p = DiscoveryIndexer::InputXml::Purlxml.new(druid)
7
+ # model = p.load()
8
+ # then you can access the bits of interest
9
+ # model.collection_druids
9
10
  class Purlxml
10
11
  # initializes a new object
11
12
  # @param druid [String] the druid object in the format "aa111aa1111"
12
13
  def initialize(druid)
13
14
  @druid = druid
14
15
  @purlxml_ng_doc = nil
16
+ @populated_model = nil
15
17
  end
16
18
 
17
19
  # loads the purl xml to purlxml model for the fedora object defined in the druid,
18
20
  # it reads the purl xml once from PURL server, and repeat the parsing with each call
19
21
  # @return [PurlxmlModel] represents the purlxml
20
22
  def load
21
- @purlxml_ng_doc = PurlxmlReader.read(@druid) if @purlxml_ng_doc.nil?
22
- purlxml_parser = PurlxmlParserStrict.new(@druid, @purlxml_ng_doc).parse
23
+ @purlxml_ng_doc ||= PurlxmlReader.read(@druid)
24
+ @populated_model ||= PurlxmlParserStrict.new(@druid, @purlxml_ng_doc).parse
23
25
  end
24
26
  end
25
27
  end
@@ -41,21 +41,22 @@ module DiscoveryIndexer
41
41
  # content_metadata.
42
42
  attr_accessor :dor_content_type
43
43
 
44
- # @!attribute [rw] dor_display_type
45
- # @return [String] The displayType as extracted from public xml
46
- # identity_metadata.
47
- attr_accessor :dor_display_type
48
-
49
44
  # @!attribute [rw] is_collection
50
45
  # @return [Boolean] true if the item type is collection in the identity_metadata
51
46
  attr_accessor :is_collection
52
47
 
53
48
  # @!attribute [rw] collection_druids
54
- # @return [Array] a list of the collections that this is druid belongs to
49
+ # @return [Array<String>] bare druids of the collections that this druid is a member of
55
50
  # @example
56
51
  # ["aa11aaa1111","bb111bb1111"]
57
52
  attr_accessor :collection_druids
58
53
 
54
+ # @!attribute [rw] constituent_druids
55
+ # @return [Array<String>] bare druids of the objects that this druid is a constituent of
56
+ # @example
57
+ # ["aa11aaa1111","bb111bb1111"]
58
+ attr_accessor :constituent_druids
59
+
59
60
  # @!attribute [rw] file_ids
60
61
  # @return [Array] a list of the file ids in the content_metadata
61
62
  # @example
@@ -6,6 +6,7 @@ module DiscoveryIndexer
6
6
  RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
7
7
  OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
8
8
  MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'
9
+ FEDORA_NAMESPACE = 'info:fedora/fedora-system:def/relations-external#'
9
10
 
10
11
  def initialize(druid, purlxml_ng_doc)
11
12
  @purlxml_ng_doc = purlxml_ng_doc
@@ -21,12 +22,12 @@ module DiscoveryIndexer
21
22
  purlxml_model.content_metadata = parse_content_metadata
22
23
  purlxml_model.identity_metadata = parse_identity_metadata
23
24
  purlxml_model.rights_metadata = parse_rights_metadata
24
- purlxml_model.dc = parse_dc
25
+ purlxml_model.dc = parse_dc # why do we care?
25
26
  purlxml_model.rdf = parse_rdf
26
27
  purlxml_model.is_collection = parse_is_collection
27
- purlxml_model.collection_druids = parse_collection_druids
28
+ purlxml_model.collection_druids = parse_predicate_druids('isMemberOfCollection', FEDORA_NAMESPACE)
29
+ purlxml_model.constituent_druids = parse_predicate_druids('isConstituentOf', FEDORA_NAMESPACE)
28
30
  purlxml_model.dor_content_type = parse_dor_content_type
29
- purlxml_model.dor_display_type = parse_dor_display_type
30
31
  purlxml_model.release_tags_hash = parse_release_tags_hash
31
32
  purlxml_model.file_ids = parse_file_ids
32
33
  purlxml_model.image_ids = parse_image_ids
@@ -39,21 +40,23 @@ module DiscoveryIndexer
39
40
  purlxml_model
40
41
  end
41
42
 
43
+ private
44
+
42
45
  # extracts the identityMetadata for this fedora object, from the purl xml
43
46
  # @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
44
47
  # @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identity_metadata
45
48
  def parse_identity_metadata
46
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
47
- fail DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
48
- ng_doc
49
+ @idmd_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
50
+ fail DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if !@idmd_ng_doc || @idmd_ng_doc.children.empty?
51
+ @idmd_ng_doc
49
52
  rescue
50
53
  raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
51
54
  end
52
55
 
53
56
  def parse_rights_metadata
54
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
55
- fail DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
56
- ng_doc
57
+ @rmd_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
58
+ fail DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if !@rmd_ng_doc || @rmd_ng_doc.children.empty?
59
+ @rmd_ng_doc
57
60
  rescue
58
61
  raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
59
62
  end
@@ -62,9 +65,9 @@ module DiscoveryIndexer
62
65
  # @return [Nokogiri::XML::Document] the dc for the fedora object
63
66
  # @raise [DiscoveryIndexer::Errors::MissingDC] if there is no dc element
64
67
  def parse_dc
65
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/dc:dc', 'dc' => OAI_DC_NAMESPACE).to_xml(encoding: 'utf-8'))
66
- fail DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
67
- ng_doc
68
+ @dc_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/dc:dc', 'dc' => OAI_DC_NAMESPACE).to_xml(encoding: 'utf-8'))
69
+ fail DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if !@dc_ng_doc || @dc_ng_doc.children.empty?
70
+ @dc_ng_doc
68
71
  rescue
69
72
  raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
70
73
  end
@@ -73,9 +76,9 @@ module DiscoveryIndexer
73
76
  # @return [Nokogiri::XML::Document] the rdf for the fedora object
74
77
  # @raise [DiscoveryIndexer::Errors::MissingRDF] if there is no rdf element
75
78
  def parse_rdf
76
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', 'rdf' => RDF_NAMESPACE).to_xml)
77
- fail DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
78
- ng_doc
79
+ @rdf_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', 'rdf' => RDF_NAMESPACE).to_xml)
80
+ fail DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if !@rdf_ng_doc || @rdf_ng_doc.children.empty?
81
+ @rdf_ng_doc
79
82
  rescue
80
83
  raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
81
84
  end
@@ -101,9 +104,9 @@ module DiscoveryIndexer
101
104
  # @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
102
105
  # @raise [DiscoveryIndexer::Errors::MissingContentMetadata] if there is no contentMetadata
103
106
  def parse_content_metadata
104
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
105
- ng_doc = nil if !ng_doc || ng_doc.children.empty?
106
- ng_doc
107
+ @cntmd_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
108
+ @cntmd_ng_doc = nil if !@cntmd_ng_doc || @cntmd_ng_doc.children.empty?
109
+ @cntmd_ng_doc
107
110
  end
108
111
 
109
112
  # @return true if the identityMetadata has <objectType>collection</objectType>, false otherwise
@@ -116,13 +119,13 @@ module DiscoveryIndexer
116
119
  false
117
120
  end
118
121
 
119
- # get the druids from isMemberOfCollection relationships in rels-ext from public_xml
120
- # @return [Array<String>] the druids (e.g. ww123yy1234) this object has isMemberOfColletion relationship with, or nil if none
121
- def parse_collection_druids
122
- ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => 'info:fedora/fedora-system:def/relations-external#', '' => '' }
123
- is_member_of_nodes ||= @purlxml_ng_doc.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)
124
- # from public_xml rels-ext
125
- is_member_of_nodes.reject { |n| n.value.empty? }.map do |n|
122
+ # get the druids from predicate relationships in rels-ext from public_xml
123
+ # @return [Array<String>] the druids (e.g. ww123yy1234) from the rdf:resource of the predicate relationships, or [] if none
124
+ def parse_predicate_druids(predicate, predicate_ns)
125
+ ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'pred_ns' => predicate_ns }
126
+ xpth = "/publicObject/rdf:RDF/rdf:Description/pred_ns:#{predicate}/@rdf:resource"
127
+ pred_nodes = @purlxml_ng_doc.xpath(xpth, ns_hash)
128
+ pred_nodes.reject { |n| n.value.empty? }.map do |n|
126
129
  n.value.split('druid:').last
127
130
  end
128
131
  end
@@ -139,15 +142,6 @@ module DiscoveryIndexer
139
142
  dct
140
143
  end
141
144
 
142
- # the value of the displyType tag from a DOR collection's identityMetadata
143
- # @return [String]
144
- def parse_dor_display_type
145
- identity_md = parse_identity_metadata
146
- ddt = identity_md ? identity_md.xpath('//displayType').text : nil
147
- DiscoveryIndexer::Logging.logger.debug "#{@druid} has no DOR display type (<identityMetadata> element may be missing displayType tag)" if !ddt || ddt.empty?
148
- ddt
149
- end
150
-
151
145
  # the @id attribute of resource/file elements that match the image type, including extension
152
146
  # @return [Array<String>] filenames
153
147
  def parse_image_ids
@@ -1,3 +1,3 @@
1
1
  module DiscoveryIndexer
2
- VERSION = '0.11.0'
2
+ VERSION = '1.0.0'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: discovery-indexer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ahmed AlSum
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-02-04 00:00:00.000000000 Z
12
+ date: 2016-02-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -151,6 +151,48 @@ dependencies:
151
151
  - - ">="
152
152
  - !ruby/object:Gem::Version
153
153
  version: '0'
154
+ - !ruby/object:Gem::Dependency
155
+ name: yard
156
+ requirement: !ruby/object:Gem::Requirement
157
+ requirements:
158
+ - - ">="
159
+ - !ruby/object:Gem::Version
160
+ version: '0'
161
+ type: :development
162
+ prerelease: false
163
+ version_requirements: !ruby/object:Gem::Requirement
164
+ requirements:
165
+ - - ">="
166
+ - !ruby/object:Gem::Version
167
+ version: '0'
168
+ - !ruby/object:Gem::Dependency
169
+ name: rubocop
170
+ requirement: !ruby/object:Gem::Requirement
171
+ requirements:
172
+ - - ">="
173
+ - !ruby/object:Gem::Version
174
+ version: '0'
175
+ type: :development
176
+ prerelease: false
177
+ version_requirements: !ruby/object:Gem::Requirement
178
+ requirements:
179
+ - - ">="
180
+ - !ruby/object:Gem::Version
181
+ version: '0'
182
+ - !ruby/object:Gem::Dependency
183
+ name: rubocop-rspec
184
+ requirement: !ruby/object:Gem::Requirement
185
+ requirements:
186
+ - - ">="
187
+ - !ruby/object:Gem::Version
188
+ version: '0'
189
+ type: :development
190
+ prerelease: false
191
+ version_requirements: !ruby/object:Gem::Requirement
192
+ requirements:
193
+ - - ">="
194
+ - !ruby/object:Gem::Version
195
+ version: '0'
154
196
  description: This library manages the core operations for the discovery indexing such
155
197
  as reading PURL xml, mapping to the solr document, and writing to solr core.
156
198
  email: laneymcg@stanford.edu