discovery-indexer 0.11.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 51776b99172c16f9360bd19a8c51406a2e6379b1
4
- data.tar.gz: c52db6afd40ad759784b4a9686fa3b61ffc55698
3
+ metadata.gz: 0da2f698e5d93f7b4c2e2413ec16cf6a96bf0a89
4
+ data.tar.gz: 874859db3a705393f1b801583c58800ee5dc8004
5
5
  SHA512:
6
- metadata.gz: 289af09b8cbf7b4dedb28b69f5579393cece24d94cb98a2202d217fee855996df3cccb0772b88ac6977ea59fa1094709640633f8d53cf341d6b72b6266da6ad0
7
- data.tar.gz: 7379566d52c7f17a8a2ecded2795ec4fc006199108302bf5b9998029bbd54478bb42687a8aa03a270199b039543dcdb67c7349aa0542e745222a63750cd1ce7b
6
+ metadata.gz: 7d56ddc0030ff54c25ad343fbaeef50111cbc874fae5234c7a39680acb0f4f9168b36adb0e7dd6c0e1f79a94ac8c75a3b310d598a98f44879ab987965a60f555
7
+ data.tar.gz: 3f2dc26535ff9cffbf0936d877ec8d0244742b4273b9a615d31e5acc257c1f1f97931481e627522ee4e9da823fdff41521a49f55b804b1d9fb1c426e5dbb78e1
@@ -1,10 +1,9 @@
1
1
  module DiscoveryIndexer
2
2
 
3
- # It caches the collection information such as name and catkey
3
+ # Collection information such as name (title/label) and catkey
4
4
  class Collection
5
5
 
6
6
  attr_reader :druid
7
- delegate :present?, to: :collection_info
8
7
 
9
8
  def initialize(druid)
10
9
  @druid = druid
@@ -20,20 +19,11 @@ module DiscoveryIndexer
20
19
 
21
20
  private
22
21
 
23
- # Returns the collection name from cache, otherwise will fetch it from PURL.
24
- #
25
- # @param collection_druid [String] is the druid for a collection e.g., ab123cd4567
26
- # @return [Array<String>] the collection data or [] if there is no name and catkey or the object
27
- # is not a collection
22
+ # @return [Hash] the collection data as { title: 'coll title', ckey: 'catkey' }
28
23
  def collection_info
29
- from_purl || {}
30
- end
31
-
32
- # @param [String] collection_druid is the druid for a collection e.g., ab123cd4567
33
- # @return [String] return the collection label from purl if available, nil otherwise
34
- def from_purl
35
- return unless purl_model
36
- { title: purl_model.label, ckey: purl_model.catkey }
24
+ return {} unless purl_model
25
+ @info = {}
26
+ @info = { title: purl_model.label, ckey: purl_model.catkey } if @info.empty?
37
27
  end
38
28
 
39
29
  def purl_model
@@ -5,10 +5,6 @@ module DiscoveryIndexer
5
5
 
6
6
  # Initializes an instance from IndexMapper
7
7
  # @param [String] druid e.g. ab123cd4567
8
- # @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
9
- # @param [DiscoveryIndexer::Reader::PurlxmlModel] purlxml represents the purlxml model
10
- # @param [Hash] collection_data represents a hash of collection_druid and catkey
11
- # collection_data = {'aa00bb0001'=>{:name=>'Test Collection Name',:ckey=>'000001'},'nt028fd5773'=>{:name=>'Revs Institute Archive',:ckey=>'000002'}}
12
8
  def initialize(druid)
13
9
  @druid = druid
14
10
  end
@@ -22,21 +18,36 @@ module DiscoveryIndexer
22
18
  solr_doc
23
19
  end
24
20
 
25
- # It converts collection_druids list to a hash with names. If the druid doesn't
26
- # have a collection name, it will be excluded from the hash
27
- # @return [Hash] a hash for collection druid and its name
28
- # !{"ab123cd4567"=>"Collection 1", "ef123gh4567"=>"Collection 2"}
21
+ # @return [Array<DiscoveryIndexer::Collection>] one for each collection druid, or [] if no collection druids
29
22
  def collection_data
30
23
  @collection_data ||= collection_druids.map do |cdruid|
31
24
  DiscoveryIndexer::Collection.new(cdruid)
32
25
  end
33
26
  end
27
+
28
+ # @return [Array<String>] Array of bare druids from rels-ext isMemberOfCollection in public xml (e.g. ['oo000oo0000'])
34
29
  def collection_druids
35
30
  purlxml.collection_druids
36
31
  end
32
+
33
+ # @return [Array<DiscoveryIndexer::Collection>] one for each constituent druid, or [] if no constituent druids
34
+ def constituent_data
35
+ @constituent_data ||= constituent_druids.map do |cdruid|
36
+ DiscoveryIndexer::Collection.new(cdruid)
37
+ end
38
+ end
39
+
40
+ # @return [Array<String>] Array of bare druids from rels-ext isConstituentOf in public xml (e.g. ['oo000oo0000'])
41
+ def constituent_druids
42
+ purlxml.constituent_druids
43
+ end
44
+
45
+ # @return [Stanford::Mods::Record] the MODS xml for the druid
37
46
  def modsxml
38
47
  @modsxml ||= DiscoveryIndexer::InputXml::Modsxml.new(druid).load
39
48
  end
49
+
50
+ # @return [DiscoveryIndexer::Reader::PurlxmlModel] the purlxml model
40
51
  def purlxml
41
52
  @purlxml ||= DiscoveryIndexer::InputXml::Purlxml.new(druid).load
42
53
  end
@@ -1,25 +1,27 @@
1
1
  module DiscoveryIndexer
2
2
  module InputXml
3
- # This class is the main class to access and parse the purl xml
4
- # as retrieved from PURL server
3
+ # Main model class to access the parsed purl xml retrieved from PURL server
5
4
  # @example to run the code
6
- # druid = "aa111aa1111"
7
- # p = DiscoveryIndexer::InputXml::Purlxml.new(druid)
8
- # model = p.load()
5
+ # druid = "aa111aa1111"
6
+ # p = DiscoveryIndexer::InputXml::Purlxml.new(druid)
7
+ # model = p.load()
8
+ # then you can access the bits of interest
9
+ # model.collection_druids
9
10
  class Purlxml
10
11
  # initializes a new object
11
12
  # @param druid [String] the druid object in the format "aa111aa1111"
12
13
  def initialize(druid)
13
14
  @druid = druid
14
15
  @purlxml_ng_doc = nil
16
+ @populated_model = nil
15
17
  end
16
18
 
17
19
  # loads the purl xml to purlxml model for the fedora object defined by the druid,
18
20
  # it reads and parses the purl xml from PURL server only once; later calls return the cached model
19
21
  # @return [PurlxmlModel] represents the purlxml
20
22
  def load
21
- @purlxml_ng_doc = PurlxmlReader.read(@druid) if @purlxml_ng_doc.nil?
22
- purlxml_parser = PurlxmlParserStrict.new(@druid, @purlxml_ng_doc).parse
23
+ @purlxml_ng_doc ||= PurlxmlReader.read(@druid)
24
+ @populated_model ||= PurlxmlParserStrict.new(@druid, @purlxml_ng_doc).parse
23
25
  end
24
26
  end
25
27
  end
@@ -41,21 +41,22 @@ module DiscoveryIndexer
41
41
  # content_metadata.
42
42
  attr_accessor :dor_content_type
43
43
 
44
- # @!attribute [rw] dor_display_type
45
- # @return [String] The displayType as extracted from public xml
46
- # identity_metadata.
47
- attr_accessor :dor_display_type
48
-
49
44
  # @!attribute [rw] is_collection
50
45
  # @return [Boolean] true if the item type is collection in the identity_metadata
51
46
  attr_accessor :is_collection
52
47
 
53
48
  # @!attribute [rw] collection_druids
54
- # @return [Array] a list of the collections that this is druid belongs to
49
+ # @return [Array<String>] bare druids of the collections that this druid is a member of
55
50
  # @example
56
51
  # ["aa11aaa1111","bb111bb1111"]
57
52
  attr_accessor :collection_druids
58
53
 
54
+ # @!attribute [rw] constituent_druids
55
+ # @return [Array<String>] bare druids of the objects that this druid is a constituent of
56
+ # @example
57
+ # ["aa11aaa1111","bb111bb1111"]
58
+ attr_accessor :constituent_druids
59
+
59
60
  # @!attribute [rw] file_ids
60
61
  # @return [Array] a list of the file ids in the content_metadata
61
62
  # @example
@@ -6,6 +6,7 @@ module DiscoveryIndexer
6
6
  RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
7
7
  OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
8
8
  MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'
9
+ FEDORA_NAMESPACE = 'info:fedora/fedora-system:def/relations-external#'
9
10
 
10
11
  def initialize(druid, purlxml_ng_doc)
11
12
  @purlxml_ng_doc = purlxml_ng_doc
@@ -21,12 +22,12 @@ module DiscoveryIndexer
21
22
  purlxml_model.content_metadata = parse_content_metadata
22
23
  purlxml_model.identity_metadata = parse_identity_metadata
23
24
  purlxml_model.rights_metadata = parse_rights_metadata
24
- purlxml_model.dc = parse_dc
25
+ purlxml_model.dc = parse_dc # why do we care?
25
26
  purlxml_model.rdf = parse_rdf
26
27
  purlxml_model.is_collection = parse_is_collection
27
- purlxml_model.collection_druids = parse_collection_druids
28
+ purlxml_model.collection_druids = parse_predicate_druids('isMemberOfCollection', FEDORA_NAMESPACE)
29
+ purlxml_model.constituent_druids = parse_predicate_druids('isConstituentOf', FEDORA_NAMESPACE)
28
30
  purlxml_model.dor_content_type = parse_dor_content_type
29
- purlxml_model.dor_display_type = parse_dor_display_type
30
31
  purlxml_model.release_tags_hash = parse_release_tags_hash
31
32
  purlxml_model.file_ids = parse_file_ids
32
33
  purlxml_model.image_ids = parse_image_ids
@@ -39,21 +40,23 @@ module DiscoveryIndexer
39
40
  purlxml_model
40
41
  end
41
42
 
43
+ private
44
+
42
45
  # extracts the identityMetadata for this fedora object, from the purl xml
43
46
  # @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
44
47
  # @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identity_metadata
45
48
  def parse_identity_metadata
46
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
47
- fail DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
48
- ng_doc
49
+ @idmd_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
50
+ fail DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if !@idmd_ng_doc || @idmd_ng_doc.children.empty?
51
+ @idmd_ng_doc
49
52
  rescue
50
53
  raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
51
54
  end
52
55
 
53
56
  def parse_rights_metadata
54
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
55
- fail DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
56
- ng_doc
57
+ @rmd_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
58
+ fail DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if !@rmd_ng_doc || @rmd_ng_doc.children.empty?
59
+ @rmd_ng_doc
57
60
  rescue
58
61
  raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
59
62
  end
@@ -62,9 +65,9 @@ module DiscoveryIndexer
62
65
  # @return [Nokogiri::XML::Document] the dc for the fedora object
63
66
  # @raise [DiscoveryIndexer::Errors::MissingDC] if there is no dc element
64
67
  def parse_dc
65
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/dc:dc', 'dc' => OAI_DC_NAMESPACE).to_xml(encoding: 'utf-8'))
66
- fail DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
67
- ng_doc
68
+ @dc_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/dc:dc', 'dc' => OAI_DC_NAMESPACE).to_xml(encoding: 'utf-8'))
69
+ fail DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if !@dc_ng_doc || @dc_ng_doc.children.empty?
70
+ @dc_ng_doc
68
71
  rescue
69
72
  raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
70
73
  end
@@ -73,9 +76,9 @@ module DiscoveryIndexer
73
76
  # @return [Nokogiri::XML::Document] the rdf for the fedora object
74
77
  # @raise [DiscoveryIndexer::Errors::MissingRDF] if there is no rdf element
75
78
  def parse_rdf
76
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', 'rdf' => RDF_NAMESPACE).to_xml)
77
- fail DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if !ng_doc || ng_doc.children.empty?
78
- ng_doc
79
+ @rdf_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', 'rdf' => RDF_NAMESPACE).to_xml)
80
+ fail DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if !@rdf_ng_doc || @rdf_ng_doc.children.empty?
81
+ @rdf_ng_doc
79
82
  rescue
80
83
  raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
81
84
  end
@@ -101,9 +104,9 @@ module DiscoveryIndexer
101
104
  # @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
102
105
  # @raise [DiscoveryIndexer::Errors::MissingContentMetadata] if there is no contentMetadata
103
106
  def parse_content_metadata
104
- ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
105
- ng_doc = nil if !ng_doc || ng_doc.children.empty?
106
- ng_doc
107
+ @cntmd_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
108
+ @cntmd_ng_doc = nil if !@cntmd_ng_doc || @cntmd_ng_doc.children.empty?
109
+ @cntmd_ng_doc
107
110
  end
108
111
 
109
112
  # @return true if the identityMetadata has <objectType>collection</objectType>, false otherwise
@@ -116,13 +119,13 @@ module DiscoveryIndexer
116
119
  false
117
120
  end
118
121
 
119
- # get the druids from isMemberOfCollection relationships in rels-ext from public_xml
120
- # @return [Array<String>] the druids (e.g. ww123yy1234) this object has isMemberOfColletion relationship with, or nil if none
121
- def parse_collection_druids
122
- ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'fedora' => 'info:fedora/fedora-system:def/relations-external#', '' => '' }
123
- is_member_of_nodes ||= @purlxml_ng_doc.xpath('/publicObject/rdf:RDF/rdf:Description/fedora:isMemberOfCollection/@rdf:resource', ns_hash)
124
- # from public_xml rels-ext
125
- is_member_of_nodes.reject { |n| n.value.empty? }.map do |n|
122
+ # get the druids from predicate relationships in rels-ext from public_xml
123
+ # @return [Array<String>] the druids (e.g. ww123yy1234) from the rdf:resource of the predicate relationships, or [] if none
124
+ def parse_predicate_druids(predicate, predicate_ns)
125
+ ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'pred_ns' => predicate_ns }
126
+ xpth = "/publicObject/rdf:RDF/rdf:Description/pred_ns:#{predicate}/@rdf:resource"
127
+ pred_nodes = @purlxml_ng_doc.xpath(xpth, ns_hash)
128
+ pred_nodes.reject { |n| n.value.empty? }.map do |n|
126
129
  n.value.split('druid:').last
127
130
  end
128
131
  end
@@ -139,15 +142,6 @@ module DiscoveryIndexer
139
142
  dct
140
143
  end
141
144
 
142
- # the value of the displyType tag from a DOR collection's identityMetadata
143
- # @return [String]
144
- def parse_dor_display_type
145
- identity_md = parse_identity_metadata
146
- ddt = identity_md ? identity_md.xpath('//displayType').text : nil
147
- DiscoveryIndexer::Logging.logger.debug "#{@druid} has no DOR display type (<identityMetadata> element may be missing displayType tag)" if !ddt || ddt.empty?
148
- ddt
149
- end
150
-
151
145
  # the @id attribute of resource/file elements that match the image type, including extension
152
146
  # @return [Array<String>] filenames
153
147
  def parse_image_ids
@@ -1,3 +1,3 @@
1
1
  module DiscoveryIndexer
2
- VERSION = '0.11.0'
2
+ VERSION = '1.0.0'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: discovery-indexer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ahmed AlSum
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-02-04 00:00:00.000000000 Z
12
+ date: 2016-02-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -151,6 +151,48 @@ dependencies:
151
151
  - - ">="
152
152
  - !ruby/object:Gem::Version
153
153
  version: '0'
154
+ - !ruby/object:Gem::Dependency
155
+ name: yard
156
+ requirement: !ruby/object:Gem::Requirement
157
+ requirements:
158
+ - - ">="
159
+ - !ruby/object:Gem::Version
160
+ version: '0'
161
+ type: :development
162
+ prerelease: false
163
+ version_requirements: !ruby/object:Gem::Requirement
164
+ requirements:
165
+ - - ">="
166
+ - !ruby/object:Gem::Version
167
+ version: '0'
168
+ - !ruby/object:Gem::Dependency
169
+ name: rubocop
170
+ requirement: !ruby/object:Gem::Requirement
171
+ requirements:
172
+ - - ">="
173
+ - !ruby/object:Gem::Version
174
+ version: '0'
175
+ type: :development
176
+ prerelease: false
177
+ version_requirements: !ruby/object:Gem::Requirement
178
+ requirements:
179
+ - - ">="
180
+ - !ruby/object:Gem::Version
181
+ version: '0'
182
+ - !ruby/object:Gem::Dependency
183
+ name: rubocop-rspec
184
+ requirement: !ruby/object:Gem::Requirement
185
+ requirements:
186
+ - - ">="
187
+ - !ruby/object:Gem::Version
188
+ version: '0'
189
+ type: :development
190
+ prerelease: false
191
+ version_requirements: !ruby/object:Gem::Requirement
192
+ requirements:
193
+ - - ">="
194
+ - !ruby/object:Gem::Version
195
+ version: '0'
154
196
  description: This library manages the core operations for the discovery indexing such
155
197
  as reading PURL xml, mapping to the solr document, and writing to solr core.
156
198
  email: laneymcg@stanford.edu