discovery-indexer 0.11.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/discovery-indexer/collection.rb +5 -15
- data/lib/discovery-indexer/general_mapper.rb +19 -8
- data/lib/discovery-indexer/reader/purlxml.rb +9 -7
- data/lib/discovery-indexer/reader/purlxml_model.rb +7 -6
- data/lib/discovery-indexer/reader/purlxml_parser_strict.rb +28 -34
- data/lib/discovery-indexer/version.rb +1 -1
- metadata +44 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0da2f698e5d93f7b4c2e2413ec16cf6a96bf0a89
|
4
|
+
data.tar.gz: 874859db3a705393f1b801583c58800ee5dc8004
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7d56ddc0030ff54c25ad343fbaeef50111cbc874fae5234c7a39680acb0f4f9168b36adb0e7dd6c0e1f79a94ac8c75a3b310d598a98f44879ab987965a60f555
|
7
|
+
data.tar.gz: 3f2dc26535ff9cffbf0936d877ec8d0244742b4273b9a615d31e5acc257c1f1f97931481e627522ee4e9da823fdff41521a49f55b804b1d9fb1c426e5dbb78e1
|
@@ -1,10 +1,9 @@
|
|
1
1
|
module DiscoveryIndexer
|
2
2
|
|
3
|
-
#
|
3
|
+
# Collection information such as name (title/label) and catkey
|
4
4
|
class Collection
|
5
5
|
|
6
6
|
attr_reader :druid
|
7
|
-
delegate :present?, to: :collection_info
|
8
7
|
|
9
8
|
def initialize(druid)
|
10
9
|
@druid = druid
|
@@ -20,20 +19,11 @@ module DiscoveryIndexer
|
|
20
19
|
|
21
20
|
private
|
22
21
|
|
23
|
-
#
|
24
|
-
#
|
25
|
-
# @param collection_druid [String] is the druid for a collection e.g., ab123cd4567
|
26
|
-
# @return [Array<String>] the collection data or [] if there is no name and catkey or the object
|
27
|
-
# is not a collection
|
22
|
+
# @return [Hash] the collection data as { title: 'coll title', ckey: 'catkey' }
|
28
23
|
def collection_info
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
# @param [String] collection_druid is the druid for a collection e.g., ab123cd4567
|
33
|
-
# @return [String] return the collection label from purl if available, nil otherwise
|
34
|
-
def from_purl
|
35
|
-
return unless purl_model
|
36
|
-
{ title: purl_model.label, ckey: purl_model.catkey }
|
24
|
+
return {} unless purl_model
|
25
|
+
@info = {}
|
26
|
+
@info = { title: purl_model.label, ckey: purl_model.catkey } if @info.empty?
|
37
27
|
end
|
38
28
|
|
39
29
|
def purl_model
|
@@ -5,10 +5,6 @@ module DiscoveryIndexer
|
|
5
5
|
|
6
6
|
# Initializes an instance from IndexMapper
|
7
7
|
# @param [String] druid e.g. ab123cd4567
|
8
|
-
# @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
|
9
|
-
# @param [DiscoveryIndexer::Reader::PurlxmlModel] purlxml represents the purlxml model
|
10
|
-
# @param [Hash] collection_data represents a hash of collection_druid and catkey
|
11
|
-
# collection_data = {'aa00bb0001'=>{:name=>'Test Collection Name',:ckey=>'000001'},'nt028fd5773'=>{:name=>'Revs Institute Archive',:ckey=>'000002'}}
|
12
8
|
def initialize(druid)
|
13
9
|
@druid = druid
|
14
10
|
end
|
@@ -22,21 +18,36 @@ module DiscoveryIndexer
|
|
22
18
|
solr_doc
|
23
19
|
end
|
24
20
|
|
25
|
-
#
|
26
|
-
# have a collection name, it will be excluded from the hash
|
27
|
-
# @return [Hash] a hash for collection druid and its name
|
28
|
-
# !{"ab123cd4567"=>"Collection 1", "ef123gh4567"=>"Collection 2"}
|
21
|
+
# @return [DiscoveryIndexer::Collection] for each collection druid, or [] if no collection druids
|
29
22
|
def collection_data
|
30
23
|
@collection_data ||= collection_druids.map do |cdruid|
|
31
24
|
DiscoveryIndexer::Collection.new(cdruid)
|
32
25
|
end
|
33
26
|
end
|
27
|
+
|
28
|
+
# @return [Array<String>] Array of bare druids from rels-ext isMemberOfCollection in public xml (e.g. ['oo000oo0000'])
|
34
29
|
def collection_druids
|
35
30
|
purlxml.collection_druids
|
36
31
|
end
|
32
|
+
|
33
|
+
# @return [DiscoveryIndexer::Collection] for each constituent druid, or [] if no constituent druids
|
34
|
+
def constituent_data
|
35
|
+
@constituent_data ||= constituent_druids.map do |cdruid|
|
36
|
+
DiscoveryIndexer::Collection.new(cdruid)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
# @return [Array<String>] Array of bare druids from rels-ext isConstituentOf in public xml (e.g. ['oo000oo0000'])
|
41
|
+
def constituent_druids
|
42
|
+
purlxml.constituent_druids
|
43
|
+
end
|
44
|
+
|
45
|
+
# @return [Stanford::Mods::Record] the MODS xml for the druid
|
37
46
|
def modsxml
|
38
47
|
@modsxml ||= DiscoveryIndexer::InputXml::Modsxml.new(druid).load
|
39
48
|
end
|
49
|
+
|
50
|
+
# @return [DiscoveryIndexer::Reader::PurlxmlModel] the purlxml model
|
40
51
|
def purlxml
|
41
52
|
@purlxml ||= DiscoveryIndexer::InputXml::Purlxml.new(druid).load
|
42
53
|
end
|
@@ -1,25 +1,27 @@
|
|
1
1
|
module DiscoveryIndexer
|
2
2
|
module InputXml
|
3
|
-
#
|
4
|
-
# as retrieved from PURL server
|
3
|
+
# Main model class to access the parsed purl xml retrieved from PURL server
|
5
4
|
# @example to run the code
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
5
|
+
# druid = "aa111aa1111"
|
6
|
+
# p = DiscoveryIndexer::InputXml::Purlxml.new(druid)
|
7
|
+
# model = p.load()
|
8
|
+
# then you can access the bits of interest
|
9
|
+
# model.collection_druids
|
9
10
|
class Purlxml
|
10
11
|
# initializes a new object
|
11
12
|
# @param druid [String] the druid object in the format "aa111aa1111"
|
12
13
|
def initialize(druid)
|
13
14
|
@druid = druid
|
14
15
|
@purlxml_ng_doc = nil
|
16
|
+
@populated_model = nil
|
15
17
|
end
|
16
18
|
|
17
19
|
# loads the purl xml to purlxml model for the fedora object defined in the druid,
|
18
20
|
# it reads the purl xml once from PURL server, and repeat the parsing with each call
|
19
21
|
# @return [PurlxmlModel] represents the purlxml
|
20
22
|
def load
|
21
|
-
@purlxml_ng_doc
|
22
|
-
|
23
|
+
@purlxml_ng_doc ||= PurlxmlReader.read(@druid)
|
24
|
+
@populated_model ||= PurlxmlParserStrict.new(@druid, @purlxml_ng_doc).parse
|
23
25
|
end
|
24
26
|
end
|
25
27
|
end
|
@@ -41,21 +41,22 @@ module DiscoveryIndexer
|
|
41
41
|
# content_metadata.
|
42
42
|
attr_accessor :dor_content_type
|
43
43
|
|
44
|
-
# @!attribute [rw] dor_display_type
|
45
|
-
# @return [String] The displayType as extracted from public xml
|
46
|
-
# identity_metadata.
|
47
|
-
attr_accessor :dor_display_type
|
48
|
-
|
49
44
|
# @!attribute [rw] is_collection
|
50
45
|
# @return [Boolean] true if the item type is collection in the identity_metadata
|
51
46
|
attr_accessor :is_collection
|
52
47
|
|
53
48
|
# @!attribute [rw] collection_druids
|
54
|
-
# @return [Array]
|
49
|
+
# @return [Array<String>] bare druids of the collections that this druid is a member of
|
55
50
|
# @example
|
56
51
|
# ["aa11aaa1111","bb111bb1111"]
|
57
52
|
attr_accessor :collection_druids
|
58
53
|
|
54
|
+
# @!attribute [rw] constituent_druids
|
55
|
+
# @return [Array<String>] bare druids of the objects that this druid is a constituent of
|
56
|
+
# @example
|
57
|
+
# ["aa11aaa1111","bb111bb1111"]
|
58
|
+
attr_accessor :constituent_druids
|
59
|
+
|
59
60
|
# @!attribute [rw] file_ids
|
60
61
|
# @return [Array] a list of the file ids in the content_metadata
|
61
62
|
# @example
|
@@ -6,6 +6,7 @@ module DiscoveryIndexer
|
|
6
6
|
RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
|
7
7
|
OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
|
8
8
|
MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'
|
9
|
+
FEDORA_NAMESPACE = 'info:fedora/fedora-system:def/relations-external#'
|
9
10
|
|
10
11
|
def initialize(druid, purlxml_ng_doc)
|
11
12
|
@purlxml_ng_doc = purlxml_ng_doc
|
@@ -21,12 +22,12 @@ module DiscoveryIndexer
|
|
21
22
|
purlxml_model.content_metadata = parse_content_metadata
|
22
23
|
purlxml_model.identity_metadata = parse_identity_metadata
|
23
24
|
purlxml_model.rights_metadata = parse_rights_metadata
|
24
|
-
purlxml_model.dc = parse_dc
|
25
|
+
purlxml_model.dc = parse_dc # why do we care?
|
25
26
|
purlxml_model.rdf = parse_rdf
|
26
27
|
purlxml_model.is_collection = parse_is_collection
|
27
|
-
purlxml_model.collection_druids =
|
28
|
+
purlxml_model.collection_druids = parse_predicate_druids('isMemberOfCollection', FEDORA_NAMESPACE)
|
29
|
+
purlxml_model.constituent_druids = parse_predicate_druids('isConstituentOf', FEDORA_NAMESPACE)
|
28
30
|
purlxml_model.dor_content_type = parse_dor_content_type
|
29
|
-
purlxml_model.dor_display_type = parse_dor_display_type
|
30
31
|
purlxml_model.release_tags_hash = parse_release_tags_hash
|
31
32
|
purlxml_model.file_ids = parse_file_ids
|
32
33
|
purlxml_model.image_ids = parse_image_ids
|
@@ -39,21 +40,23 @@ module DiscoveryIndexer
|
|
39
40
|
purlxml_model
|
40
41
|
end
|
41
42
|
|
43
|
+
private
|
44
|
+
|
42
45
|
# extracts the identityMetadata for this fedora object, from the purl xml
|
43
46
|
# @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
|
44
47
|
# @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identity_metadata
|
45
48
|
def parse_identity_metadata
|
46
|
-
|
47
|
-
fail DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if
|
48
|
-
|
49
|
+
@idmd_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
|
50
|
+
fail DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if !@idmd_ng_doc || @idmd_ng_doc.children.empty?
|
51
|
+
@idmd_ng_doc
|
49
52
|
rescue
|
50
53
|
raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
|
51
54
|
end
|
52
55
|
|
53
56
|
def parse_rights_metadata
|
54
|
-
|
55
|
-
fail DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if
|
56
|
-
|
57
|
+
@rmd_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
|
58
|
+
fail DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if !@rmd_ng_doc || @rmd_ng_doc.children.empty?
|
59
|
+
@rmd_ng_doc
|
57
60
|
rescue
|
58
61
|
raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
|
59
62
|
end
|
@@ -62,9 +65,9 @@ module DiscoveryIndexer
|
|
62
65
|
# @return [Nokogiri::XML::Document] the dc for the fedora object
|
63
66
|
# @raise [DiscoveryIndexer::Errors::MissingDC] if there is no dc element
|
64
67
|
def parse_dc
|
65
|
-
|
66
|
-
fail DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if
|
67
|
-
|
68
|
+
@dc_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/dc:dc', 'dc' => OAI_DC_NAMESPACE).to_xml(encoding: 'utf-8'))
|
69
|
+
fail DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if !@dc_ng_doc || @dc_ng_doc.children.empty?
|
70
|
+
@dc_ng_doc
|
68
71
|
rescue
|
69
72
|
raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
|
70
73
|
end
|
@@ -73,9 +76,9 @@ module DiscoveryIndexer
|
|
73
76
|
# @return [Nokogiri::XML::Document] the rdf for the fedora object
|
74
77
|
# @raise [DiscoveryIndexer::Errors::MissingRDF] if there is no rdf element
|
75
78
|
def parse_rdf
|
76
|
-
|
77
|
-
fail DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if
|
78
|
-
|
79
|
+
@rdf_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', 'rdf' => RDF_NAMESPACE).to_xml)
|
80
|
+
fail DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if !@rdf_ng_doc || @rdf_ng_doc.children.empty?
|
81
|
+
@rdf_ng_doc
|
79
82
|
rescue
|
80
83
|
raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
|
81
84
|
end
|
@@ -101,9 +104,9 @@ module DiscoveryIndexer
|
|
101
104
|
# @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
|
102
105
|
# @raise [DiscoveryIndexer::Errors::MissingContentMetadata] if there is no contentMetadata
|
103
106
|
def parse_content_metadata
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
+
@cntmd_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
|
108
|
+
@cntmd_ng_doc = nil if !@cntmd_ng_doc || @cntmd_ng_doc.children.empty?
|
109
|
+
@cntmd_ng_doc
|
107
110
|
end
|
108
111
|
|
109
112
|
# @return true if the identityMetadata has <objectType>collection</objectType>, false otherwise
|
@@ -116,13 +119,13 @@ module DiscoveryIndexer
|
|
116
119
|
false
|
117
120
|
end
|
118
121
|
|
119
|
-
# get the druids from
|
120
|
-
# @return [Array<String
|
121
|
-
def
|
122
|
-
ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', '
|
123
|
-
|
124
|
-
|
125
|
-
|
122
|
+
# get the druids from predicate relationships in rels-ext from public_xml
|
123
|
+
# @return [Array<String>, nil] the druids (e.g. ww123yy1234) from the rdf:resource of the predicate relationships, or nil if none
|
124
|
+
def parse_predicate_druids(predicate, predicate_ns)
|
125
|
+
ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'pred_ns' => predicate_ns }
|
126
|
+
xpth = "/publicObject/rdf:RDF/rdf:Description/pred_ns:#{predicate}/@rdf:resource"
|
127
|
+
pred_nodes = @purlxml_ng_doc.xpath(xpth, ns_hash)
|
128
|
+
pred_nodes.reject { |n| n.value.empty? }.map do |n|
|
126
129
|
n.value.split('druid:').last
|
127
130
|
end
|
128
131
|
end
|
@@ -139,15 +142,6 @@ module DiscoveryIndexer
|
|
139
142
|
dct
|
140
143
|
end
|
141
144
|
|
142
|
-
# the value of the displyType tag from a DOR collection's identityMetadata
|
143
|
-
# @return [String]
|
144
|
-
def parse_dor_display_type
|
145
|
-
identity_md = parse_identity_metadata
|
146
|
-
ddt = identity_md ? identity_md.xpath('//displayType').text : nil
|
147
|
-
DiscoveryIndexer::Logging.logger.debug "#{@druid} has no DOR display type (<identityMetadata> element may be missing displayType tag)" if !ddt || ddt.empty?
|
148
|
-
ddt
|
149
|
-
end
|
150
|
-
|
151
145
|
# the @id attribute of resource/file elements that match the image type, including extension
|
152
146
|
# @return [Array<String>] filenames
|
153
147
|
def parse_image_ids
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: discovery-indexer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.0
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ahmed AlSum
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2016-02-
|
12
|
+
date: 2016-02-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -151,6 +151,48 @@ dependencies:
|
|
151
151
|
- - ">="
|
152
152
|
- !ruby/object:Gem::Version
|
153
153
|
version: '0'
|
154
|
+
- !ruby/object:Gem::Dependency
|
155
|
+
name: yard
|
156
|
+
requirement: !ruby/object:Gem::Requirement
|
157
|
+
requirements:
|
158
|
+
- - ">="
|
159
|
+
- !ruby/object:Gem::Version
|
160
|
+
version: '0'
|
161
|
+
type: :development
|
162
|
+
prerelease: false
|
163
|
+
version_requirements: !ruby/object:Gem::Requirement
|
164
|
+
requirements:
|
165
|
+
- - ">="
|
166
|
+
- !ruby/object:Gem::Version
|
167
|
+
version: '0'
|
168
|
+
- !ruby/object:Gem::Dependency
|
169
|
+
name: rubocop
|
170
|
+
requirement: !ruby/object:Gem::Requirement
|
171
|
+
requirements:
|
172
|
+
- - ">="
|
173
|
+
- !ruby/object:Gem::Version
|
174
|
+
version: '0'
|
175
|
+
type: :development
|
176
|
+
prerelease: false
|
177
|
+
version_requirements: !ruby/object:Gem::Requirement
|
178
|
+
requirements:
|
179
|
+
- - ">="
|
180
|
+
- !ruby/object:Gem::Version
|
181
|
+
version: '0'
|
182
|
+
- !ruby/object:Gem::Dependency
|
183
|
+
name: rubocop-rspec
|
184
|
+
requirement: !ruby/object:Gem::Requirement
|
185
|
+
requirements:
|
186
|
+
- - ">="
|
187
|
+
- !ruby/object:Gem::Version
|
188
|
+
version: '0'
|
189
|
+
type: :development
|
190
|
+
prerelease: false
|
191
|
+
version_requirements: !ruby/object:Gem::Requirement
|
192
|
+
requirements:
|
193
|
+
- - ">="
|
194
|
+
- !ruby/object:Gem::Version
|
195
|
+
version: '0'
|
154
196
|
description: This library manages the core operations for the discovery indexing such
|
155
197
|
as reading PURL xml, mapping to the solr document, and writing to solr core.
|
156
198
|
email: laneymcg@stanford.edu
|