discovery-indexer 0.11.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/discovery-indexer/collection.rb +5 -15
- data/lib/discovery-indexer/general_mapper.rb +19 -8
- data/lib/discovery-indexer/reader/purlxml.rb +9 -7
- data/lib/discovery-indexer/reader/purlxml_model.rb +7 -6
- data/lib/discovery-indexer/reader/purlxml_parser_strict.rb +28 -34
- data/lib/discovery-indexer/version.rb +1 -1
- metadata +44 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0da2f698e5d93f7b4c2e2413ec16cf6a96bf0a89
|
4
|
+
data.tar.gz: 874859db3a705393f1b801583c58800ee5dc8004
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7d56ddc0030ff54c25ad343fbaeef50111cbc874fae5234c7a39680acb0f4f9168b36adb0e7dd6c0e1f79a94ac8c75a3b310d598a98f44879ab987965a60f555
|
7
|
+
data.tar.gz: 3f2dc26535ff9cffbf0936d877ec8d0244742b4273b9a615d31e5acc257c1f1f97931481e627522ee4e9da823fdff41521a49f55b804b1d9fb1c426e5dbb78e1
|
@@ -1,10 +1,9 @@
|
|
1
1
|
module DiscoveryIndexer
|
2
2
|
|
3
|
-
#
|
3
|
+
# Collection information such as name (title/label) and catkey
|
4
4
|
class Collection
|
5
5
|
|
6
6
|
attr_reader :druid
|
7
|
-
delegate :present?, to: :collection_info
|
8
7
|
|
9
8
|
def initialize(druid)
|
10
9
|
@druid = druid
|
@@ -20,20 +19,11 @@ module DiscoveryIndexer
|
|
20
19
|
|
21
20
|
private
|
22
21
|
|
23
|
-
#
|
24
|
-
#
|
25
|
-
# @param collection_druid [String] is the druid for a collection e.g., ab123cd4567
|
26
|
-
# @return [Array<String>] the collection data or [] if there is no name and catkey or the object
|
27
|
-
# is not a collection
|
22
|
+
# @return [Hash] the collection data as { title: 'coll title', ckey: 'catkey' }
|
28
23
|
def collection_info
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
# @param [String] collection_druid is the druid for a collection e.g., ab123cd4567
|
33
|
-
# @return [String] return the collection label from purl if available, nil otherwise
|
34
|
-
def from_purl
|
35
|
-
return unless purl_model
|
36
|
-
{ title: purl_model.label, ckey: purl_model.catkey }
|
24
|
+
return {} unless purl_model
|
25
|
+
@info = {}
|
26
|
+
@info = { title: purl_model.label, ckey: purl_model.catkey } if @info.empty?
|
37
27
|
end
|
38
28
|
|
39
29
|
def purl_model
|
@@ -5,10 +5,6 @@ module DiscoveryIndexer
|
|
5
5
|
|
6
6
|
# Initializes an instance from IndexMapper
|
7
7
|
# @param [String] druid e.g. ab123cd4567
|
8
|
-
# @param [Stanford::Mods::Record] modsxml represents the MODS xml for the druid
|
9
|
-
# @param [DiscoveryIndexer::Reader::PurlxmlModel] purlxml represents the purlxml model
|
10
|
-
# @param [Hash] collection_data represents a hash of collection_druid and catkey
|
11
|
-
# collection_data = {'aa00bb0001'=>{:name=>'Test Collection Name',:ckey=>'000001'},'nt028fd5773'=>{:name=>'Revs Institute Archive',:ckey=>'000002'}}
|
12
8
|
def initialize(druid)
|
13
9
|
@druid = druid
|
14
10
|
end
|
@@ -22,21 +18,36 @@ module DiscoveryIndexer
|
|
22
18
|
solr_doc
|
23
19
|
end
|
24
20
|
|
25
|
-
#
|
26
|
-
# have a collection name, it will be excluded from the hash
|
27
|
-
# @return [Hash] a hash for collection druid and its name
|
28
|
-
# !{"ab123cd4567"=>"Collection 1", "ef123gh4567"=>"Collection 2"}
|
21
|
+
# @return [DiscoveryIndexer::Collection] for each collection druid, or [] if no collection druids
|
29
22
|
def collection_data
|
30
23
|
@collection_data ||= collection_druids.map do |cdruid|
|
31
24
|
DiscoveryIndexer::Collection.new(cdruid)
|
32
25
|
end
|
33
26
|
end
|
27
|
+
|
28
|
+
# @return [Array<String>] Array of bare druids from rels-ext isMemberOfCollection in public xml (e.g. ['oo000oo0000'])
|
34
29
|
def collection_druids
|
35
30
|
purlxml.collection_druids
|
36
31
|
end
|
32
|
+
|
33
|
+
# @return [DiscoveryIndexer::Collection] for each constituent druid, or [] if no constituent druids
|
34
|
+
def constituent_data
|
35
|
+
@constituent_data ||= constituent_druids.map do |cdruid|
|
36
|
+
DiscoveryIndexer::Collection.new(cdruid)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
# @return [Array<String>] Array of bare druids from rels-ext isConstituentOf in public xml (e.g. ['oo000oo0000'])
|
41
|
+
def constituent_druids
|
42
|
+
purlxml.constituent_druids
|
43
|
+
end
|
44
|
+
|
45
|
+
# @return [Stanford::Mods::Record] the MODS xml for the druid
|
37
46
|
def modsxml
|
38
47
|
@modsxml ||= DiscoveryIndexer::InputXml::Modsxml.new(druid).load
|
39
48
|
end
|
49
|
+
|
50
|
+
# @return [DiscoveryIndexer::Reader::PurlxmlModel] the purlxml model
|
40
51
|
def purlxml
|
41
52
|
@purlxml ||= DiscoveryIndexer::InputXml::Purlxml.new(druid).load
|
42
53
|
end
|
@@ -1,25 +1,27 @@
|
|
1
1
|
module DiscoveryIndexer
|
2
2
|
module InputXml
|
3
|
-
#
|
4
|
-
# as retrieved from PURL server
|
3
|
+
# Main model class to access the parsed purl xml retrieved from PURL server
|
5
4
|
# @example to run the code
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
5
|
+
# druid = "aa111aa1111"
|
6
|
+
# p = DiscoveryIndexer::InputXml::Purlxml.new(druid)
|
7
|
+
# model = p.load()
|
8
|
+
# then you can access the bits of interest
|
9
|
+
# model.collection_druids
|
9
10
|
class Purlxml
|
10
11
|
# initializes a new object
|
11
12
|
# @param druid [String] the druid object in the format "aa111aa1111"
|
12
13
|
def initialize(druid)
|
13
14
|
@druid = druid
|
14
15
|
@purlxml_ng_doc = nil
|
16
|
+
@populated_model = nil
|
15
17
|
end
|
16
18
|
|
17
19
|
# loads the purl xml to purlxml model for the fedora object defined in the druid,
|
18
20
|
# it reads the purl xml once from PURL server, and repeat the parsing with each call
|
19
21
|
# @return [PurlxmlModel] represents the purlxml
|
20
22
|
def load
|
21
|
-
@purlxml_ng_doc
|
22
|
-
|
23
|
+
@purlxml_ng_doc ||= PurlxmlReader.read(@druid)
|
24
|
+
@populated_model ||= PurlxmlParserStrict.new(@druid, @purlxml_ng_doc).parse
|
23
25
|
end
|
24
26
|
end
|
25
27
|
end
|
@@ -41,21 +41,22 @@ module DiscoveryIndexer
|
|
41
41
|
# content_metadata.
|
42
42
|
attr_accessor :dor_content_type
|
43
43
|
|
44
|
-
# @!attribute [rw] dor_display_type
|
45
|
-
# @return [String] The displayType as extracted from public xml
|
46
|
-
# identity_metadata.
|
47
|
-
attr_accessor :dor_display_type
|
48
|
-
|
49
44
|
# @!attribute [rw] is_collection
|
50
45
|
# @return [Boolean] true if the item type is collection in the identity_metadata
|
51
46
|
attr_accessor :is_collection
|
52
47
|
|
53
48
|
# @!attribute [rw] collection_druids
|
54
|
-
# @return [Array]
|
49
|
+
# @return [Array<String>] bare druids of the collections that this druid is a member of
|
55
50
|
# @example
|
56
51
|
# ["aa11aaa1111","bb111bb1111"]
|
57
52
|
attr_accessor :collection_druids
|
58
53
|
|
54
|
+
# @!attribute [rw] constituent_druids
|
55
|
+
# @return [Array<String>] bare druids of the objects that this druid is a constituent of
|
56
|
+
# @example
|
57
|
+
# ["aa11aaa1111","bb111bb1111"]
|
58
|
+
attr_accessor :constituent_druids
|
59
|
+
|
59
60
|
# @!attribute [rw] file_ids
|
60
61
|
# @return [Array] a list of the file ids in the content_metadata
|
61
62
|
# @example
|
@@ -6,6 +6,7 @@ module DiscoveryIndexer
|
|
6
6
|
RDF_NAMESPACE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
|
7
7
|
OAI_DC_NAMESPACE = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
|
8
8
|
MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'
|
9
|
+
FEDORA_NAMESPACE = 'info:fedora/fedora-system:def/relations-external#'
|
9
10
|
|
10
11
|
def initialize(druid, purlxml_ng_doc)
|
11
12
|
@purlxml_ng_doc = purlxml_ng_doc
|
@@ -21,12 +22,12 @@ module DiscoveryIndexer
|
|
21
22
|
purlxml_model.content_metadata = parse_content_metadata
|
22
23
|
purlxml_model.identity_metadata = parse_identity_metadata
|
23
24
|
purlxml_model.rights_metadata = parse_rights_metadata
|
24
|
-
purlxml_model.dc = parse_dc
|
25
|
+
purlxml_model.dc = parse_dc # why do we care?
|
25
26
|
purlxml_model.rdf = parse_rdf
|
26
27
|
purlxml_model.is_collection = parse_is_collection
|
27
|
-
purlxml_model.collection_druids =
|
28
|
+
purlxml_model.collection_druids = parse_predicate_druids('isMemberOfCollection', FEDORA_NAMESPACE)
|
29
|
+
purlxml_model.constituent_druids = parse_predicate_druids('isConstituentOf', FEDORA_NAMESPACE)
|
28
30
|
purlxml_model.dor_content_type = parse_dor_content_type
|
29
|
-
purlxml_model.dor_display_type = parse_dor_display_type
|
30
31
|
purlxml_model.release_tags_hash = parse_release_tags_hash
|
31
32
|
purlxml_model.file_ids = parse_file_ids
|
32
33
|
purlxml_model.image_ids = parse_image_ids
|
@@ -39,21 +40,23 @@ module DiscoveryIndexer
|
|
39
40
|
purlxml_model
|
40
41
|
end
|
41
42
|
|
43
|
+
private
|
44
|
+
|
42
45
|
# extracts the identityMetadata for this fedora object, from the purl xml
|
43
46
|
# @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
|
44
47
|
# @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identity_metadata
|
45
48
|
def parse_identity_metadata
|
46
|
-
|
47
|
-
fail DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if
|
48
|
-
|
49
|
+
@idmd_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
|
50
|
+
fail DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect) if !@idmd_ng_doc || @idmd_ng_doc.children.empty?
|
51
|
+
@idmd_ng_doc
|
49
52
|
rescue
|
50
53
|
raise DiscoveryIndexer::Errors::MissingIdentityMetadata.new(@purlxml_ng_doc.inspect)
|
51
54
|
end
|
52
55
|
|
53
56
|
def parse_rights_metadata
|
54
|
-
|
55
|
-
fail DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if
|
56
|
-
|
57
|
+
@rmd_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rightsMetadata').to_xml)
|
58
|
+
fail DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect) if !@rmd_ng_doc || @rmd_ng_doc.children.empty?
|
59
|
+
@rmd_ng_doc
|
57
60
|
rescue
|
58
61
|
raise DiscoveryIndexer::Errors::MissingRightsMetadata.new(@purlxml_ng_doc.inspect)
|
59
62
|
end
|
@@ -62,9 +65,9 @@ module DiscoveryIndexer
|
|
62
65
|
# @return [Nokogiri::XML::Document] the dc for the fedora object
|
63
66
|
# @raise [DiscoveryIndexer::Errors::MissingDC] if there is no dc element
|
64
67
|
def parse_dc
|
65
|
-
|
66
|
-
fail DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if
|
67
|
-
|
68
|
+
@dc_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/dc:dc', 'dc' => OAI_DC_NAMESPACE).to_xml(encoding: 'utf-8'))
|
69
|
+
fail DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect) if !@dc_ng_doc || @dc_ng_doc.children.empty?
|
70
|
+
@dc_ng_doc
|
68
71
|
rescue
|
69
72
|
raise DiscoveryIndexer::Errors::MissingDC.new(@purlxml_ng_doc.inspect)
|
70
73
|
end
|
@@ -73,9 +76,9 @@ module DiscoveryIndexer
|
|
73
76
|
# @return [Nokogiri::XML::Document] the rdf for the fedora object
|
74
77
|
# @raise [DiscoveryIndexer::Errors::MissingRDF] if there is no rdf element
|
75
78
|
def parse_rdf
|
76
|
-
|
77
|
-
fail DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if
|
78
|
-
|
79
|
+
@rdf_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/rdf:RDF', 'rdf' => RDF_NAMESPACE).to_xml)
|
80
|
+
fail DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect) if !@rdf_ng_doc || @rdf_ng_doc.children.empty?
|
81
|
+
@rdf_ng_doc
|
79
82
|
rescue
|
80
83
|
raise DiscoveryIndexer::Errors::MissingRDF.new(@purlxml_ng_doc.inspect)
|
81
84
|
end
|
@@ -101,9 +104,9 @@ module DiscoveryIndexer
|
|
101
104
|
# @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
|
102
105
|
# @raise [DiscoveryIndexer::Errors::MissingContentMetadata] if there is no contentMetadata
|
103
106
|
def parse_content_metadata
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
+
@cntmd_ng_doc ||= Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
|
108
|
+
@cntmd_ng_doc = nil if !@cntmd_ng_doc || @cntmd_ng_doc.children.empty?
|
109
|
+
@cntmd_ng_doc
|
107
110
|
end
|
108
111
|
|
109
112
|
# @return true if the identityMetadata has <objectType>collection</objectType>, false otherwise
|
@@ -116,13 +119,13 @@ module DiscoveryIndexer
|
|
116
119
|
false
|
117
120
|
end
|
118
121
|
|
119
|
-
# get the druids from
|
120
|
-
# @return [Array<String
|
121
|
-
def
|
122
|
-
ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', '
|
123
|
-
|
124
|
-
|
125
|
-
|
122
|
+
# get the druids from predicate relationships in rels-ext from public_xml
|
123
|
+
# @return [Array<String>, nil] the druids (e.g. ww123yy1234) from the rdf:resource of the predicate relationships, or nil if none
|
124
|
+
def parse_predicate_druids(predicate, predicate_ns)
|
125
|
+
ns_hash = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'pred_ns' => predicate_ns }
|
126
|
+
xpth = "/publicObject/rdf:RDF/rdf:Description/pred_ns:#{predicate}/@rdf:resource"
|
127
|
+
pred_nodes = @purlxml_ng_doc.xpath(xpth, ns_hash)
|
128
|
+
pred_nodes.reject { |n| n.value.empty? }.map do |n|
|
126
129
|
n.value.split('druid:').last
|
127
130
|
end
|
128
131
|
end
|
@@ -139,15 +142,6 @@ module DiscoveryIndexer
|
|
139
142
|
dct
|
140
143
|
end
|
141
144
|
|
142
|
-
# the value of the displyType tag from a DOR collection's identityMetadata
|
143
|
-
# @return [String]
|
144
|
-
def parse_dor_display_type
|
145
|
-
identity_md = parse_identity_metadata
|
146
|
-
ddt = identity_md ? identity_md.xpath('//displayType').text : nil
|
147
|
-
DiscoveryIndexer::Logging.logger.debug "#{@druid} has no DOR display type (<identityMetadata> element may be missing displayType tag)" if !ddt || ddt.empty?
|
148
|
-
ddt
|
149
|
-
end
|
150
|
-
|
151
145
|
# the @id attribute of resource/file elements that match the image type, including extension
|
152
146
|
# @return [Array<String>] filenames
|
153
147
|
def parse_image_ids
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: discovery-indexer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.0
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ahmed AlSum
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2016-02-
|
12
|
+
date: 2016-02-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -151,6 +151,48 @@ dependencies:
|
|
151
151
|
- - ">="
|
152
152
|
- !ruby/object:Gem::Version
|
153
153
|
version: '0'
|
154
|
+
- !ruby/object:Gem::Dependency
|
155
|
+
name: yard
|
156
|
+
requirement: !ruby/object:Gem::Requirement
|
157
|
+
requirements:
|
158
|
+
- - ">="
|
159
|
+
- !ruby/object:Gem::Version
|
160
|
+
version: '0'
|
161
|
+
type: :development
|
162
|
+
prerelease: false
|
163
|
+
version_requirements: !ruby/object:Gem::Requirement
|
164
|
+
requirements:
|
165
|
+
- - ">="
|
166
|
+
- !ruby/object:Gem::Version
|
167
|
+
version: '0'
|
168
|
+
- !ruby/object:Gem::Dependency
|
169
|
+
name: rubocop
|
170
|
+
requirement: !ruby/object:Gem::Requirement
|
171
|
+
requirements:
|
172
|
+
- - ">="
|
173
|
+
- !ruby/object:Gem::Version
|
174
|
+
version: '0'
|
175
|
+
type: :development
|
176
|
+
prerelease: false
|
177
|
+
version_requirements: !ruby/object:Gem::Requirement
|
178
|
+
requirements:
|
179
|
+
- - ">="
|
180
|
+
- !ruby/object:Gem::Version
|
181
|
+
version: '0'
|
182
|
+
- !ruby/object:Gem::Dependency
|
183
|
+
name: rubocop-rspec
|
184
|
+
requirement: !ruby/object:Gem::Requirement
|
185
|
+
requirements:
|
186
|
+
- - ">="
|
187
|
+
- !ruby/object:Gem::Version
|
188
|
+
version: '0'
|
189
|
+
type: :development
|
190
|
+
prerelease: false
|
191
|
+
version_requirements: !ruby/object:Gem::Requirement
|
192
|
+
requirements:
|
193
|
+
- - ">="
|
194
|
+
- !ruby/object:Gem::Version
|
195
|
+
version: '0'
|
154
196
|
description: This library manages the core operations for the discovery indexing such
|
155
197
|
as reading PURL xml, mapping to the solr document, and writing to solr core.
|
156
198
|
email: laneymcg@stanford.edu
|