discovery-indexer 0.1 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/discovery-indexer.rb +3 -1
- data/lib/logging.rb +16 -0
- data/lib/reader/modsxml.rb +3 -3
- data/lib/reader/purlxml.rb +3 -3
- data/lib/reader/purlxml_model.rb +55 -1
- data/lib/reader/purlxml_parser_strict.rb +11 -10
- data/lib/version.rb +1 -1
- data/lib/writer/solr_client.rb +7 -7
- data/lib/writer/solr_writer.rb +1 -2
- metadata +21 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f852ef4d15e2ba4dc67d49b1a0ec599ce5791234
|
4
|
+
data.tar.gz: 781d104e85d4b7ce490b639ca29b5aa31e03ef7e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 12bc9321557bc08c608763546a185404d47fe4d8225df32604378f864acde9ceb93f8df48c63e755ffe6fb236b7e5a60ea15453ce2e07ba18f213e37f8249d8a
|
7
|
+
data.tar.gz: 91402727cff066c16adf1f29a8b9a156dc2c175de887760b2e343fe49caed6ab97f6c8bfd190b034bf8add13095c60553f71eda75b5890b2c96bddfc82682c7a
|
data/lib/discovery-indexer.rb
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
require 'errors'
|
2
|
+
require 'logging'
|
3
|
+
|
1
4
|
require 'reader/purlxml'
|
2
5
|
require 'reader/purlxml_reader'
|
3
6
|
require 'reader/purlxml_parser'
|
@@ -15,7 +18,6 @@ require 'writer/solr_writer'
|
|
15
18
|
|
16
19
|
#require 'utilities/extract_sub_targets'
|
17
20
|
|
18
|
-
require 'errors'
|
19
21
|
|
20
22
|
module DiscoveryIndexer
|
21
23
|
PURL_DEFAULT = 'http://purl-test.stanford.edu'
|
data/lib/logging.rb
ADDED
data/lib/reader/modsxml.rb
CHANGED
@@ -5,9 +5,9 @@ module DiscoveryIndexer
|
|
5
5
|
# This class is the main class to access and parse the mods xml
|
6
6
|
# as retrieved from PURL server
|
7
7
|
# @example to run the code
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
8
|
+
# druid = "aa111aa1111"
|
9
|
+
# p = DiscoveryIndexer::InputXml::Modsxml.new(druid)
|
10
|
+
# model = p.load()
|
11
11
|
#
|
12
12
|
#
|
13
13
|
class Modsxml
|
data/lib/reader/purlxml.rb
CHANGED
@@ -4,9 +4,9 @@ module DiscoveryIndexer
|
|
4
4
|
# This class is the main class to access and parse the purl xml
|
5
5
|
# as retrieved from PURL server
|
6
6
|
# @example to run the code
|
7
|
-
#
|
8
|
-
#
|
9
|
-
#
|
7
|
+
# druid = "aa111aa1111"
|
8
|
+
# p = DiscoveryIndexer::InputXml::Purlxml.new(druid)
|
9
|
+
# model = p.load()
|
10
10
|
#
|
11
11
|
class Purlxml
|
12
12
|
|
data/lib/reader/purlxml_model.rb
CHANGED
@@ -1,21 +1,75 @@
|
|
1
1
|
module DiscoveryIndexer
|
2
2
|
module InputXml
|
3
3
|
class PurlxmlModel
|
4
|
+
|
5
|
+
#@!attribute [rw] public_xml
|
6
|
+
# @return [Nokogiri::XML] The public xml as retrieved from purl server
|
4
7
|
attr_accessor :public_xml
|
8
|
+
|
9
|
+
#@!attribute [rw] content_metadata
|
10
|
+
# @return [Nokogiri::XML] The content_metadata as extracted from public xml
|
5
11
|
attr_accessor :content_metadata
|
12
|
+
|
13
|
+
#@!attribute [rw] identity_metadata
|
14
|
+
# @return [Nokogiri::XML] The identity_metadata as extracted from public xml
|
6
15
|
attr_accessor :identity_metadata
|
16
|
+
|
17
|
+
#@!attribute [rw] rights_metadata
|
18
|
+
# @return [Nokogiri::XML] The rights_metadata as extracted from public xml
|
7
19
|
attr_accessor :rights_metadata
|
20
|
+
|
21
|
+
#@!attribute [rw] dc
|
22
|
+
# @return [Nokogiri::XML] The dc element as extracted from public xml
|
8
23
|
attr_accessor :dc
|
24
|
+
|
25
|
+
#@!attribute [rw] rdf
|
26
|
+
# @return [Nokogiri::XML] The rdf element as extracted from public xml
|
9
27
|
attr_accessor :rdf
|
28
|
+
|
29
|
+
# @!attribute [rw] release_tags_hash
|
30
|
+
# @return [Hash] The release_tag in hash format as extracted from public xml
|
31
|
+
# identity_metadata.
|
32
|
+
# @example
|
33
|
+
# !{"target1"=>true, "target2"=>false}
|
10
34
|
attr_accessor :release_tags_hash
|
35
|
+
|
36
|
+
# @!attribute [rw] dor_content_type
|
37
|
+
# @return [String] The dor_content_type as extracted from public xml
|
38
|
+
# content_metadata.
|
11
39
|
attr_accessor :dor_content_type
|
40
|
+
|
41
|
+
# @!attribute [rw] is_collection
|
42
|
+
# @return [Boolean] true if the item type is collection in the identity_metadata
|
12
43
|
attr_accessor :is_collection
|
44
|
+
|
45
|
+
# @!attribute [rw] collection_druids
|
46
|
+
# @return [Array] a list of the collections that this druid belongs to
|
47
|
+
# @example
|
48
|
+
# ["aa11aaa1111","bb111bb1111"]
|
13
49
|
attr_accessor :collection_druids
|
14
|
-
|
50
|
+
|
51
|
+
# @!attribute [rw] file_ids
|
52
|
+
# @return [Array] a list of the file ids in the content_metadata
|
53
|
+
# @example
|
54
|
+
# ["pc0065_b08_f10_i031.txt","pc0065_b08_f10_i032.txt"]
|
15
55
|
attr_accessor :file_ids
|
56
|
+
|
57
|
+
# @!attribute [rw] image_ids
|
58
|
+
# @return [Array] a list of the image ids in the content_metadata
|
59
|
+
# @example
|
60
|
+
# ["pc0065_b08_f10_i031.jp2","pc0065_b08_f10_i032.jp2"]
|
16
61
|
attr_accessor :image_ids
|
62
|
+
|
63
|
+
# @!attribute [rw] catkey
|
64
|
+
# @return [String] the catkey attribute in identity_metadata
|
17
65
|
attr_accessor :catkey
|
66
|
+
|
67
|
+
# @!attribute [rw] barcode
|
68
|
+
# @return [String] the barcode attribute in identity_metadata
|
18
69
|
attr_accessor :barcode
|
70
|
+
|
71
|
+
# @!attribute [rw] label
|
72
|
+
# @return [String] the objectLabel attribute in identity_metadata
|
19
73
|
attr_accessor :label
|
20
74
|
|
21
75
|
end
|
@@ -34,7 +34,7 @@ module DiscoveryIndexer
|
|
34
34
|
|
35
35
|
# extracts the identityMetadata for this fedora object, from the purl xml
|
36
36
|
# @return [Nokogiri::XML::Document] the identityMetadata for the fedora object
|
37
|
-
# @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no
|
37
|
+
# @raise [DiscoveryIndexer::Errors::MissingIdentityMetadata] if there is no identity_metadata
|
38
38
|
def parse_identity_metadata
|
39
39
|
begin
|
40
40
|
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/identityMetadata').to_xml)
|
@@ -113,13 +113,9 @@ module DiscoveryIndexer
|
|
113
113
|
# @return [Nokogiri::XML::Document] the contentMetadata for the fedora object
|
114
114
|
# @raise [DiscoveryIndexer::Errors::MissingContentMetadata] if there is no contentMetadata
|
115
115
|
def parse_content_metadata
|
116
|
-
# begin
|
117
116
|
ng_doc = Nokogiri::XML(@purlxml_ng_doc.root.xpath('/publicObject/contentMetadata').to_xml)
|
118
|
-
|
117
|
+
ng_doc = nil if !ng_doc || ng_doc.children.empty?
|
119
118
|
ng_doc
|
120
|
-
# rescue
|
121
|
-
# raise DiscoveryIndexer::Errors::MissingContentMetadata.new(@purlxml_ng_doc.inspect)
|
122
|
-
# end
|
123
119
|
end
|
124
120
|
|
125
121
|
# @return true if the identityMetadata has <objectType>collection</objectType>, false otherwise
|
@@ -153,18 +149,18 @@ module DiscoveryIndexer
|
|
153
149
|
# @return [String]
|
154
150
|
def parse_dor_content_type
|
155
151
|
content_md = parse_content_metadata
|
156
|
-
dct = content_md ? content_md.xpath('
|
152
|
+
dct = content_md ? content_md.xpath('contentMetadata/@type').text : nil
|
157
153
|
puts " has no DOR content type (<contentMetadata> element may be missing type attribute)" if !dct || dct.empty?
|
158
154
|
dct
|
159
155
|
end
|
160
156
|
|
161
|
-
# the @id attribute of resource/file elements that match the
|
157
|
+
# the @id attribute of resource/file elements that match the image type, including extension
|
162
158
|
# @return [Array<String>] filenames
|
163
159
|
def parse_image_ids
|
164
160
|
ids = []
|
165
161
|
content_md = parse_content_metadata
|
166
162
|
unless content_md.nil?
|
167
|
-
content_md.xpath('
|
163
|
+
content_md.xpath('//resource[@type="image"]/file/@id').each { |node|
|
168
164
|
ids << node.text if !node.text.empty?
|
169
165
|
}
|
170
166
|
return nil if ids.empty?
|
@@ -172,11 +168,13 @@ module DiscoveryIndexer
|
|
172
168
|
end
|
173
169
|
end
|
174
170
|
|
171
|
+
# the @id attribute of resource/file elements, including extension
|
172
|
+
# @return [Array<String>] filenames
|
175
173
|
def parse_file_ids
|
176
174
|
ids = []
|
177
175
|
content_md = parse_content_metadata
|
178
176
|
unless content_md.nil?
|
179
|
-
content_md.xpath('
|
177
|
+
content_md.xpath('//resource/file/@id').each { |node|
|
180
178
|
ids << node.text if !node.text.empty?
|
181
179
|
}
|
182
180
|
return nil if ids.empty?
|
@@ -184,6 +182,7 @@ module DiscoveryIndexer
|
|
184
182
|
end
|
185
183
|
end
|
186
184
|
|
185
|
+
# @return catkey value from the DOR identity_metadata, or nil if there is no catkey
|
187
186
|
def parse_catkey
|
188
187
|
catkey = nil
|
189
188
|
node = @purlxml_ng_doc.xpath("/publicObject/identityMetadata/otherId[@name='catkey']")
|
@@ -191,6 +190,7 @@ module DiscoveryIndexer
|
|
191
190
|
return catkey
|
192
191
|
end
|
193
192
|
|
193
|
+
# @return barcode value from the DOR identity_metadata, or nil if there is no barcode
|
194
194
|
def parse_barcode
|
195
195
|
barcode = nil
|
196
196
|
node = @purlxml_ng_doc.xpath("/publicObject/identityMetadata/otherId[@name='barcode']")
|
@@ -198,6 +198,7 @@ module DiscoveryIndexer
|
|
198
198
|
return barcode
|
199
199
|
end
|
200
200
|
|
201
|
+
# @return objectLabel value from the DOR identity_metadata, or nil if there is no objectLabel
|
201
202
|
def parse_label
|
202
203
|
label = nil
|
203
204
|
node = @purlxml_ng_doc.xpath("/publicObject/identityMetadata/objectLabel")
|
data/lib/version.rb
CHANGED
data/lib/writer/solr_client.rb
CHANGED
@@ -4,7 +4,8 @@ require 'rsolr'
|
|
4
4
|
module DiscoveryIndexer
|
5
5
|
module Writer
|
6
6
|
class SolrClient
|
7
|
-
|
7
|
+
include DiscoveryIndexer::Logging
|
8
|
+
|
8
9
|
# Add the document to solr, retry if an error occurs.
|
9
10
|
# See https://github.com/ooyala/retries for docs on with_retries.
|
10
11
|
# @param [Hash] solr_doc a Hash representation of the solr document
|
@@ -24,22 +25,21 @@ module DiscoveryIndexer
|
|
24
25
|
end
|
25
26
|
|
26
27
|
def self.process(solr_doc, solr_connector, max_retries, is_delete=false)
|
27
|
-
|
28
|
-
id = solr_doc[:id]
|
28
|
+
id = solr_doc[:id]
|
29
29
|
puts id
|
30
30
|
handler = Proc.new do |exception, attempt_number, total_delay|
|
31
|
-
logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
|
31
|
+
DiscoveryIndexer::Logging.logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
|
32
32
|
end
|
33
33
|
|
34
34
|
with_retries(:max_tries => max_retries, :handler => handler, :base_sleep_seconds => 1, :max_sleep_seconds => 5) do |attempt|
|
35
|
-
logger.debug "Attempt #{attempt} for #{id}"
|
35
|
+
DiscoveryIndexer::Logging.logger.debug "Attempt #{attempt} for #{id}"
|
36
36
|
|
37
37
|
if is_delete
|
38
38
|
solr_connector.delete_by_id(id)
|
39
|
-
logger.info "Successfully deleted #{id} on attempt #{attempt}"
|
39
|
+
DiscoveryIndexer::Logging.logger.info "Successfully deleted #{id} on attempt #{attempt}"
|
40
40
|
else
|
41
41
|
solr_connector.add(solr_doc)
|
42
|
-
logger.info "Successfully indexed #{id} on attempt #{attempt}"
|
42
|
+
DiscoveryIndexer::Logging.logger.info "Successfully indexed #{id} on attempt #{attempt}"
|
43
43
|
end
|
44
44
|
|
45
45
|
end
|
data/lib/writer/solr_writer.rb
CHANGED
@@ -4,6 +4,7 @@ require 'rsolr'
|
|
4
4
|
module DiscoveryIndexer
|
5
5
|
module Writer
|
6
6
|
class SolrWriter
|
7
|
+
include DiscoveryIndexer::Logging
|
7
8
|
|
8
9
|
def process(druid, index_doc, targets, solr_targets_configs)
|
9
10
|
@solr_targets_configs = solr_targets_configs
|
@@ -47,8 +48,6 @@ module DiscoveryIndexer
|
|
47
48
|
|
48
49
|
def get_connector_for_target(solr_target)
|
49
50
|
solr_connector = nil
|
50
|
-
puts solr_target
|
51
|
-
puts @solr_targets_configs
|
52
51
|
if @solr_targets_configs.keys.include?(solr_target) then
|
53
52
|
config = @solr_targets_configs[solr_target]
|
54
53
|
solr_connector = RSolr.connect(config)
|
metadata
CHANGED
@@ -1,125 +1,125 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: discovery-indexer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ahmed AlSum
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - '>='
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '0'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - '>='
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: stanford-mods
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - '>='
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '0'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: retries
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- -
|
45
|
+
- - '>='
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '0'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- -
|
52
|
+
- - '>='
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: rsolr
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- -
|
59
|
+
- - '>='
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '0'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- -
|
66
|
+
- - '>='
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rspec
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- -
|
73
|
+
- - '>='
|
74
74
|
- !ruby/object:Gem::Version
|
75
75
|
version: '0'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- -
|
80
|
+
- - '>='
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: webmock
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- -
|
87
|
+
- - '>='
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: '0'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- -
|
94
|
+
- - '>='
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: equivalent-xml
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- -
|
101
|
+
- - '>='
|
102
102
|
- !ruby/object:Gem::Version
|
103
103
|
version: '0'
|
104
104
|
type: :development
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
|
-
- -
|
108
|
+
- - '>='
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
112
|
name: vcr
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
114
114
|
requirements:
|
115
|
-
- -
|
115
|
+
- - '>='
|
116
116
|
- !ruby/object:Gem::Version
|
117
117
|
version: '0'
|
118
118
|
type: :development
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
|
-
- -
|
122
|
+
- - '>='
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
125
|
description: This library manages the core operations for the discovery indexing such
|
@@ -131,6 +131,7 @@ extra_rdoc_files: []
|
|
131
131
|
files:
|
132
132
|
- lib/discovery-indexer.rb
|
133
133
|
- lib/errors.rb
|
134
|
+
- lib/logging.rb
|
134
135
|
- lib/mapper/general_mapper.rb
|
135
136
|
- lib/mapper/index_mapper.rb
|
136
137
|
- lib/reader/modsxml.rb
|
@@ -154,12 +155,12 @@ require_paths:
|
|
154
155
|
- lib
|
155
156
|
required_ruby_version: !ruby/object:Gem::Requirement
|
156
157
|
requirements:
|
157
|
-
- -
|
158
|
+
- - '>='
|
158
159
|
- !ruby/object:Gem::Version
|
159
160
|
version: '0'
|
160
161
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
161
162
|
requirements:
|
162
|
-
- -
|
163
|
+
- - '>='
|
163
164
|
- !ruby/object:Gem::Version
|
164
165
|
version: '0'
|
165
166
|
requirements: []
|