harvestdor-indexer 1.0.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d9f1f173b34815b82678ad0919e7dea21b830297
4
- data.tar.gz: e62f1eed6b4037528d2ad16c7df4a72a71d50d94
3
+ metadata.gz: 60218ff6d0f0210900b1b6a0df4d3a09a7122c34
4
+ data.tar.gz: 04ba680a7f7864dca78cbc9751c8eb9f6f9c811b
5
5
  SHA512:
6
- metadata.gz: b3ee85a3b7d93200af455520f470152239ca185121bce1855bfd009b01f83b6135c29d2eccdcf8d6b0215637d675ccd6c6614d5380fbc003102eef0631bf9949
7
- data.tar.gz: 34eec119b372e6590e266bb1a0ee87420de780f7f0b908d5570df0f9c58ab68c4e2363e15a525533a4bd56cce48b73d6369f5fd41cb34e8cbb31718456e02b83
6
+ metadata.gz: 2df842d3b19d9750f7a5e88f9af9dab60cfffa34cc80d7584b5b16f825aff64478e4c1ba3dde05de5749a9b8c17cbabd29fb55b4c71c965253b32d446686013b
7
+ data.tar.gz: 3c479f7be2c27ad39acad3316b77987dce8c487a654f47758513a4c4d094973ed7b0168f85307a5135de212d7469478e4e475285bd490250058b03869fea6619
@@ -1,10 +1,12 @@
1
1
  language: ruby
2
2
  script: rake rspec
3
3
  rvm:
4
- - 2.1.1
4
+ - 2.2.0
5
+ - 2.1.5
5
6
  - 2.0.0
6
7
  - 1.9.3
7
- - jruby-19mode # JRuby in 1.9 mode
8
+ - jruby-1.7.9-d19 # fails after 1.7.10 for some reason
9
+ # - jruby-19mode # JRuby in 1.9 mode
8
10
  notifications:
9
11
  email:
10
12
  - ndushay@stanford.edu
data/Gemfile CHANGED
@@ -1,4 +1,4 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
3
  # See harvestdor-indexer.gemspec for this gem's dependencies
4
- gemspec
4
+ gemspec
@@ -118,6 +118,7 @@ I suggest you run your code on harvestdor-dev, as it is already set up to be abl
118
118
 
119
119
  == Releases
120
120
 
121
+ * <b>2.0.0</b> Complete refactor to update APIs, merge configuration yml files, update to rspec 3
121
122
  * <b>1.0.4</b> Set skip_heartbeat to true in the initialization of the DorFetcher::Client for ease of testing
122
123
  * <b>1.0.3</b> Implemented class level config so anything that inherits from Harvestdor::Indexer can share configuration settings
123
124
  * <b>1.0.0</b> Replaced OAI harvesting mechanism with dor-fetcher
@@ -1,7 +1,7 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
  lib = File.expand_path('../lib', __FILE__)
3
3
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require 'harvestdor-indexer/version'
4
+ require 'harvestdor/indexer/version'
5
5
 
6
6
  Gem::Specification.new do |gem|
7
7
  gem.name = "harvestdor-indexer"
@@ -22,6 +22,8 @@ Gem::Specification.new do |gem|
22
22
  gem.add_dependency 'harvestdor', '>=0.0.14'
23
23
  gem.add_dependency 'stanford-mods'
24
24
  gem.add_dependency 'dor-fetcher', '=1.0.5'
25
+ gem.add_dependency "activesupport"
26
+ gem.add_dependency "parallel"
25
27
 
26
28
  # Runtime dependencies
27
29
  gem.add_runtime_dependency 'confstruct'
@@ -34,7 +36,7 @@ Gem::Specification.new do |gem|
34
36
  gem.add_development_dependency "rdoc"
35
37
  gem.add_development_dependency "yard"
36
38
  # tests
37
- gem.add_development_dependency 'rspec'
39
+ gem.add_development_dependency 'rspec', "~> 3.0"
38
40
  gem.add_development_dependency 'coveralls'
39
41
  # gem.add_development_dependency 'ruby-debug19'
40
42
  gem.add_development_dependency 'vcr'
@@ -1,317 +1 @@
1
- # external gems
2
- require 'confstruct'
3
- require 'rsolr'
4
- require 'retries'
5
- require 'json'
6
-
7
- # sul-dlss gems
8
- require 'harvestdor'
9
- require 'stanford-mods'
10
- require 'dor-fetcher'
11
-
12
- # stdlib
13
- require 'logger'
14
-
15
- require "harvestdor-indexer/version"
16
-
17
- module Harvestdor
18
- # Base class to harvest from DOR via harvestdor gem and then index
19
- class Indexer
20
-
21
- attr_accessor :error_count, :success_count, :max_retries
22
- attr_accessor :total_time_to_parse,:total_time_to_solr
23
- attr_accessor :dor_fetcher_client, :client_config
24
-
25
-
26
- # Class level config variable
27
- @@config ||= Confstruct::Configuration.new()
28
-
29
- def initialize yml_path, client_config_path, options = {}
30
- @success_count=0 # the number of objects successfully indexed
31
- @error_count=0 # the number of objects that failed
32
- @max_retries=10 # the number of times to retry an object
33
- @total_time_to_solr=0
34
- @total_time_to_parse=0
35
- @yml_path = yml_path
36
- config.configure(YAML.load_file(yml_path)) if yml_path
37
- config.configure options
38
- yield(config) if block_given?
39
- @client_config = YAML.load_file(client_config_path) if client_config_path && File.exists?(client_config_path)
40
- # Adding skip_heartbeat param for easier testing
41
- @dor_fetcher_client=DorFetcher::Client.new({:service_url => client_config["dor_fetcher_service_url"], :skip_heartbeat => true})
42
- end
43
-
44
- # to allow class level access to config variables for record_merger and solr_doc_builder
45
- # (rather than passing a lot of params to constructor)
46
- def self.config
47
- @@config ||= Confstruct::Configuration.new()
48
- end
49
-
50
- def config
51
- Indexer.config
52
- end
53
-
54
- def logger
55
- @logger ||= load_logger(config.log_dir, config.log_name)
56
- end
57
-
58
- # per this Indexer's config options
59
- # harvest the druids via DorFetcher
60
- # create a Solr profiling document for each druid
61
- # write the result to the Solr index
62
- def harvest_and_index
63
- start_time=Time.now
64
- logger.info("Started harvest_and_index at #{start_time}")
65
- if whitelist.empty?
66
- druids.each { |druid| index druid }
67
- else
68
- whitelist.each { |druid| index druid }
69
- end
70
- solr_client.commit
71
- total_time=elapsed_time(start_time)
72
- total_objects=@success_count+@error_count
73
- logger.info("Finished harvest_and_index at #{Time.now}: final Solr commit returned")
74
- logger.info("Total elapsed time for harvest and index: #{(total_time/60.0).round(2)} minutes")
75
- logger.info("Avg solr commit time per object (successful): #{(@total_time_to_solr/@success_count).round(2)} seconds") unless (@total_time_to_solr == 0 || @success_count == 0)
76
- logger.info("Avg solr commit time per object (all): #{(@total_time_to_solr/total_objects).round(2)} seconds") unless (@total_time_to_solr == 0 || @error_count == 0 || total_objects == 0)
77
- logger.info("Avg parse time per object (successful): #{(@total_time_to_parse/@success_count).round(2)} seconds") unless (@total_time_to_parse == 0 || @success_count == 0)
78
- logger.info("Avg parse time per object (all): #{(@total_time_to_parse/total_objects).round(2)} seconds") unless (@total_time_to_parse == 0 || @error_count == 0 || total_objects == 0)
79
- logger.info("Avg complete index time per object (successful): #{(total_time/@success_count).round(2)} seconds") unless (@success_count == 0)
80
- logger.info("Avg complete index time per object (all): #{(total_time/total_objects).round(2)} seconds") unless (@error_count == 0 || total_objects == 0)
81
- logger.info("Successful count: #{@success_count}")
82
- logger.info("Error count: #{@error_count}")
83
- logger.info("Total records processed: #{total_objects}")
84
- end
85
-
86
- # return Array of druids contained in the DorFetcher pulling indicated by DorFetcher params
87
- # @return [Array<String>] or enumeration over it, if block is given. (strings are druids, e.g. ab123cd1234)
88
- def druids
89
- if @druids.nil?
90
- start_time=Time.now
91
- logger.info("Starting DorFetcher pulling of druids at #{start_time}.")
92
- @druids = @dor_fetcher_client.druid_array(@dor_fetcher_client.get_collection(strip_default_set_string(), {}))
93
- logger.info("Completed DorFetcher pulling of druids at #{Time.now}. Found #{@druids.size} druids. Total elapsed time for DorFetcher pulling = #{elapsed_time(start_time,:minutes)} minutes")
94
- end
95
- return @druids
96
- end
97
-
98
- # Add the document to solr, retry if an error occurs.
99
- # See https://github.com/ooyala/retries for docs on with_retries.
100
- # @param [Hash] doc a Hash representation of the solr document
101
- # @param [String] id the id of the document being sent, for logging
102
- def solr_add(doc, id)
103
- max_tries=@max_retries ? @max_retries : 10 #if @max_retries isn't set, use 10
104
-
105
- handler = Proc.new do |exception, attempt_number, total_delay|
106
- logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
107
- # logger.debug exception.backtrace
108
- end
109
-
110
- with_retries(:max_tries => max_tries, :handler => handler, :base_sleep_seconds => 1, :max_sleep_seconds => 5) do |attempt|
111
- logger.debug "Attempt #{attempt} for #{id}"
112
- solr_client.add(doc)
113
- logger.info "Successfully indexed #{id} on attempt #{attempt}"
114
- end
115
- end
116
-
117
- # create Solr doc for the druid and add it to Solr, unless it is on the blacklist.
118
- # NOTE: don't forget to send commit to Solr, either once at end (already in harvest_and_index), or for each add, or ...
119
- def index druid
120
- if blacklist.include?(druid)
121
- logger.info("Druid #{druid} is on the blacklist and will have no Solr doc created")
122
- else
123
- logger.fatal("You must override the index method to transform druids into Solr docs and add them to Solr")
124
-
125
- begin
126
- start_time=Time.now
127
- logger.info("About to index #{druid} at #{start_time}")
128
- #logger.debug "About to index #{druid}"
129
- doc_hash = {}
130
- doc_hash[:id] = druid
131
- # doc_hash[:title_tsim] = smods_rec(druid).short_title
132
-
133
- # you might add things from Indexer level class here
134
- # (e.g. things that are the same across all documents in the harvest)
135
-
136
- solr_client.add(doc_hash)
137
-
138
- logger.info("Indexed #{druid} in #{elapsed_time(start_time)} seconds")
139
- @success_count+=1
140
- # TODO: provide call to code to update DOR object's workflow datastream??
141
- rescue => e
142
- @error_count+=1
143
- logger.error "Failed to index #{druid} in #{elapsed_time(start_time)} seconds: #{e.message}"
144
- end
145
- end
146
- end
147
-
148
- # return the MODS for the druid as a Stanford::Mods::Record object
149
- # @param [String] druid e.g. ab123cd4567
150
- # @return [Stanford::Mods::Record] created from the MODS xml for the druid
151
- def smods_rec druid
152
- start_time=Time.now
153
- ng_doc = harvestdor_client.mods druid
154
- logger.info("Fetched MODs for #{druid} in #{elapsed_time(start_time)} seconds")
155
- raise "Empty MODS metadata for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
156
- mods_rec = Stanford::Mods::Record.new
157
- mods_rec.from_nk_node(ng_doc.root)
158
- mods_rec
159
- end
160
-
161
- # the public xml for this DOR object, from the purl page
162
- # @param [String] druid e.g. ab123cd4567
163
- # @return [Nokogiri::XML::Document] the public xml for the DOR object
164
- def public_xml druid
165
- start_time=Time.now
166
- ng_doc = harvestdor_client.public_xml druid
167
- logger.info("Fetched public_xml for #{druid} in #{elapsed_time(start_time)} seconds")
168
- raise "No public xml for #{druid}" if !ng_doc
169
- raise "Empty public xml for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
170
- ng_doc
171
- end
172
-
173
- # the contentMetadata for this DOR object, ultimately from the purl public xml
174
- # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
175
- # a Nokogiri::XML::Document containing the public_xml for an object
176
- # @return [Nokogiri::XML::Document] the contentMetadata for the DOR object
177
- def content_metadata object
178
- start_time=Time.now
179
- ng_doc = harvestdor_client.content_metadata object
180
- logger.info("Fetched content_metadata in #{elapsed_time(start_time)} seconds")
181
- raise "No contentMetadata for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
182
- ng_doc
183
- end
184
-
185
- # the identityMetadata for this DOR object, ultimately from the purl public xml
186
- # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
187
- # a Nokogiri::XML::Document containing the public_xml for an object
188
- # @return [Nokogiri::XML::Document] the identityMetadata for the DOR object
189
- def identity_metadata object
190
- start_time=Time.now
191
- ng_doc = harvestdor_client.identity_metadata object
192
- logger.info("Fetched identity_metadata in #{elapsed_time(start_time)} seconds")
193
- raise "No identityMetadata for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
194
- ng_doc
195
- end
196
-
197
- # the rightsMetadata for this DOR object, ultimately from the purl public xml
198
- # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
199
- # a Nokogiri::XML::Document containing the public_xml for an object
200
- # @return [Nokogiri::XML::Document] the rightsMetadata for the DOR object
201
- def rights_metadata object
202
- start_time=Time.now
203
- ng_doc = harvestdor_client.rights_metadata object
204
- logger.info("Fetched rights_metadata in #{elapsed_time(start_time)} seconds")
205
- raise "No rightsMetadata for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
206
- ng_doc
207
- end
208
-
209
- # the RDF for this DOR object, ultimately from the purl public xml
210
- # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
211
- # a Nokogiri::XML::Document containing the public_xml for an object
212
- # @return [Nokogiri::XML::Document] the RDF for the DOR object
213
- def rdf object
214
- start_time=Time.now
215
- ng_doc = harvestdor_client.rdf object
216
- logger.info("Fetched rdf in #{elapsed_time(start_time)} seconds")
217
- raise "No RDF for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
218
- ng_doc
219
- end
220
-
221
- def solr_client
222
- @solr_client ||= RSolr.connect(config.solr.to_hash)
223
- end
224
-
225
- # @return an Array of druids ('oo000oo0000') that should NOT be processed
226
- def blacklist
227
- # avoid trying to load the file multiple times
228
- if !@blacklist && !@loaded_blacklist
229
- @blacklist = load_blacklist(config.blacklist) if config.blacklist
230
- end
231
- @blacklist ||= []
232
- end
233
-
234
- # @return an Array of druids ('oo000oo0000') that should be processed
235
- def whitelist
236
- # avoid trying to load the file multiple times
237
- if !@whitelist && !@loaded_whitelist
238
- @whitelist = load_whitelist(config.whitelist) if config.whitelist
239
- end
240
- @whitelist ||= []
241
- end
242
-
243
- # Get only the druid from the end of the default_set string
244
- # from the yml file
245
- def strip_default_set_string()
246
- Indexer.config.default_set.split('_').last
247
- end
248
-
249
- protected #---------------------------------------------------------------------
250
-
251
- def harvestdor_client
252
- @harvestdor_client ||= Harvestdor::Client.new({:config_yml_path => @yml_path})
253
- end
254
-
255
- def elapsed_time(start_time,units=:seconds)
256
- elapsed_seconds=Time.now-start_time
257
- case units
258
- when :seconds
259
- return elapsed_seconds.round(2)
260
- when :minutes
261
- return (elapsed_seconds/60.0).round(1)
262
- when :hours
263
- return (elapsed_seconds/3600.0).round(2)
264
- else
265
- return elapsed_seconds
266
- end
267
- end
268
-
269
- # populate @blacklist as an Array of druids ('oo000oo0000') that will NOT be processed
270
- # by reading the File at the indicated path
271
- # @param [String] path - path of file containing a list of druids
272
- def load_blacklist path
273
- if path && !@loaded_blacklist
274
- @loaded_blacklist = true
275
- @blacklist = load_id_list path
276
- end
277
- end
278
-
279
- # populate @blacklist as an Array of druids ('oo000oo0000') that WILL be processed
280
- # (unless a druid is also on the blacklist)
281
- # by reading the File at the indicated path
282
- # @param [String] path - path of file containing a list of druids
283
- def load_whitelist path
284
- if path && !@loaded_whitelist
285
- @loaded_whitelist = true
286
- @whitelist = load_id_list path
287
- end
288
- end
289
-
290
- # return an Array of druids ('oo000oo0000')
291
- # populated by reading the File at the indicated path
292
- # @param [String] path - path of file containing a list of druids
293
- # @return [Array<String>] an Array of druids
294
- def load_id_list path
295
- if path
296
- list = []
297
- f = File.open(path).each_line { |line|
298
- list << line.gsub(/\s+/, '') if !line.gsub(/\s+/, '').empty? && !line.strip.start_with?('#')
299
- }
300
- list
301
- end
302
- rescue
303
- msg = "Unable to find list of druids at " + path
304
- logger.fatal msg
305
- raise msg
306
- end
307
-
308
- # Global, memoized, lazy initialized instance of a logger
309
- # @param [String] log_dir directory for to get log file
310
- # @param [String] log_name name of log file
311
- def load_logger(log_dir, log_name)
312
- Dir.mkdir(log_dir) unless File.directory?(log_dir)
313
- @logger ||= Logger.new(File.join(log_dir, log_name), 'daily')
314
- end
315
-
316
- end # Indexer class
317
- end # Harvestdor module
1
+ require 'harvestdor/indexer'
@@ -0,0 +1,159 @@
1
+
2
+ # external gems
3
+ require 'confstruct'
4
+ require 'rsolr'
5
+ require 'retries'
6
+ require 'parallel'
7
+ require 'json'
8
+
9
+ # sul-dlss gems
10
+ require 'harvestdor'
11
+ require 'stanford-mods'
12
+ require 'dor-fetcher'
13
+
14
+ # stdlib
15
+ require 'logger'
16
+
17
+ require "harvestdor/indexer/version"
18
+
19
+ require 'active_support/benchmarkable'
20
+ module Harvestdor
21
+ # Base class to harvest from DOR via harvestdor gem and then index
22
+ class Indexer
23
+ require "harvestdor/indexer/metrics"
24
+ require "harvestdor/indexer/resource"
25
+ require "harvestdor/indexer/solr"
26
+
27
+ include ActiveSupport::Benchmarkable
28
+
29
+ attr_accessor :metrics, :logger
30
+
31
+ def initialize options = {}
32
+ config.configure(options)
33
+ yield(config) if block_given?
34
+ @metrics = Harvestdor::Indexer::Metrics.new logger: logger
35
+ end
36
+
37
+ def config
38
+ @config ||= Confstruct::Configuration.new
39
+ end
40
+
41
+ def logger
42
+ @logger ||= begin
43
+ if config.harvestdor
44
+ Dir.mkdir(config.harvestdor.log_dir) unless File.directory?(config.harvestdor.log_dir)
45
+ Logger.new(File.join(config.harvestdor.log_dir, config.harvestdor.log_name), 'daily')
46
+ else
47
+ Logger.new STDERR
48
+ end
49
+ end
50
+ end
51
+
52
+ # per this Indexer's config options
53
+ # harvest the druids via DorFetcher
54
+ # create a Solr profiling document for each druid
55
+ # write the result to the Solr index
56
+ def harvest_and_index each_options = {in_threads: 4}
57
+ benchmark "Harvest and Indexing" do
58
+ each_resource(each_options) do |resource|
59
+ index resource
60
+ end
61
+
62
+ solr.commit!
63
+ end
64
+ end
65
+
66
+ def resources
67
+ druids.map do |x|
68
+ Harvestdor::Indexer::Resource.new(self, x)
69
+ end.map do |x|
70
+ [x, (x.items if x.collection?)]
71
+ end.flatten.uniq.compact
72
+ end
73
+
74
+ def each_resource options = {}, &block
75
+ benchmark "" do
76
+ Parallel.each(resources, options) do |resource|
77
+ metrics.tally on_error: method(:resource_error) do
78
+ yield resource
79
+ end
80
+ end
81
+ end
82
+
83
+ logger.info("Successful count: #{metrics.success_count}")
84
+ logger.info("Error count: #{metrics.error_count}")
85
+ logger.info("Total records processed: #{metrics.total}")
86
+ end
87
+
88
+ def resource_error e
89
+ if e.instance_of? Parallel::Break or e.instance_of? Parallel::Kill
90
+ raise e
91
+ end
92
+ end
93
+
94
+ # return Array of druids contained in the DorFetcher pulling indicated by DorFetcher params
95
+ # @return [Array<String>] or enumeration over it, if block is given. (strings are druids, e.g. ab123cd1234)
96
+ def druids
97
+ @druids ||= whitelist
98
+ end
99
+
100
+ # create Solr doc for the druid and add it to Solr
101
+ # NOTE: don't forget to send commit to Solr, either once at end (already in harvest_and_index), or for each add, or ...
102
+ def index resource
103
+
104
+ benchmark "Indexing #{resource.druid}" do
105
+ logger.debug "About to index #{resource.druid}"
106
+ doc_hash = {}
107
+ doc_hash[:id] = resource.druid
108
+
109
+ # you might add things from Indexer level class here
110
+ # (e.g. things that are the same across all documents in the harvest)
111
+ solr.add doc_hash
112
+ # TODO: provide call to code to update DOR object's workflow datastream??
113
+ end
114
+ end
115
+
116
+ # @return an Array of druids ('oo000oo0000') that should be processed
117
+ def whitelist
118
+ @whitelist ||= config.whitelist if config.whitelist.is_a? Array
119
+ @whitelist ||= load_whitelist(config.whitelist) if config.whitelist
120
+ @whitelist ||= []
121
+ end
122
+
123
+ def harvestdor_client
124
+ @harvestdor_client ||= Harvestdor::Client.new(config.harvestdor)
125
+ end
126
+
127
+ def dor_fetcher_client
128
+ @dor_fetcher_client ||= DorFetcher::Client.new(config.dor_fetcher)
129
+ end
130
+
131
+ def solr
132
+ @solr ||= Harvestdor::Indexer::Solr.new self, config.solr.to_hash
133
+ end
134
+
135
+ protected #---------------------------------------------------------------------
136
+
137
+ # populate @whitelist as an Array of druids ('oo000oo0000') that WILL be processed
138
+ # by reading the File at the indicated path
139
+ # @param [String] path - path of file containing a list of druids
140
+ def load_whitelist path
141
+ @whitelist = load_id_list path
142
+ end
143
+
144
+ # return an Array of druids ('oo000oo0000')
145
+ # populated by reading the File at the indicated path
146
+ # @param [String] path - path of file containing a list of druids
147
+ # @return [Array<String>] an Array of druids
148
+ def load_id_list path
149
+ list = File.open(path).each_line
150
+ .map { |line| line.strip }
151
+ .reject { |line| line.strip.start_with?('#') }
152
+ .reject { |line| line.empty? }
153
+ rescue
154
+ msg = "Unable to find list of druids at " + path
155
+ logger.fatal msg
156
+ raise msg
157
+ end
158
+ end # Indexer class
159
+ end # Harvestdor module