harvestdor-indexer 1.0.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d9f1f173b34815b82678ad0919e7dea21b830297
4
- data.tar.gz: e62f1eed6b4037528d2ad16c7df4a72a71d50d94
3
+ metadata.gz: 60218ff6d0f0210900b1b6a0df4d3a09a7122c34
4
+ data.tar.gz: 04ba680a7f7864dca78cbc9751c8eb9f6f9c811b
5
5
  SHA512:
6
- metadata.gz: b3ee85a3b7d93200af455520f470152239ca185121bce1855bfd009b01f83b6135c29d2eccdcf8d6b0215637d675ccd6c6614d5380fbc003102eef0631bf9949
7
- data.tar.gz: 34eec119b372e6590e266bb1a0ee87420de780f7f0b908d5570df0f9c58ab68c4e2363e15a525533a4bd56cce48b73d6369f5fd41cb34e8cbb31718456e02b83
6
+ metadata.gz: 2df842d3b19d9750f7a5e88f9af9dab60cfffa34cc80d7584b5b16f825aff64478e4c1ba3dde05de5749a9b8c17cbabd29fb55b4c71c965253b32d446686013b
7
+ data.tar.gz: 3c479f7be2c27ad39acad3316b77987dce8c487a654f47758513a4c4d094973ed7b0168f85307a5135de212d7469478e4e475285bd490250058b03869fea6619
@@ -1,10 +1,12 @@
1
1
  language: ruby
2
2
  script: rake rspec
3
3
  rvm:
4
- - 2.1.1
4
+ - 2.2.0
5
+ - 2.1.5
5
6
  - 2.0.0
6
7
  - 1.9.3
7
- - jruby-19mode # JRuby in 1.9 mode
8
+ - jruby-1.7.9-d19 # fails after 1.7.10 for some reason
9
+ # - jruby-19mode # JRuby in 1.9 mode
8
10
  notifications:
9
11
  email:
10
12
  - ndushay@stanford.edu
data/Gemfile CHANGED
@@ -1,4 +1,4 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
3
  # See harvestdor-indexer.gemspec for this gem's dependencies
4
- gemspec
4
+ gemspec
@@ -118,6 +118,7 @@ I suggest you run your code on harvestdor-dev, as it is already set up to be abl
118
118
 
119
119
  == Releases
120
120
 
121
+ * <b>2.0.0</b> Complete refactor to update APIs, merge configuration yml files, update to rspec 3
121
122
  * <b>1.0.4</b> Set skip_heartbeat to true in the initialization of the DorFetcher::Client for ease of testing
122
123
  * <b>1.0.3</b> Implemented class level config so anything that inherits from Harvestdor::Indexer can share configuration settings
123
124
  * <b>1.0.0</b> Replaced OAI harvesting mechanism with dor-fetcher
@@ -1,7 +1,7 @@
1
1
  # -*- encoding: utf-8 -*-
2
2
  lib = File.expand_path('../lib', __FILE__)
3
3
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require 'harvestdor-indexer/version'
4
+ require 'harvestdor/indexer/version'
5
5
 
6
6
  Gem::Specification.new do |gem|
7
7
  gem.name = "harvestdor-indexer"
@@ -22,6 +22,8 @@ Gem::Specification.new do |gem|
22
22
  gem.add_dependency 'harvestdor', '>=0.0.14'
23
23
  gem.add_dependency 'stanford-mods'
24
24
  gem.add_dependency 'dor-fetcher', '=1.0.5'
25
+ gem.add_dependency "activesupport"
26
+ gem.add_dependency "parallel"
25
27
 
26
28
  # Runtime dependencies
27
29
  gem.add_runtime_dependency 'confstruct'
@@ -34,7 +36,7 @@ Gem::Specification.new do |gem|
34
36
  gem.add_development_dependency "rdoc"
35
37
  gem.add_development_dependency "yard"
36
38
  # tests
37
- gem.add_development_dependency 'rspec'
39
+ gem.add_development_dependency 'rspec', "~> 3.0"
38
40
  gem.add_development_dependency 'coveralls'
39
41
  # gem.add_development_dependency 'ruby-debug19'
40
42
  gem.add_development_dependency 'vcr'
@@ -1,317 +1 @@
1
- # external gems
2
- require 'confstruct'
3
- require 'rsolr'
4
- require 'retries'
5
- require 'json'
6
-
7
- # sul-dlss gems
8
- require 'harvestdor'
9
- require 'stanford-mods'
10
- require 'dor-fetcher'
11
-
12
- # stdlib
13
- require 'logger'
14
-
15
- require "harvestdor-indexer/version"
16
-
17
- module Harvestdor
18
- # Base class to harvest from DOR via harvestdor gem and then index
19
- class Indexer
20
-
21
- attr_accessor :error_count, :success_count, :max_retries
22
- attr_accessor :total_time_to_parse,:total_time_to_solr
23
- attr_accessor :dor_fetcher_client, :client_config
24
-
25
-
26
- # Class level config variable
27
- @@config ||= Confstruct::Configuration.new()
28
-
29
- def initialize yml_path, client_config_path, options = {}
30
- @success_count=0 # the number of objects successfully indexed
31
- @error_count=0 # the number of objects that failed
32
- @max_retries=10 # the number of times to retry an object
33
- @total_time_to_solr=0
34
- @total_time_to_parse=0
35
- @yml_path = yml_path
36
- config.configure(YAML.load_file(yml_path)) if yml_path
37
- config.configure options
38
- yield(config) if block_given?
39
- @client_config = YAML.load_file(client_config_path) if client_config_path && File.exists?(client_config_path)
40
- # Adding skip_heartbeat param for easier testing
41
- @dor_fetcher_client=DorFetcher::Client.new({:service_url => client_config["dor_fetcher_service_url"], :skip_heartbeat => true})
42
- end
43
-
44
- # to allow class level access to config variables for record_merger and solr_doc_builder
45
- # (rather than passing a lot of params to constructor)
46
- def self.config
47
- @@config ||= Confstruct::Configuration.new()
48
- end
49
-
50
- def config
51
- Indexer.config
52
- end
53
-
54
- def logger
55
- @logger ||= load_logger(config.log_dir, config.log_name)
56
- end
57
-
58
- # per this Indexer's config options
59
- # harvest the druids via DorFetcher
60
- # create a Solr profiling document for each druid
61
- # write the result to the Solr index
62
- def harvest_and_index
63
- start_time=Time.now
64
- logger.info("Started harvest_and_index at #{start_time}")
65
- if whitelist.empty?
66
- druids.each { |druid| index druid }
67
- else
68
- whitelist.each { |druid| index druid }
69
- end
70
- solr_client.commit
71
- total_time=elapsed_time(start_time)
72
- total_objects=@success_count+@error_count
73
- logger.info("Finished harvest_and_index at #{Time.now}: final Solr commit returned")
74
- logger.info("Total elapsed time for harvest and index: #{(total_time/60.0).round(2)} minutes")
75
- logger.info("Avg solr commit time per object (successful): #{(@total_time_to_solr/@success_count).round(2)} seconds") unless (@total_time_to_solr == 0 || @success_count == 0)
76
- logger.info("Avg solr commit time per object (all): #{(@total_time_to_solr/total_objects).round(2)} seconds") unless (@total_time_to_solr == 0 || @error_count == 0 || total_objects == 0)
77
- logger.info("Avg parse time per object (successful): #{(@total_time_to_parse/@success_count).round(2)} seconds") unless (@total_time_to_parse == 0 || @success_count == 0)
78
- logger.info("Avg parse time per object (all): #{(@total_time_to_parse/total_objects).round(2)} seconds") unless (@total_time_to_parse == 0 || @error_count == 0 || total_objects == 0)
79
- logger.info("Avg complete index time per object (successful): #{(total_time/@success_count).round(2)} seconds") unless (@success_count == 0)
80
- logger.info("Avg complete index time per object (all): #{(total_time/total_objects).round(2)} seconds") unless (@error_count == 0 || total_objects == 0)
81
- logger.info("Successful count: #{@success_count}")
82
- logger.info("Error count: #{@error_count}")
83
- logger.info("Total records processed: #{total_objects}")
84
- end
85
-
86
- # return Array of druids contained in the DorFetcher pulling indicated by DorFetcher params
87
- # @return [Array<String>] or enumeration over it, if block is given. (strings are druids, e.g. ab123cd1234)
88
- def druids
89
- if @druids.nil?
90
- start_time=Time.now
91
- logger.info("Starting DorFetcher pulling of druids at #{start_time}.")
92
- @druids = @dor_fetcher_client.druid_array(@dor_fetcher_client.get_collection(strip_default_set_string(), {}))
93
- logger.info("Completed DorFetcher pulling of druids at #{Time.now}. Found #{@druids.size} druids. Total elapsed time for DorFetcher pulling = #{elapsed_time(start_time,:minutes)} minutes")
94
- end
95
- return @druids
96
- end
97
-
98
- # Add the document to solr, retry if an error occurs.
99
- # See https://github.com/ooyala/retries for docs on with_retries.
100
- # @param [Hash] doc a Hash representation of the solr document
101
- # @param [String] id the id of the document being sent, for logging
102
- def solr_add(doc, id)
103
- max_tries=@max_retries ? @max_retries : 10 #if @max_retries isn't set, use 10
104
-
105
- handler = Proc.new do |exception, attempt_number, total_delay|
106
- logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
107
- # logger.debug exception.backtrace
108
- end
109
-
110
- with_retries(:max_tries => max_tries, :handler => handler, :base_sleep_seconds => 1, :max_sleep_seconds => 5) do |attempt|
111
- logger.debug "Attempt #{attempt} for #{id}"
112
- solr_client.add(doc)
113
- logger.info "Successfully indexed #{id} on attempt #{attempt}"
114
- end
115
- end
116
-
117
- # create Solr doc for the druid and add it to Solr, unless it is on the blacklist.
118
- # NOTE: don't forget to send commit to Solr, either once at end (already in harvest_and_index), or for each add, or ...
119
- def index druid
120
- if blacklist.include?(druid)
121
- logger.info("Druid #{druid} is on the blacklist and will have no Solr doc created")
122
- else
123
- logger.fatal("You must override the index method to transform druids into Solr docs and add them to Solr")
124
-
125
- begin
126
- start_time=Time.now
127
- logger.info("About to index #{druid} at #{start_time}")
128
- #logger.debug "About to index #{druid}"
129
- doc_hash = {}
130
- doc_hash[:id] = druid
131
- # doc_hash[:title_tsim] = smods_rec(druid).short_title
132
-
133
- # you might add things from Indexer level class here
134
- # (e.g. things that are the same across all documents in the harvest)
135
-
136
- solr_client.add(doc_hash)
137
-
138
- logger.info("Indexed #{druid} in #{elapsed_time(start_time)} seconds")
139
- @success_count+=1
140
- # TODO: provide call to code to update DOR object's workflow datastream??
141
- rescue => e
142
- @error_count+=1
143
- logger.error "Failed to index #{druid} in #{elapsed_time(start_time)} seconds: #{e.message}"
144
- end
145
- end
146
- end
147
-
148
- # return the MODS for the druid as a Stanford::Mods::Record object
149
- # @param [String] druid e.g. ab123cd4567
150
- # @return [Stanford::Mods::Record] created from the MODS xml for the druid
151
- def smods_rec druid
152
- start_time=Time.now
153
- ng_doc = harvestdor_client.mods druid
154
- logger.info("Fetched MODs for #{druid} in #{elapsed_time(start_time)} seconds")
155
- raise "Empty MODS metadata for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
156
- mods_rec = Stanford::Mods::Record.new
157
- mods_rec.from_nk_node(ng_doc.root)
158
- mods_rec
159
- end
160
-
161
- # the public xml for this DOR object, from the purl page
162
- # @param [String] druid e.g. ab123cd4567
163
- # @return [Nokogiri::XML::Document] the public xml for the DOR object
164
- def public_xml druid
165
- start_time=Time.now
166
- ng_doc = harvestdor_client.public_xml druid
167
- logger.info("Fetched public_xml for #{druid} in #{elapsed_time(start_time)} seconds")
168
- raise "No public xml for #{druid}" if !ng_doc
169
- raise "Empty public xml for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
170
- ng_doc
171
- end
172
-
173
- # the contentMetadata for this DOR object, ultimately from the purl public xml
174
- # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
175
- # a Nokogiri::XML::Document containing the public_xml for an object
176
- # @return [Nokogiri::XML::Document] the contentMetadata for the DOR object
177
- def content_metadata object
178
- start_time=Time.now
179
- ng_doc = harvestdor_client.content_metadata object
180
- logger.info("Fetched content_metadata in #{elapsed_time(start_time)} seconds")
181
- raise "No contentMetadata for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
182
- ng_doc
183
- end
184
-
185
- # the identityMetadata for this DOR object, ultimately from the purl public xml
186
- # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
187
- # a Nokogiri::XML::Document containing the public_xml for an object
188
- # @return [Nokogiri::XML::Document] the identityMetadata for the DOR object
189
- def identity_metadata object
190
- start_time=Time.now
191
- ng_doc = harvestdor_client.identity_metadata object
192
- logger.info("Fetched identity_metadata in #{elapsed_time(start_time)} seconds")
193
- raise "No identityMetadata for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
194
- ng_doc
195
- end
196
-
197
- # the rightsMetadata for this DOR object, ultimately from the purl public xml
198
- # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
199
- # a Nokogiri::XML::Document containing the public_xml for an object
200
- # @return [Nokogiri::XML::Document] the rightsMetadata for the DOR object
201
- def rights_metadata object
202
- start_time=Time.now
203
- ng_doc = harvestdor_client.rights_metadata object
204
- logger.info("Fetched rights_metadata in #{elapsed_time(start_time)} seconds")
205
- raise "No rightsMetadata for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
206
- ng_doc
207
- end
208
-
209
- # the RDF for this DOR object, ultimately from the purl public xml
210
- # @param [Object] object a String containing a druid (e.g. ab123cd4567), or
211
- # a Nokogiri::XML::Document containing the public_xml for an object
212
- # @return [Nokogiri::XML::Document] the RDF for the DOR object
213
- def rdf object
214
- start_time=Time.now
215
- ng_doc = harvestdor_client.rdf object
216
- logger.info("Fetched rdf in #{elapsed_time(start_time)} seconds")
217
- raise "No RDF for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
218
- ng_doc
219
- end
220
-
221
- def solr_client
222
- @solr_client ||= RSolr.connect(config.solr.to_hash)
223
- end
224
-
225
- # @return an Array of druids ('oo000oo0000') that should NOT be processed
226
- def blacklist
227
- # avoid trying to load the file multiple times
228
- if !@blacklist && !@loaded_blacklist
229
- @blacklist = load_blacklist(config.blacklist) if config.blacklist
230
- end
231
- @blacklist ||= []
232
- end
233
-
234
- # @return an Array of druids ('oo000oo0000') that should be processed
235
- def whitelist
236
- # avoid trying to load the file multiple times
237
- if !@whitelist && !@loaded_whitelist
238
- @whitelist = load_whitelist(config.whitelist) if config.whitelist
239
- end
240
- @whitelist ||= []
241
- end
242
-
243
- # Get only the druid from the end of the default_set string
244
- # from the yml file
245
- def strip_default_set_string()
246
- Indexer.config.default_set.split('_').last
247
- end
248
-
249
- protected #---------------------------------------------------------------------
250
-
251
- def harvestdor_client
252
- @harvestdor_client ||= Harvestdor::Client.new({:config_yml_path => @yml_path})
253
- end
254
-
255
- def elapsed_time(start_time,units=:seconds)
256
- elapsed_seconds=Time.now-start_time
257
- case units
258
- when :seconds
259
- return elapsed_seconds.round(2)
260
- when :minutes
261
- return (elapsed_seconds/60.0).round(1)
262
- when :hours
263
- return (elapsed_seconds/3600.0).round(2)
264
- else
265
- return elapsed_seconds
266
- end
267
- end
268
-
269
- # populate @blacklist as an Array of druids ('oo000oo0000') that will NOT be processed
270
- # by reading the File at the indicated path
271
- # @param [String] path - path of file containing a list of druids
272
- def load_blacklist path
273
- if path && !@loaded_blacklist
274
- @loaded_blacklist = true
275
- @blacklist = load_id_list path
276
- end
277
- end
278
-
279
- # populate @whitelist as an Array of druids ('oo000oo0000') that WILL be processed
280
- # (unless a druid is also on the blacklist)
281
- # by reading the File at the indicated path
282
- # @param [String] path - path of file containing a list of druids
283
- def load_whitelist path
284
- if path && !@loaded_whitelist
285
- @loaded_whitelist = true
286
- @whitelist = load_id_list path
287
- end
288
- end
289
-
290
- # return an Array of druids ('oo000oo0000')
291
- # populated by reading the File at the indicated path
292
- # @param [String] path - path of file containing a list of druids
293
- # @return [Array<String>] an Array of druids
294
- def load_id_list path
295
- if path
296
- list = []
297
- f = File.open(path).each_line { |line|
298
- list << line.gsub(/\s+/, '') if !line.gsub(/\s+/, '').empty? && !line.strip.start_with?('#')
299
- }
300
- list
301
- end
302
- rescue
303
- msg = "Unable to find list of druids at " + path
304
- logger.fatal msg
305
- raise msg
306
- end
307
-
308
- # Global, memoized, lazy initialized instance of a logger
309
- # @param [String] log_dir directory in which the log file is created
310
- # @param [String] log_name name of log file
311
- def load_logger(log_dir, log_name)
312
- Dir.mkdir(log_dir) unless File.directory?(log_dir)
313
- @logger ||= Logger.new(File.join(log_dir, log_name), 'daily')
314
- end
315
-
316
- end # Indexer class
317
- end # Harvestdor module
1
+ require 'harvestdor/indexer'
@@ -0,0 +1,159 @@
1
+
2
+ # external gems
3
+ require 'confstruct'
4
+ require 'rsolr'
5
+ require 'retries'
6
+ require 'parallel'
7
+ require 'json'
8
+
9
+ # sul-dlss gems
10
+ require 'harvestdor'
11
+ require 'stanford-mods'
12
+ require 'dor-fetcher'
13
+
14
+ # stdlib
15
+ require 'logger'
16
+
17
+ require "harvestdor/indexer/version"
18
+
19
+ require 'active_support/benchmarkable'
20
+ module Harvestdor
21
+ # Base class to harvest from DOR via harvestdor gem and then index
22
+ class Indexer
23
+ require "harvestdor/indexer/metrics"
24
+ require "harvestdor/indexer/resource"
25
+ require "harvestdor/indexer/solr"
26
+
27
+ include ActiveSupport::Benchmarkable
28
+
29
+ attr_accessor :metrics, :logger
30
+
31
+ def initialize options = {}
32
+ config.configure(options)
33
+ yield(config) if block_given?
34
+ @metrics = Harvestdor::Indexer::Metrics.new logger: logger
35
+ end
36
+
37
+ def config
38
+ @config ||= Confstruct::Configuration.new
39
+ end
40
+
41
+ def logger
42
+ @logger ||= begin
43
+ if config.harvestdor
44
+ Dir.mkdir(config.harvestdor.log_dir) unless File.directory?(config.harvestdor.log_dir)
45
+ Logger.new(File.join(config.harvestdor.log_dir, config.harvestdor.log_name), 'daily')
46
+ else
47
+ Logger.new STDERR
48
+ end
49
+ end
50
+ end
51
+
52
+ # per this Indexer's config options
53
+ # harvest the druids via DorFetcher
54
+ # create a Solr profiling document for each druid
55
+ # write the result to the Solr index
56
+ def harvest_and_index each_options = {in_threads: 4}
57
+ benchmark "Harvest and Indexing" do
58
+ each_resource(each_options) do |resource|
59
+ index resource
60
+ end
61
+
62
+ solr.commit!
63
+ end
64
+ end
65
+
66
+ def resources
67
+ druids.map do |x|
68
+ Harvestdor::Indexer::Resource.new(self, x)
69
+ end.map do |x|
70
+ [x, (x.items if x.collection?)]
71
+ end.flatten.uniq.compact
72
+ end
73
+
74
+ def each_resource options = {}, &block
75
+ benchmark "" do
76
+ Parallel.each(resources, options) do |resource|
77
+ metrics.tally on_error: method(:resource_error) do
78
+ yield resource
79
+ end
80
+ end
81
+ end
82
+
83
+ logger.info("Successful count: #{metrics.success_count}")
84
+ logger.info("Error count: #{metrics.error_count}")
85
+ logger.info("Total records processed: #{metrics.total}")
86
+ end
87
+
88
+ def resource_error e
89
+ if e.instance_of? Parallel::Break or e.instance_of? Parallel::Kill
90
+ raise e
91
+ end
92
+ end
93
+
94
+ # return the Array of druids to be processed; defaults to the whitelist
95
+ # @return [Array<String>] or enumeration over it, if block is given. (strings are druids, e.g. ab123cd1234)
96
+ def druids
97
+ @druids ||= whitelist
98
+ end
99
+
100
+ # create Solr doc for the druid and add it to Solr
101
+ # NOTE: don't forget to send commit to Solr, either once at end (already in harvest_and_index), or for each add, or ...
102
+ def index resource
103
+
104
+ benchmark "Indexing #{resource.druid}" do
105
+ logger.debug "About to index #{resource.druid}"
106
+ doc_hash = {}
107
+ doc_hash[:id] = resource.druid
108
+
109
+ # you might add things from Indexer level class here
110
+ # (e.g. things that are the same across all documents in the harvest)
111
+ solr.add doc_hash
112
+ # TODO: provide call to code to update DOR object's workflow datastream??
113
+ end
114
+ end
115
+
116
+ # @return an Array of druids ('oo000oo0000') that should be processed
117
+ def whitelist
118
+ @whitelist ||= config.whitelist if config.whitelist.is_a? Array
119
+ @whitelist ||= load_whitelist(config.whitelist) if config.whitelist
120
+ @whitelist ||= []
121
+ end
122
+
123
+ def harvestdor_client
124
+ @harvestdor_client ||= Harvestdor::Client.new(config.harvestdor)
125
+ end
126
+
127
+ def dor_fetcher_client
128
+ @dor_fetcher_client ||= DorFetcher::Client.new(config.dor_fetcher)
129
+ end
130
+
131
+ def solr
132
+ @solr ||= Harvestdor::Indexer::Solr.new self, config.solr.to_hash
133
+ end
134
+
135
+ protected #---------------------------------------------------------------------
136
+
137
+ # populate @whitelist as an Array of druids ('oo000oo0000') that WILL be processed
138
+ # by reading the File at the indicated path
139
+ # @param [String] path - path of file containing a list of druids
140
+ def load_whitelist path
141
+ @whitelist = load_id_list path
142
+ end
143
+
144
+ # return an Array of druids ('oo000oo0000')
145
+ # populated by reading the File at the indicated path
146
+ # @param [String] path - path of file containing a list of druids
147
+ # @return [Array<String>] an Array of druids
148
+ def load_id_list path
149
+ list = File.open(path).each_line
150
+ .map { |line| line.strip }
151
+ .reject { |line| line.strip.start_with?('#') }
152
+ .reject { |line| line.empty? }
153
+ rescue
154
+ msg = "Unable to find list of druids at " + path
155
+ logger.fatal msg
156
+ raise msg
157
+ end
158
+ end # Indexer class
159
+ end # Harvestdor module