harvestdor-indexer 1.0.4 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +4 -2
- data/Gemfile +1 -1
- data/README.rdoc +1 -0
- data/harvestdor-indexer.gemspec +4 -2
- data/lib/harvestdor-indexer.rb +1 -317
- data/lib/harvestdor/indexer.rb +159 -0
- data/lib/harvestdor/indexer/metrics.rb +53 -0
- data/lib/harvestdor/indexer/resource.rb +174 -0
- data/lib/harvestdor/indexer/solr.rb +39 -0
- data/lib/{harvestdor-indexer → harvestdor/indexer}/version.rb +1 -1
- data/spec/config/ap.yml +32 -44
- data/spec/fixtures/vcr_cassettes/get_collection_druids_call.yml +96 -0
- data/spec/fixtures/vcr_cassettes/process_druids_whitelist_call.yml +1494 -16
- data/spec/fixtures/vcr_cassettes/single_rsolr_connection_call.yml +80 -27
- data/spec/spec_helper.rb +1 -1
- data/spec/unit/harvestdor-indexer-resource_spec.rb +174 -0
- data/spec/unit/harvestdor-indexer-solr_spec.rb +32 -0
- data/spec/unit/harvestdor-indexer_spec.rb +47 -291
- data/spec/unit/harvestdor/indexer/metrics_spec.rb +46 -0
- metadata +45 -10
- data/config/dor-fetcher-client.yml +0 -4
- data/spec/config/ap_blacklist.txt +0 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 60218ff6d0f0210900b1b6a0df4d3a09a7122c34
|
4
|
+
data.tar.gz: 04ba680a7f7864dca78cbc9751c8eb9f6f9c811b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2df842d3b19d9750f7a5e88f9af9dab60cfffa34cc80d7584b5b16f825aff64478e4c1ba3dde05de5749a9b8c17cbabd29fb55b4c71c965253b32d446686013b
|
7
|
+
data.tar.gz: 3c479f7be2c27ad39acad3316b77987dce8c487a654f47758513a4c4d094973ed7b0168f85307a5135de212d7469478e4e475285bd490250058b03869fea6619
|
data/.travis.yml
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
language: ruby
|
2
2
|
script: rake rspec
|
3
3
|
rvm:
|
4
|
-
- 2.
|
4
|
+
- 2.2.0
|
5
|
+
- 2.1.5
|
5
6
|
- 2.0.0
|
6
7
|
- 1.9.3
|
7
|
-
- jruby-19mode # JRuby in 1.9 mode
|
8
|
+
- jruby-1.7.9-d19 # fails after 1.7.10 for some reason
|
9
|
+
# - jruby-19mode # JRuby in 1.9 mode
|
8
10
|
notifications:
|
9
11
|
email:
|
10
12
|
- ndushay@stanford.edu
|
data/Gemfile
CHANGED
data/README.rdoc
CHANGED
@@ -118,6 +118,7 @@ I suggest you run your code on harvestdor-dev, as it is already set up to be abl
|
|
118
118
|
|
119
119
|
== Releases
|
120
120
|
|
121
|
+
* <b>2.0.0</b> Complete refactor to update APIs, merge configuration yml files, update to rspec 3
|
121
122
|
* <b>1.0.4</b> Set skip_heartbeat to true in the initialization of the DorFetcher::Client for ease of testing
|
122
123
|
* <b>1.0.3</b> Implemented class level config so anything that inherits from Harvestdor::Indexer can share configuration settings
|
123
124
|
* <b>1.0.0</b> Replaced OAI harvesting mechanism with dor-fetcher
|
data/harvestdor-indexer.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
lib = File.expand_path('../lib', __FILE__)
|
3
3
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
-
require 'harvestdor-indexer/version'
|
4
|
+
require 'harvestdor/indexer/version'
|
5
5
|
|
6
6
|
Gem::Specification.new do |gem|
|
7
7
|
gem.name = "harvestdor-indexer"
|
@@ -22,6 +22,8 @@ Gem::Specification.new do |gem|
|
|
22
22
|
gem.add_dependency 'harvestdor', '>=0.0.14'
|
23
23
|
gem.add_dependency 'stanford-mods'
|
24
24
|
gem.add_dependency 'dor-fetcher', '=1.0.5'
|
25
|
+
gem.add_dependency "activesupport"
|
26
|
+
gem.add_dependency "parallel"
|
25
27
|
|
26
28
|
# Runtime dependencies
|
27
29
|
gem.add_runtime_dependency 'confstruct'
|
@@ -34,7 +36,7 @@ Gem::Specification.new do |gem|
|
|
34
36
|
gem.add_development_dependency "rdoc"
|
35
37
|
gem.add_development_dependency "yard"
|
36
38
|
# tests
|
37
|
-
gem.add_development_dependency 'rspec'
|
39
|
+
gem.add_development_dependency 'rspec', "~> 3.0"
|
38
40
|
gem.add_development_dependency 'coveralls'
|
39
41
|
# gem.add_development_dependency 'ruby-debug19'
|
40
42
|
gem.add_development_dependency 'vcr'
|
data/lib/harvestdor-indexer.rb
CHANGED
@@ -1,317 +1 @@
|
|
1
|
-
|
2
|
-
require 'confstruct'
|
3
|
-
require 'rsolr'
|
4
|
-
require 'retries'
|
5
|
-
require 'json'
|
6
|
-
|
7
|
-
# sul-dlss gems
|
8
|
-
require 'harvestdor'
|
9
|
-
require 'stanford-mods'
|
10
|
-
require 'dor-fetcher'
|
11
|
-
|
12
|
-
# stdlib
|
13
|
-
require 'logger'
|
14
|
-
|
15
|
-
require "harvestdor-indexer/version"
|
16
|
-
|
17
|
-
module Harvestdor
|
18
|
-
# Base class to harvest from DOR via harvestdor gem and then index
|
19
|
-
class Indexer
|
20
|
-
|
21
|
-
attr_accessor :error_count, :success_count, :max_retries
|
22
|
-
attr_accessor :total_time_to_parse,:total_time_to_solr
|
23
|
-
attr_accessor :dor_fetcher_client, :client_config
|
24
|
-
|
25
|
-
|
26
|
-
# Class level config variable
|
27
|
-
@@config ||= Confstruct::Configuration.new()
|
28
|
-
|
29
|
-
def initialize yml_path, client_config_path, options = {}
|
30
|
-
@success_count=0 # the number of objects successfully indexed
|
31
|
-
@error_count=0 # the number of objects that failed
|
32
|
-
@max_retries=10 # the number of times to retry an object
|
33
|
-
@total_time_to_solr=0
|
34
|
-
@total_time_to_parse=0
|
35
|
-
@yml_path = yml_path
|
36
|
-
config.configure(YAML.load_file(yml_path)) if yml_path
|
37
|
-
config.configure options
|
38
|
-
yield(config) if block_given?
|
39
|
-
@client_config = YAML.load_file(client_config_path) if client_config_path && File.exists?(client_config_path)
|
40
|
-
# Adding skip_heartbeat param for easier testing
|
41
|
-
@dor_fetcher_client=DorFetcher::Client.new({:service_url => client_config["dor_fetcher_service_url"], :skip_heartbeat => true})
|
42
|
-
end
|
43
|
-
|
44
|
-
# to allow class level access to config variables for record_merger and solr_doc_builder
|
45
|
-
# (rather than passing a lot of params to constructor)
|
46
|
-
def self.config
|
47
|
-
@@config ||= Confstruct::Configuration.new()
|
48
|
-
end
|
49
|
-
|
50
|
-
def config
|
51
|
-
Indexer.config
|
52
|
-
end
|
53
|
-
|
54
|
-
def logger
|
55
|
-
@logger ||= load_logger(config.log_dir, config.log_name)
|
56
|
-
end
|
57
|
-
|
58
|
-
# per this Indexer's config options
|
59
|
-
# harvest the druids via DorFetcher
|
60
|
-
# create a Solr profiling document for each druid
|
61
|
-
# write the result to the Solr index
|
62
|
-
def harvest_and_index
|
63
|
-
start_time=Time.now
|
64
|
-
logger.info("Started harvest_and_index at #{start_time}")
|
65
|
-
if whitelist.empty?
|
66
|
-
druids.each { |druid| index druid }
|
67
|
-
else
|
68
|
-
whitelist.each { |druid| index druid }
|
69
|
-
end
|
70
|
-
solr_client.commit
|
71
|
-
total_time=elapsed_time(start_time)
|
72
|
-
total_objects=@success_count+@error_count
|
73
|
-
logger.info("Finished harvest_and_index at #{Time.now}: final Solr commit returned")
|
74
|
-
logger.info("Total elapsed time for harvest and index: #{(total_time/60.0).round(2)} minutes")
|
75
|
-
logger.info("Avg solr commit time per object (successful): #{(@total_time_to_solr/@success_count).round(2)} seconds") unless (@total_time_to_solr == 0 || @success_count == 0)
|
76
|
-
logger.info("Avg solr commit time per object (all): #{(@total_time_to_solr/total_objects).round(2)} seconds") unless (@total_time_to_solr == 0 || @error_count == 0 || total_objects == 0)
|
77
|
-
logger.info("Avg parse time per object (successful): #{(@total_time_to_parse/@success_count).round(2)} seconds") unless (@total_time_to_parse == 0 || @success_count == 0)
|
78
|
-
logger.info("Avg parse time per object (all): #{(@total_time_to_parse/total_objects).round(2)} seconds") unless (@total_time_to_parse == 0 || @error_count == 0 || total_objects == 0)
|
79
|
-
logger.info("Avg complete index time per object (successful): #{(total_time/@success_count).round(2)} seconds") unless (@success_count == 0)
|
80
|
-
logger.info("Avg complete index time per object (all): #{(total_time/total_objects).round(2)} seconds") unless (@error_count == 0 || total_objects == 0)
|
81
|
-
logger.info("Successful count: #{@success_count}")
|
82
|
-
logger.info("Error count: #{@error_count}")
|
83
|
-
logger.info("Total records processed: #{total_objects}")
|
84
|
-
end
|
85
|
-
|
86
|
-
# return Array of druids contained in the DorFetcher pulling indicated by DorFetcher params
|
87
|
-
# @return [Array<String>] or enumeration over it, if block is given. (strings are druids, e.g. ab123cd1234)
|
88
|
-
def druids
|
89
|
-
if @druids.nil?
|
90
|
-
start_time=Time.now
|
91
|
-
logger.info("Starting DorFetcher pulling of druids at #{start_time}.")
|
92
|
-
@druids = @dor_fetcher_client.druid_array(@dor_fetcher_client.get_collection(strip_default_set_string(), {}))
|
93
|
-
logger.info("Completed DorFetcher pulling of druids at #{Time.now}. Found #{@druids.size} druids. Total elapsed time for DorFetcher pulling = #{elapsed_time(start_time,:minutes)} minutes")
|
94
|
-
end
|
95
|
-
return @druids
|
96
|
-
end
|
97
|
-
|
98
|
-
# Add the document to solr, retry if an error occurs.
|
99
|
-
# See https://github.com/ooyala/retries for docs on with_retries.
|
100
|
-
# @param [Hash] doc a Hash representation of the solr document
|
101
|
-
# @param [String] id the id of the document being sent, for logging
|
102
|
-
def solr_add(doc, id)
|
103
|
-
max_tries=@max_retries ? @max_retries : 10 #if @max_retries isn't set, use 10
|
104
|
-
|
105
|
-
handler = Proc.new do |exception, attempt_number, total_delay|
|
106
|
-
logger.debug "#{exception.class} on attempt #{attempt_number} for #{id}"
|
107
|
-
# logger.debug exception.backtrace
|
108
|
-
end
|
109
|
-
|
110
|
-
with_retries(:max_tries => max_tries, :handler => handler, :base_sleep_seconds => 1, :max_sleep_seconds => 5) do |attempt|
|
111
|
-
logger.debug "Attempt #{attempt} for #{id}"
|
112
|
-
solr_client.add(doc)
|
113
|
-
logger.info "Successfully indexed #{id} on attempt #{attempt}"
|
114
|
-
end
|
115
|
-
end
|
116
|
-
|
117
|
-
# create Solr doc for the druid and add it to Solr, unless it is on the blacklist.
|
118
|
-
# NOTE: don't forget to send commit to Solr, either once at end (already in harvest_and_index), or for each add, or ...
|
119
|
-
def index druid
|
120
|
-
if blacklist.include?(druid)
|
121
|
-
logger.info("Druid #{druid} is on the blacklist and will have no Solr doc created")
|
122
|
-
else
|
123
|
-
logger.fatal("You must override the index method to transform druids into Solr docs and add them to Solr")
|
124
|
-
|
125
|
-
begin
|
126
|
-
start_time=Time.now
|
127
|
-
logger.info("About to index #{druid} at #{start_time}")
|
128
|
-
#logger.debug "About to index #{druid}"
|
129
|
-
doc_hash = {}
|
130
|
-
doc_hash[:id] = druid
|
131
|
-
# doc_hash[:title_tsim] = smods_rec(druid).short_title
|
132
|
-
|
133
|
-
# you might add things from Indexer level class here
|
134
|
-
# (e.g. things that are the same across all documents in the harvest)
|
135
|
-
|
136
|
-
solr_client.add(doc_hash)
|
137
|
-
|
138
|
-
logger.info("Indexed #{druid} in #{elapsed_time(start_time)} seconds")
|
139
|
-
@success_count+=1
|
140
|
-
# TODO: provide call to code to update DOR object's workflow datastream??
|
141
|
-
rescue => e
|
142
|
-
@error_count+=1
|
143
|
-
logger.error "Failed to index #{druid} in #{elapsed_time(start_time)} seconds: #{e.message}"
|
144
|
-
end
|
145
|
-
end
|
146
|
-
end
|
147
|
-
|
148
|
-
# return the MODS for the druid as a Stanford::Mods::Record object
|
149
|
-
# @param [String] druid e.g. ab123cd4567
|
150
|
-
# @return [Stanford::Mods::Record] created from the MODS xml for the druid
|
151
|
-
def smods_rec druid
|
152
|
-
start_time=Time.now
|
153
|
-
ng_doc = harvestdor_client.mods druid
|
154
|
-
logger.info("Fetched MODs for #{druid} in #{elapsed_time(start_time)} seconds")
|
155
|
-
raise "Empty MODS metadata for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
|
156
|
-
mods_rec = Stanford::Mods::Record.new
|
157
|
-
mods_rec.from_nk_node(ng_doc.root)
|
158
|
-
mods_rec
|
159
|
-
end
|
160
|
-
|
161
|
-
# the public xml for this DOR object, from the purl page
|
162
|
-
# @param [String] druid e.g. ab123cd4567
|
163
|
-
# @return [Nokogiri::XML::Document] the public xml for the DOR object
|
164
|
-
def public_xml druid
|
165
|
-
start_time=Time.now
|
166
|
-
ng_doc = harvestdor_client.public_xml druid
|
167
|
-
logger.info("Fetched public_xml for #{druid} in #{elapsed_time(start_time)} seconds")
|
168
|
-
raise "No public xml for #{druid}" if !ng_doc
|
169
|
-
raise "Empty public xml for #{druid}: #{ng_doc.to_xml}" if ng_doc.root.xpath('//text()').empty?
|
170
|
-
ng_doc
|
171
|
-
end
|
172
|
-
|
173
|
-
# the contentMetadata for this DOR object, ultimately from the purl public xml
|
174
|
-
# @param [Object] object a String containing a druid (e.g. ab123cd4567), or
|
175
|
-
# a Nokogiri::XML::Document containing the public_xml for an object
|
176
|
-
# @return [Nokogiri::XML::Document] the contentMetadata for the DOR object
|
177
|
-
def content_metadata object
|
178
|
-
start_time=Time.now
|
179
|
-
ng_doc = harvestdor_client.content_metadata object
|
180
|
-
logger.info("Fetched content_metadata in #{elapsed_time(start_time)} seconds")
|
181
|
-
raise "No contentMetadata for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
|
182
|
-
ng_doc
|
183
|
-
end
|
184
|
-
|
185
|
-
# the identityMetadata for this DOR object, ultimately from the purl public xml
|
186
|
-
# @param [Object] object a String containing a druid (e.g. ab123cd4567), or
|
187
|
-
# a Nokogiri::XML::Document containing the public_xml for an object
|
188
|
-
# @return [Nokogiri::XML::Document] the identityMetadata for the DOR object
|
189
|
-
def identity_metadata object
|
190
|
-
start_time=Time.now
|
191
|
-
ng_doc = harvestdor_client.identity_metadata object
|
192
|
-
logger.info("Fetched identity_metadata in #{elapsed_time(start_time)} seconds")
|
193
|
-
raise "No identityMetadata for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
|
194
|
-
ng_doc
|
195
|
-
end
|
196
|
-
|
197
|
-
# the rightsMetadata for this DOR object, ultimately from the purl public xml
|
198
|
-
# @param [Object] object a String containing a druid (e.g. ab123cd4567), or
|
199
|
-
# a Nokogiri::XML::Document containing the public_xml for an object
|
200
|
-
# @return [Nokogiri::XML::Document] the rightsMetadata for the DOR object
|
201
|
-
def rights_metadata object
|
202
|
-
start_time=Time.now
|
203
|
-
ng_doc = harvestdor_client.rights_metadata object
|
204
|
-
logger.info("Fetched rights_metadata in #{elapsed_time(start_time)} seconds")
|
205
|
-
raise "No rightsMetadata for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
|
206
|
-
ng_doc
|
207
|
-
end
|
208
|
-
|
209
|
-
# the RDF for this DOR object, ultimately from the purl public xml
|
210
|
-
# @param [Object] object a String containing a druid (e.g. ab123cd4567), or
|
211
|
-
# a Nokogiri::XML::Document containing the public_xml for an object
|
212
|
-
# @return [Nokogiri::XML::Document] the RDF for the DOR object
|
213
|
-
def rdf object
|
214
|
-
start_time=Time.now
|
215
|
-
ng_doc = harvestdor_client.rdf object
|
216
|
-
logger.info("Fetched rdf in #{elapsed_time(start_time)} seconds")
|
217
|
-
raise "No RDF for #{object.inspect}" if !ng_doc || ng_doc.children.empty?
|
218
|
-
ng_doc
|
219
|
-
end
|
220
|
-
|
221
|
-
def solr_client
|
222
|
-
@solr_client ||= RSolr.connect(config.solr.to_hash)
|
223
|
-
end
|
224
|
-
|
225
|
-
# @return an Array of druids ('oo000oo0000') that should NOT be processed
|
226
|
-
def blacklist
|
227
|
-
# avoid trying to load the file multiple times
|
228
|
-
if !@blacklist && !@loaded_blacklist
|
229
|
-
@blacklist = load_blacklist(config.blacklist) if config.blacklist
|
230
|
-
end
|
231
|
-
@blacklist ||= []
|
232
|
-
end
|
233
|
-
|
234
|
-
# @return an Array of druids ('oo000oo0000') that should be processed
|
235
|
-
def whitelist
|
236
|
-
# avoid trying to load the file multiple times
|
237
|
-
if !@whitelist && !@loaded_whitelist
|
238
|
-
@whitelist = load_whitelist(config.whitelist) if config.whitelist
|
239
|
-
end
|
240
|
-
@whitelist ||= []
|
241
|
-
end
|
242
|
-
|
243
|
-
# Get only the druid from the end of the default_set string
|
244
|
-
# from the yml file
|
245
|
-
def strip_default_set_string()
|
246
|
-
Indexer.config.default_set.split('_').last
|
247
|
-
end
|
248
|
-
|
249
|
-
protected #---------------------------------------------------------------------
|
250
|
-
|
251
|
-
def harvestdor_client
|
252
|
-
@harvestdor_client ||= Harvestdor::Client.new({:config_yml_path => @yml_path})
|
253
|
-
end
|
254
|
-
|
255
|
-
def elapsed_time(start_time,units=:seconds)
|
256
|
-
elapsed_seconds=Time.now-start_time
|
257
|
-
case units
|
258
|
-
when :seconds
|
259
|
-
return elapsed_seconds.round(2)
|
260
|
-
when :minutes
|
261
|
-
return (elapsed_seconds/60.0).round(1)
|
262
|
-
when :hours
|
263
|
-
return (elapsed_seconds/3600.0).round(2)
|
264
|
-
else
|
265
|
-
return elapsed_seconds
|
266
|
-
end
|
267
|
-
end
|
268
|
-
|
269
|
-
# populate @blacklist as an Array of druids ('oo000oo0000') that will NOT be processed
|
270
|
-
# by reading the File at the indicated path
|
271
|
-
# @param [String] path - path of file containing a list of druids
|
272
|
-
def load_blacklist path
|
273
|
-
if path && !@loaded_blacklist
|
274
|
-
@loaded_blacklist = true
|
275
|
-
@blacklist = load_id_list path
|
276
|
-
end
|
277
|
-
end
|
278
|
-
|
279
|
-
# populate @blacklist as an Array of druids ('oo000oo0000') that WILL be processed
|
280
|
-
# (unless a druid is also on the blacklist)
|
281
|
-
# by reading the File at the indicated path
|
282
|
-
# @param [String] path - path of file containing a list of druids
|
283
|
-
def load_whitelist path
|
284
|
-
if path && !@loaded_whitelist
|
285
|
-
@loaded_whitelist = true
|
286
|
-
@whitelist = load_id_list path
|
287
|
-
end
|
288
|
-
end
|
289
|
-
|
290
|
-
# return an Array of druids ('oo000oo0000')
|
291
|
-
# populated by reading the File at the indicated path
|
292
|
-
# @param [String] path - path of file containing a list of druids
|
293
|
-
# @return [Array<String>] an Array of druids
|
294
|
-
def load_id_list path
|
295
|
-
if path
|
296
|
-
list = []
|
297
|
-
f = File.open(path).each_line { |line|
|
298
|
-
list << line.gsub(/\s+/, '') if !line.gsub(/\s+/, '').empty? && !line.strip.start_with?('#')
|
299
|
-
}
|
300
|
-
list
|
301
|
-
end
|
302
|
-
rescue
|
303
|
-
msg = "Unable to find list of druids at " + path
|
304
|
-
logger.fatal msg
|
305
|
-
raise msg
|
306
|
-
end
|
307
|
-
|
308
|
-
# Global, memoized, lazy initialized instance of a logger
|
309
|
-
# @param [String] log_dir directory for to get log file
|
310
|
-
# @param [String] log_name name of log file
|
311
|
-
def load_logger(log_dir, log_name)
|
312
|
-
Dir.mkdir(log_dir) unless File.directory?(log_dir)
|
313
|
-
@logger ||= Logger.new(File.join(log_dir, log_name), 'daily')
|
314
|
-
end
|
315
|
-
|
316
|
-
end # Indexer class
|
317
|
-
end # Harvestdor module
|
1
|
+
require 'harvestdor/indexer'
|
@@ -0,0 +1,159 @@
|
|
1
|
+
|
2
|
+
# external gems
|
3
|
+
require 'confstruct'
|
4
|
+
require 'rsolr'
|
5
|
+
require 'retries'
|
6
|
+
require 'parallel'
|
7
|
+
require 'json'
|
8
|
+
|
9
|
+
# sul-dlss gems
|
10
|
+
require 'harvestdor'
|
11
|
+
require 'stanford-mods'
|
12
|
+
require 'dor-fetcher'
|
13
|
+
|
14
|
+
# stdlib
|
15
|
+
require 'logger'
|
16
|
+
|
17
|
+
require "harvestdor/indexer/version"
|
18
|
+
|
19
|
+
require 'active_support/benchmarkable'
|
20
|
+
module Harvestdor
|
21
|
+
# Base class to harvest from DOR via harvestdor gem and then index
|
22
|
+
class Indexer
|
23
|
+
require "harvestdor/indexer/metrics"
|
24
|
+
require "harvestdor/indexer/resource"
|
25
|
+
require "harvestdor/indexer/solr"
|
26
|
+
|
27
|
+
include ActiveSupport::Benchmarkable
|
28
|
+
|
29
|
+
attr_accessor :metrics, :logger
|
30
|
+
|
31
|
+
def initialize options = {}
|
32
|
+
config.configure(options)
|
33
|
+
yield(config) if block_given?
|
34
|
+
@metrics = Harvestdor::Indexer::Metrics.new logger: logger
|
35
|
+
end
|
36
|
+
|
37
|
+
def config
|
38
|
+
@config ||= Confstruct::Configuration.new
|
39
|
+
end
|
40
|
+
|
41
|
+
def logger
|
42
|
+
@logger ||= begin
|
43
|
+
if config.harvestdor
|
44
|
+
Dir.mkdir(config.harvestdor.log_dir) unless File.directory?(config.harvestdor.log_dir)
|
45
|
+
Logger.new(File.join(config.harvestdor.log_dir, config.harvestdor.log_name), 'daily')
|
46
|
+
else
|
47
|
+
Logger.new STDERR
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
# per this Indexer's config options
|
53
|
+
# harvest the druids via DorFetcher
|
54
|
+
# create a Solr profiling document for each druid
|
55
|
+
# write the result to the Solr index
|
56
|
+
def harvest_and_index each_options = {in_threads: 4}
|
57
|
+
benchmark "Harvest and Indexing" do
|
58
|
+
each_resource(each_options) do |resource|
|
59
|
+
index resource
|
60
|
+
end
|
61
|
+
|
62
|
+
solr.commit!
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
def resources
|
67
|
+
druids.map do |x|
|
68
|
+
Harvestdor::Indexer::Resource.new(self, x)
|
69
|
+
end.map do |x|
|
70
|
+
[x, (x.items if x.collection?)]
|
71
|
+
end.flatten.uniq.compact
|
72
|
+
end
|
73
|
+
|
74
|
+
def each_resource options = {}, &block
|
75
|
+
benchmark "" do
|
76
|
+
Parallel.each(resources, options) do |resource|
|
77
|
+
metrics.tally on_error: method(:resource_error) do
|
78
|
+
yield resource
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
logger.info("Successful count: #{metrics.success_count}")
|
84
|
+
logger.info("Error count: #{metrics.error_count}")
|
85
|
+
logger.info("Total records processed: #{metrics.total}")
|
86
|
+
end
|
87
|
+
|
88
|
+
def resource_error e
|
89
|
+
if e.instance_of? Parallel::Break or e.instance_of? Parallel::Kill
|
90
|
+
raise e
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
# return Array of druids contained in the DorFetcher pulling indicated by DorFetcher params
|
95
|
+
# @return [Array<String>] or enumeration over it, if block is given. (strings are druids, e.g. ab123cd1234)
|
96
|
+
def druids
|
97
|
+
@druids ||= whitelist
|
98
|
+
end
|
99
|
+
|
100
|
+
# create Solr doc for the druid and add it to Solr
|
101
|
+
# NOTE: don't forget to send commit to Solr, either once at end (already in harvest_and_index), or for each add, or ...
|
102
|
+
def index resource
|
103
|
+
|
104
|
+
benchmark "Indexing #{resource.druid}" do
|
105
|
+
logger.debug "About to index #{resource.druid}"
|
106
|
+
doc_hash = {}
|
107
|
+
doc_hash[:id] = resource.druid
|
108
|
+
|
109
|
+
# you might add things from Indexer level class here
|
110
|
+
# (e.g. things that are the same across all documents in the harvest)
|
111
|
+
solr.add doc_hash
|
112
|
+
# TODO: provide call to code to update DOR object's workflow datastream??
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
# @return an Array of druids ('oo000oo0000') that should be processed
|
117
|
+
def whitelist
|
118
|
+
@whitelist ||= config.whitelist if config.whitelist.is_a? Array
|
119
|
+
@whitelist ||= load_whitelist(config.whitelist) if config.whitelist
|
120
|
+
@whitelist ||= []
|
121
|
+
end
|
122
|
+
|
123
|
+
def harvestdor_client
|
124
|
+
@harvestdor_client ||= Harvestdor::Client.new(config.harvestdor)
|
125
|
+
end
|
126
|
+
|
127
|
+
def dor_fetcher_client
|
128
|
+
@dor_fetcher_client ||= DorFetcher::Client.new(config.dor_fetcher)
|
129
|
+
end
|
130
|
+
|
131
|
+
def solr
|
132
|
+
@solr ||= Harvestdor::Indexer::Solr.new self, config.solr.to_hash
|
133
|
+
end
|
134
|
+
|
135
|
+
protected #---------------------------------------------------------------------
|
136
|
+
|
137
|
+
# populate @whitelist as an Array of druids ('oo000oo0000') that WILL be processed
|
138
|
+
# by reading the File at the indicated path
|
139
|
+
# @param [String] path - path of file containing a list of druids
|
140
|
+
def load_whitelist path
|
141
|
+
@whitelist = load_id_list path
|
142
|
+
end
|
143
|
+
|
144
|
+
# return an Array of druids ('oo000oo0000')
|
145
|
+
# populated by reading the File at the indicated path
|
146
|
+
# @param [String] path - path of file containing a list of druids
|
147
|
+
# @return [Array<String>] an Array of druids
|
148
|
+
def load_id_list path
|
149
|
+
list = File.open(path).each_line
|
150
|
+
.map { |line| line.strip }
|
151
|
+
.reject { |line| line.strip.start_with?('#') }
|
152
|
+
.reject { |line| line.empty? }
|
153
|
+
rescue
|
154
|
+
msg = "Unable to find list of druids at " + path
|
155
|
+
logger.fatal msg
|
156
|
+
raise msg
|
157
|
+
end
|
158
|
+
end # Indexer class
|
159
|
+
end # Harvestdor module
|