gdor-indexer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +31 -0
- data/.hound.yml +2 -0
- data/.rubocop.yml +3 -0
- data/.rubocop_todo.yml +131 -0
- data/.yardopts +3 -0
- data/Capfile +26 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +5 -0
- data/README.md +67 -0
- data/Rakefile +57 -0
- data/VERSION +1 -0
- data/bin/indexer +71 -0
- data/config/deploy.rb +31 -0
- data/config/deploy/dev.rb +41 -0
- data/config/deploy/fetcher.rb +6 -0
- data/config/deploy/prod.rb +41 -0
- data/config/deploy/stage.rb +41 -0
- data/gdor-indexer.gemspec +43 -0
- data/lib/gdor/indexer.rb +327 -0
- data/lib/gdor/indexer/mods_fields.rb +114 -0
- data/lib/gdor/indexer/nokogiri_xml_node_mixin.rb +42 -0
- data/lib/gdor/indexer/public_xml_fields.rb +81 -0
- data/lib/gdor/indexer/solr_doc_builder.rb +85 -0
- data/lib/gdor/indexer/solr_doc_hash.rb +112 -0
- data/lib/gdor/indexer/version.rb +5 -0
- data/spec/config/walters_integration_spec.yml +44 -0
- data/spec/spec_helper.rb +26 -0
- data/spec/unit/gdor_mods_fields_spec.rb +812 -0
- data/spec/unit/indexer_spec.rb +411 -0
- data/spec/unit/public_xml_fields_spec.rb +286 -0
- data/spec/unit/solr_doc_builder_spec.rb +128 -0
- data/spec/unit/solr_doc_hash_spec.rb +399 -0
- data/spec/vcr_cassettes/no_coll_druid_in_druid_array_call.yml +745 -0
- metadata +411 -0
@@ -0,0 +1,41 @@
|
|
1
|
+
# Simple Role Syntax
|
2
|
+
# ==================
|
3
|
+
# Supports bulk-adding hosts to roles, the primary
|
4
|
+
# server in each group is considered to be the first
|
5
|
+
# unless any hosts have the primary property set.
|
6
|
+
# Don't declare `role :all`, it's a meta role
|
7
|
+
# role :app, %w{deploy@example.com}
|
8
|
+
# role :web, %w{deploy@example.com}
|
9
|
+
# role :db, %w{deploy@example.com}
|
10
|
+
|
11
|
+
# Extended Server Syntax
|
12
|
+
# ======================
|
13
|
+
# This can be used to drop a more detailed server
|
14
|
+
# definition into the server list. The second argument
|
15
|
+
# something that quacks like a hash can be used to set
|
16
|
+
# extended properties on the server.
|
17
|
+
server 'harvestdor-dev.stanford.edu', user: 'lyberadmin', roles: %w(web app db)
|
18
|
+
|
19
|
+
Capistrano::OneTimeKey.generate_one_time_key!
|
20
|
+
|
21
|
+
# you can set custom ssh options
|
22
|
+
# it's possible to pass any option but you need to keep in mind that net/ssh understand limited list of options
|
23
|
+
# you can see them in [net/ssh documentation](http://net-ssh.github.io/net-ssh/classes/Net/SSH.html#method-c-start)
|
24
|
+
# set it globally
|
25
|
+
# set :ssh_options, {
|
26
|
+
# keys: %w(/home/rlisowski/.ssh/id_rsa),
|
27
|
+
# forward_agent: false,
|
28
|
+
# auth_methods: %w(password)
|
29
|
+
# }
|
30
|
+
# and/or per server
|
31
|
+
# server 'example.com',
|
32
|
+
# user: 'user_name',
|
33
|
+
# roles: %w{web app},
|
34
|
+
# ssh_options: {
|
35
|
+
# user: 'user_name', # overrides user setting above
|
36
|
+
# keys: %w(/home/user_name/.ssh/id_rsa),
|
37
|
+
# forward_agent: false,
|
38
|
+
# auth_methods: %w(publickey password)
|
39
|
+
# password: 'please use keys'
|
40
|
+
# }
|
41
|
+
# setting per server overrides global ssh_options
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# Simple Role Syntax
|
2
|
+
# ==================
|
3
|
+
# Supports bulk-adding hosts to roles, the primary
|
4
|
+
# server in each group is considered to be the first
|
5
|
+
# unless any hosts have the primary property set.
|
6
|
+
# Don't declare `role :all`, it's a meta role
|
7
|
+
# role :app, %w{deploy@example.com}
|
8
|
+
# role :web, %w{deploy@example.com}
|
9
|
+
# role :db, %w{deploy@example.com}
|
10
|
+
|
11
|
+
# Extended Server Syntax
|
12
|
+
# ======================
|
13
|
+
# This can be used to drop a more detailed server
|
14
|
+
# definition into the server list. The second argument
|
15
|
+
# something that quacks like a hash can be used to set
|
16
|
+
# extended properties on the server.
|
17
|
+
server 'harvestdor-prod.stanford.edu', user: 'lyberadmin', roles: %w(web app db)
|
18
|
+
|
19
|
+
Capistrano::OneTimeKey.generate_one_time_key!
|
20
|
+
|
21
|
+
# you can set custom ssh options
|
22
|
+
# it's possible to pass any option but you need to keep in mind that net/ssh understand limited list of options
|
23
|
+
# you can see them in [net/ssh documentation](http://net-ssh.github.io/net-ssh/classes/Net/SSH.html#method-c-start)
|
24
|
+
# set it globally
|
25
|
+
# set :ssh_options, {
|
26
|
+
# keys: %w(/home/rlisowski/.ssh/id_rsa),
|
27
|
+
# forward_agent: false,
|
28
|
+
# auth_methods: %w(password)
|
29
|
+
# }
|
30
|
+
# and/or per server
|
31
|
+
# server 'example.com',
|
32
|
+
# user: 'user_name',
|
33
|
+
# roles: %w{web app},
|
34
|
+
# ssh_options: {
|
35
|
+
# user: 'user_name', # overrides user setting above
|
36
|
+
# keys: %w(/home/user_name/.ssh/id_rsa),
|
37
|
+
# forward_agent: false,
|
38
|
+
# auth_methods: %w(publickey password)
|
39
|
+
# password: 'please use keys'
|
40
|
+
# }
|
41
|
+
# setting per server overrides global ssh_options
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# Simple Role Syntax
|
2
|
+
# ==================
|
3
|
+
# Supports bulk-adding hosts to roles, the primary
|
4
|
+
# server in each group is considered to be the first
|
5
|
+
# unless any hosts have the primary property set.
|
6
|
+
# Don't declare `role :all`, it's a meta role
|
7
|
+
# role :app, %w{deploy@example.com}
|
8
|
+
# role :web, %w{deploy@example.com}
|
9
|
+
# role :db, %w{deploy@example.com}
|
10
|
+
|
11
|
+
# Extended Server Syntax
|
12
|
+
# ======================
|
13
|
+
# This can be used to drop a more detailed server
|
14
|
+
# definition into the server list. The second argument
|
15
|
+
# something that quacks like a hash can be used to set
|
16
|
+
# extended properties on the server.
|
17
|
+
server 'harvestdor-stage.stanford.edu', user: 'lyberadmin', roles: %w(web app db)
|
18
|
+
|
19
|
+
Capistrano::OneTimeKey.generate_one_time_key!
|
20
|
+
|
21
|
+
# you can set custom ssh options
|
22
|
+
# it's possible to pass any option but you need to keep in mind that net/ssh understand limited list of options
|
23
|
+
# you can see them in [net/ssh documentation](http://net-ssh.github.io/net-ssh/classes/Net/SSH.html#method-c-start)
|
24
|
+
# set it globally
|
25
|
+
# set :ssh_options, {
|
26
|
+
# keys: %w(/home/rlisowski/.ssh/id_rsa),
|
27
|
+
# forward_agent: false,
|
28
|
+
# auth_methods: %w(password)
|
29
|
+
# }
|
30
|
+
# and/or per server
|
31
|
+
# server 'example.com',
|
32
|
+
# user: 'user_name',
|
33
|
+
# roles: %w{web app},
|
34
|
+
# ssh_options: {
|
35
|
+
# user: 'user_name', # overrides user setting above
|
36
|
+
# keys: %w(/home/user_name/.ssh/id_rsa),
|
37
|
+
# forward_agent: false,
|
38
|
+
# auth_methods: %w(publickey password)
|
39
|
+
# password: 'please use keys'
|
40
|
+
# }
|
41
|
+
# setting per server overrides global ssh_options
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'gdor/indexer/version'

# Packaging metadata for the gdor-indexer gem.
Gem::Specification.new do |spec|
  spec.name = 'gdor-indexer'
  spec.version = GDor::Indexer::VERSION
  spec.authors = ['Naomi Dushay', 'Laney McGlohon', 'Chris Beer']
  spec.email = ['cabeer@stanford.edu']
  spec.summary = 'Gryphondor Solr indexing logic'
  spec.homepage = 'https://github.com/sul-dlss/gdor-indexer'
  # SPDX identifier; the free-form 'Apache 2' triggers a RubyGems validation warning.
  spec.license = 'Apache-2.0'

  # Package exactly what git tracks; executables and test files are derived from that list.
  spec.files = `git ls-files -z`.split("\x0")
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  # Runtime dependencies.
  # NOTE(review): lib/gdor/indexer.rb also requires 'confstruct' and 'dor-fetcher' directly;
  # presumably they arrive transitively via harvestdor-indexer -- confirm, or declare them here.
  spec.add_dependency 'harvestdor-indexer'
  spec.add_dependency 'stanford-mods'
  spec.add_dependency 'nokogiri'
  spec.add_dependency 'rsolr'
  spec.add_dependency 'activesupport'
  spec.add_dependency 'mail'
  spec.add_dependency 'hooks'
  spec.add_dependency 'trollop'

  # Development / test dependencies.
  spec.add_development_dependency 'bundler', '~> 1.5'
  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'rdoc'
  spec.add_development_dependency 'yard'
  spec.add_development_dependency 'rspec', '~> 3.1'
  spec.add_development_dependency 'rspec-rails'
  spec.add_development_dependency 'rubocop'
  spec.add_development_dependency 'rubocop-rspec'
  spec.add_development_dependency 'simplecov'
  spec.add_development_dependency 'equivalent-xml', '~> 0.5'
  spec.add_development_dependency 'capybara'
  spec.add_development_dependency 'poltergeist', '>= 1.5.0'
  spec.add_development_dependency 'vcr'
  spec.add_development_dependency 'jettywrapper'
  spec.add_development_dependency 'webmock'
end
|
data/lib/gdor/indexer.rb
ADDED
@@ -0,0 +1,327 @@
|
|
1
|
+
# external gems
|
2
|
+
require 'confstruct'
|
3
|
+
require 'harvestdor-indexer'
|
4
|
+
require 'rsolr'
|
5
|
+
require 'mail'
|
6
|
+
require 'dor-fetcher'
|
7
|
+
require 'hooks'
|
8
|
+
require 'active_support/core_ext/array/extract_options'
|
9
|
+
|
10
|
+
# stdlib
|
11
|
+
require 'logger'
|
12
|
+
require 'net/smtp'
|
13
|
+
require 'set'
|
14
|
+
|
15
|
+
# Base class to harvest from DOR via harvestdor gem
|
16
|
+
module GDor
|
17
|
+
class Indexer
|
18
|
+
include Hooks
|
19
|
+
|
20
|
+
define_hooks :before_index, :before_merge
|
21
|
+
|
22
|
+
# local files
|
23
|
+
require 'gdor/indexer/version'
|
24
|
+
require 'gdor/indexer/solr_doc_hash'
|
25
|
+
require 'gdor/indexer/solr_doc_builder'
|
26
|
+
require 'gdor/indexer/nokogiri_xml_node_mixin' if defined? JRUBY_VERSION
|
27
|
+
|
28
|
+
attr_accessor :harvestdor
|
29
|
+
attr_reader :config, :druids_failed_to_ix
|
30
|
+
|
31
|
+
class <<self
|
32
|
+
attr_accessor :config
|
33
|
+
end
|
34
|
+
|
35
|
+
# Initialize with configuration files
|
36
|
+
# @param yml_path [String] path to a YAML configuration file (optional; skipped when nil or when the file does not exist)
|
37
|
+
# @param options [Hash]
|
38
|
+
def initialize(*args)
|
39
|
+
options = args.extract_options!
|
40
|
+
yml_path = args.first
|
41
|
+
|
42
|
+
@success_count = 0
|
43
|
+
@error_count = 0
|
44
|
+
@total_time_to_solr = 0
|
45
|
+
@total_time_to_parse = 0
|
46
|
+
@retries = 0
|
47
|
+
@druids_failed_to_ix = []
|
48
|
+
@validation_messages = []
|
49
|
+
@config ||= Confstruct::Configuration.new options
|
50
|
+
@config.configure(YAML.load_file(yml_path)) if yml_path && File.exist?(yml_path)
|
51
|
+
yield @config if block_given?
|
52
|
+
@harvestdor = Harvestdor::Indexer.new @config
|
53
|
+
end
|
54
|
+
|
55
|
+
def logger
|
56
|
+
harvestdor.logger
|
57
|
+
end
|
58
|
+
|
59
|
+
def solr_client
|
60
|
+
harvestdor.solr
|
61
|
+
end
|
62
|
+
|
63
|
+
def metrics
|
64
|
+
harvestdor.metrics
|
65
|
+
end
|
66
|
+
|
67
|
+
# per this Indexer's config options
|
68
|
+
# harvest the druids via DorFetcher
|
69
|
+
# create a Solr document for each druid suitable for SearchWorks and
|
70
|
+
# write the result to the SearchWorks Solr index
|
71
|
+
# (all members of the collection + coll rec itself)
|
72
|
+
# @param nocommit [Boolean, nil] truthy skips the final Solr commit;
#   nil (the default) falls back to config.nocommit
def harvest_and_index(nocommit = nil)
  nocommit = config.nocommit if nocommit.nil?

  start_time = Time.now.getlocal
  logger.info("Started harvest_and_index at #{start_time}")

  harvestdor.each_resource(in_threads: 3) do |resource|
    index_with_exception_handling resource
  end

  # Positive condition first: `unless ... else` reads as a double negative.
  if nocommit
    logger.info('Skipping commit per nocommit flag')
  else
    logger.info('Beginning Commit.')
    solr_client.commit!
    logger.info('Finished Commit.')
  end

  # @total_time (seconds) is read later by log_results for the per-object averages.
  @total_time = elapsed_time(start_time)
  logger.info("Finished harvest_and_index at #{Time.now.getlocal}")
  logger.info("Total elapsed time for harvest and index: #{(@total_time / 60).round(2)} minutes")

  log_results
  email_results
end
|
97
|
+
|
98
|
+
def index(resource)
|
99
|
+
doc_hash = solr_document resource
|
100
|
+
run_hook :before_index, resource, doc_hash
|
101
|
+
solr_client.add(doc_hash)
|
102
|
+
end
|
103
|
+
|
104
|
+
def solr_document(resource)
|
105
|
+
if resource.collection?
|
106
|
+
collection_solr_document resource
|
107
|
+
else
|
108
|
+
item_solr_document resource
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
# Index one resource; on any StandardError record the failure (error counter,
# failed-druid list, error log entry) and then re-raise the same exception.
# @param resource [#druid] the resource handed to #index
# @raise [StandardError] whatever #index raised
def index_with_exception_handling(resource)
  index resource
rescue => e
  @error_count += 1
  @druids_failed_to_ix << resource.druid
  # Join the backtrace so the log shows one frame per line rather than an inspected Array.
  logger.error "Failed to index item #{resource.druid}: #{e.message} #{Array(e.backtrace).join("\n")}"
  raise
end
|
120
|
+
|
121
|
+
# create Solr doc for the druid and add it to Solr, unless it is on the blacklist.
|
122
|
+
# NOTE: don't forget to send commit to Solr, either once at end (already in harvest_and_index), or for each add, or ...
|
123
|
+
# @param [Harvestdor::Indexer::Resource] resource an item record (a member of a collection)
|
124
|
+
def item_solr_document(resource)
|
125
|
+
sdb = GDor::Indexer::SolrDocBuilder.new(resource, logger)
|
126
|
+
|
127
|
+
fields_to_add = GDor::Indexer::SolrDocHash.new(
|
128
|
+
druid: resource.bare_druid,
|
129
|
+
url_fulltext: "https://purl.stanford.edu/#{resource.bare_druid}",
|
130
|
+
access_facet: 'Online',
|
131
|
+
display_type: sdb.display_type, # defined in public_xml_fields
|
132
|
+
building_facet: 'Stanford Digital Repository' # INDEX-53 add building_facet = Stanford Digital Repository here for item
|
133
|
+
)
|
134
|
+
fields_to_add[:file_id] = sdb.file_ids if sdb.file_ids # defined in public_xml_fields
|
135
|
+
|
136
|
+
logger.info "indexing item #{resource.bare_druid}"
|
137
|
+
doc_hash = sdb.doc_hash
|
138
|
+
doc_hash.combine fields_to_add
|
139
|
+
add_coll_info doc_hash, resource.collections # defined in public_xml_fields
|
140
|
+
validation_messages = fields_to_add.validate_item(config)
|
141
|
+
validation_messages.concat doc_hash.validate_mods(config)
|
142
|
+
@validation_messages.concat(validation_messages)
|
143
|
+
doc_hash.to_h
|
144
|
+
end
|
145
|
+
|
146
|
+
# Create Solr document for the collection druid suitable for SearchWorks
|
147
|
+
# and write the result to the SearchWorks Solr Index
|
148
|
+
# @param [Harvestdor::Indexer::Resource] resource a collection record
|
149
|
+
def collection_solr_document(resource)
|
150
|
+
coll_sdb = GDor::Indexer::SolrDocBuilder.new(resource, logger)
|
151
|
+
|
152
|
+
fields_to_add = GDor::Indexer::SolrDocHash.new(
|
153
|
+
druid: resource.bare_druid,
|
154
|
+
url_fulltext: "https://purl.stanford.edu/#{resource.bare_druid}",
|
155
|
+
access_facet: 'Online',
|
156
|
+
collection_type: 'Digital Collection',
|
157
|
+
display_type: coll_display_types_from_items(resource),
|
158
|
+
format_main_ssim: 'Archive/Manuscript', # per INDEX-12, add this to all collection records (does not add dups)
|
159
|
+
format: 'Manuscript/Archive', # per INDEX-144, add this to all collection records (does not add dups)
|
160
|
+
building_facet: 'Stanford Digital Repository' # INDEX-53 add building_facet = Stanford Digital Repository here for collection
|
161
|
+
)
|
162
|
+
|
163
|
+
logger.info "Indexing collection object #{resource.druid} (unmerged)"
|
164
|
+
doc_hash = coll_sdb.doc_hash
|
165
|
+
doc_hash.combine fields_to_add
|
166
|
+
validation_messages = doc_hash.validate_collection(config)
|
167
|
+
validation_messages.concat doc_hash.validate_mods(config)
|
168
|
+
@validation_messages.concat(validation_messages)
|
169
|
+
doc_hash.to_h
|
170
|
+
end
|
171
|
+
|
172
|
+
# add coll level data to this solr doc and/or cache collection level information
|
173
|
+
# @param [Hash] doc_hash representing the Solr document (for an item)
|
174
|
+
# @param [Array<Harvestdor::Indexer::Resource>] collections the collections the item is a member of
|
175
|
+
# Resets doc_hash[:collection] and doc_hash[:collection_with_title] and fills them
# with one entry per collection; also records the item's display_type against each
# collection via cache_display_type_for_collection.
# Does nothing (and returns nil) when collections is nil/false.
def add_coll_info(doc_hash, collections)
  return unless collections

  doc_hash[:collection] = []
  doc_hash[:collection_with_title] = []

  collections.each do |collection|
    cache_display_type_for_collection collection, doc_hash[:display_type]
    doc_hash[:collection] << collection.bare_druid
    # "<bare druid>-|-<title>": the druid and title joined by the literal separator "-|-"
    doc_hash[:collection_with_title] << "#{collection.bare_druid}-|-#{coll_title(collection)}"
  end
end
|
187
|
+
|
188
|
+
# cache the coll title so we don't have to look it up more than once
|
189
|
+
# @param resource [#druid, #identity_md_obj_label] a collection resource
# @return the resource's identity_md_obj_label, memoized per druid in @collection_titles
def coll_title(resource)
  @collection_titles ||= {}
  # ||= only caches truthy labels; a nil label is looked up again on the next call.
  @collection_titles[resource.druid] ||= resource.identity_md_obj_label
end
|
195
|
+
|
196
|
+
# cache of display_type from each item so we have this info for indexing collection record
|
197
|
+
# @return [Set<String>] the item display_types cached so far for the given collection (an empty Set is created on first access)
|
198
|
+
def coll_display_types_from_items(resource)
|
199
|
+
@collection_display_types ||= {}
|
200
|
+
@collection_display_types[resource.druid] ||= Set.new
|
201
|
+
end
|
202
|
+
|
203
|
+
# cache the display_type of this (item) object with a collection, so when the collection rec
|
204
|
+
# is being indexed, it can get all of the display_types of the members
|
205
|
+
# @param resource the collection resource, passed through to coll_display_types_from_items
# @param display_type [String, Object] recorded only when it is exactly a String;
#   nil, Arrays, Symbols etc. are ignored and nil is returned
def cache_display_type_for_collection(resource, display_type)
  # instance_of? is already false for nil/false, so no separate truthiness check is needed.
  return unless display_type.instance_of?(String)

  coll_display_types_from_items(resource) << display_type
end
|
210
|
+
|
211
|
+
# count the number of records in solr for this collection (and the collection record itself)
|
212
|
+
# and check for a purl in the collection record
|
213
|
+
# @param fqs [Hash] field => value pairs; each becomes a quoted Solr fq clause (field:"value")
# @return [Integer] numFound for the query; when fqs has a :collection key, the count
#   of records whose id equals that collection value is added on top
def num_found_in_solr(fqs)
  params = {
    fl: 'id',
    rows: 1000,
    fq: fqs.map { |field, value| "#{field}:\"#{value}\"" },
    start: 0
  }
  resp = solr_client.client.get 'select', params: params
  num_found = resp['response']['numFound'].to_i

  # Second query (by id) so the collection record itself is included in the total.
  num_found += num_found_in_solr(id: fqs[:collection]) if fqs.key?(:collection)

  num_found
end
|
226
|
+
|
227
|
+
# create messages about various record counts
|
228
|
+
# @return [Array<String>] Array of messages suitable for notification email and/or logs
|
229
|
+
# Builds the messages once and memoizes them in @record_count_msgs, so the
# Solr count queries run only on the first call.
def record_count_msgs
  @record_count_msgs ||= begin
    msgs = []
    msgs << "Successful count (items + coll record indexed w/o error): #{metrics.success_count}"

    harvestdor.resources.select(&:collection?).each do |collection|
      # One Solr round trip per collection; the result is reused for both lines below.
      solr_count = num_found_in_solr(collection: collection.bare_druid)
      msgs << "#{config.harvestdor.log_name.chomp('.log')} indexed coll record is: #{collection.druid}\n"
      msgs << "coll title: #{coll_title(collection)}\n"
      msgs << "Solr query for items: #{config[:solr][:url]}/select?fq=collection:#{collection.druid}&fl=id,title_245a_display\n"
      msgs << "Records verified in solr for collection #{collection.druid} (items + coll record): #{solr_count}"
      # NOTE(review): this warning is appended for every collection, even when the counts agree,
      # and solr_count also includes the collection record itself -- confirm the intended condition.
      msgs << "WARNING: Expected #{collection.druid} to contain #{collection.items.length} items, but only found #{solr_count}."
    end

    msgs << "Error count (items + coll record w any error; may have indexed on retry if it was a timeout): #{metrics.error_count}"
    # msgs << "Retry count: #{@retries}" # currently useless due to bug in harvestdor-indexer 0.0.12
    msgs << "Total records processed: #{metrics.total}"
    msgs
  end
end
|
249
|
+
|
250
|
+
# log details about the results of indexing
|
251
|
+
# Logs every record-count message, then the average solr-commit / parse /
# complete-index time per object, once over successfully indexed objects and
# once over all objects. An average is skipped when its object count is zero
# or when its total has not been recorded (e.g. @total_time is nil because
# harvest_and_index has not run).
def log_results
  record_count_msgs.each { |msg| logger.info msg }

  totals = [
    ['solr commit', @total_time_to_solr],
    ['parse', @total_time_to_parse],
    ['complete index', @total_time]
  ]
  populations = [
    ['successful', metrics.success_count],
    ['all', metrics.total]
  ]

  totals.each do |label, total_seconds|
    next if total_seconds.nil?

    populations.each do |scope, count|
      next if count == 0

      logger.info("Avg #{label} time per object (#{scope}): #{(total_seconds / count).round(2)} seconds")
    end
  end
end
|
262
|
+
|
263
|
+
# Assembles the plain-text report: record-count messages, the druids that failed
# to index (section omitted when there are none), the log file location on this
# host, and the accumulated validation messages.
# @return [String] the report text
def email_report_body
  body = ''.dup # unfrozen buffer, appended to in place
  body << "\n" << record_count_msgs.join("\n") << "\n"

  unless @druids_failed_to_ix.empty?
    body << "\n"
    body << "records that may have failed to index (merged recs as druids, not ckeys): \n"
    body << @druids_failed_to_ix.join("\n") << "\n"
  end

  body << "\n"
  body << "full log is at gdor_indexer/shared/#{config.harvestdor.log_dir}/#{config.harvestdor.log_name} on #{Socket.gethostname}"
  body << "\n"

  body << @validation_messages.join("\n") << "\n"
end
|
280
|
+
|
281
|
+
# email the results of indexing if we are on one of the harvestdor boxes
|
282
|
+
# Sends the indexing report to config.notification; does nothing when that setting
# is nil/false. A failure while sending is logged and swallowed, so a mail problem
# does not fail the run.
def email_results
  to_email = config.notification
  return unless to_email

  opts = {
    subject: "#{config.harvestdor.log_name.chomp('.log')} into Solr server #{config[:solr][:url]} is finished",
    body: email_report_body
  }

  begin
    send_email(to_email, opts)
  rescue => e
    logger.error('Failed to send email notification!')
    logger.error(e)
  end
end
|
297
|
+
|
298
|
+
# Build and deliver a message through the mail gem.
# @param to recipient address, passed to the message's `to`
# @param opts [Hash] optional :from, :subject, :body; a missing or nil/false value
#   falls back to the default below. The caller's hash is not modified.
# @return whatever Mail's deliver! returns
def send_email(to, opts = {})
  defaults = {
    from: 'gryphondor@stanford.edu',
    subject: 'default subject',
    body: 'default message body'
  }
  # Merge into a fresh hash so the caller's opts stays untouched; the block keeps
  # the default whenever the caller supplied nil/false for a key.
  settings = defaults.merge(opts) { |_key, default, given| given || default }
  # NOTE(review): the original also defaulted :server ('localhost') and :from_alias ('gryphondor')
  # but never used either; delivery settings come from Mail's own configuration.

  recipient = to # distinct name so the DSL call below is unambiguous
  mail = Mail.new do
    from settings[:from]
    to(recipient)
    subject settings[:subject]
    body settings[:body]
  end
  mail.deliver!
end
|
312
|
+
|
313
|
+
# @param start_time [Time] the moment to measure from
# @param units [Symbol] :seconds (rounded to 2 places), :minutes (1 place) or
#   :hours (2 places); any other value returns the unrounded number of seconds
# @return [Float] time elapsed between start_time and now
def elapsed_time(start_time, units = :seconds)
  elapsed_seconds = Time.now.getlocal - start_time
  case units
  when :seconds then elapsed_seconds.round(2)
  when :minutes then (elapsed_seconds / 60.0).round(1)
  when :hours then (elapsed_seconds / 3600.0).round(2)
  else elapsed_seconds
  end
end
|
326
|
+
end
|
327
|
+
end
|