gdor-indexer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,41 @@
# Capistrano stage configuration: development.
#
# Role syntax
# -----------
# Hosts can be bulk-added to roles; the first server in each group is
# considered primary unless a host sets the :primary property.
# Do not declare `role :all` -- it is a meta role.
#   role :app, %w{deploy@example.com}
#   role :web, %w{deploy@example.com}
#   role :db,  %w{deploy@example.com}

# Extended server syntax: the second argument (anything that quacks like
# a hash) sets extended properties on the server entry.
server 'harvestdor-dev.stanford.edu', user: 'lyberadmin', roles: %w(web app db)

# Generate a one-time key for this deployment (capistrano-one_time_key).
Capistrano::OneTimeKey.generate_one_time_key!

# Custom SSH options may be set globally via `set :ssh_options, {...}` or
# per server via the `ssh_options:` key; per-server settings override the
# global ones. net/ssh understands only a limited list of options -- see
# [net/ssh documentation](http://net-ssh.github.io/net-ssh/classes/Net/SSH.html#method-c-start)
#
#   set :ssh_options, {
#     keys: %w(/home/rlisowski/.ssh/id_rsa),
#     forward_agent: false,
#     auth_methods: %w(password)
#   }
#
#   server 'example.com',
#     user: 'user_name',
#     roles: %w{web app},
#     ssh_options: {
#       user: 'user_name',              # overrides user setting above
#       keys: %w(/home/user_name/.ssh/id_rsa),
#       forward_agent: false,
#       auth_methods: %w(publickey password)
#       # password: 'please use keys'
#     }
@@ -0,0 +1,6 @@
# Temporary deployment target for DorFetcher work
server 'harvestdor-dev.stanford.edu', user: 'lyberadmin', roles: %w(app)

# Generate a one-time key for this deployment (capistrano-one_time_key).
Capistrano::OneTimeKey.generate_one_time_key!

# Deploy into a dedicated directory so this target stays separate from
# the main gdor-indexer deployment on the same host.
set :deploy_to, "/home/#{fetch(:user)}/gdor-indexer-fetcher"
@@ -0,0 +1,41 @@
# Capistrano stage configuration: production.
#
# Role syntax
# -----------
# Hosts can be bulk-added to roles; the first server in each group is
# considered primary unless a host sets the :primary property.
# Do not declare `role :all` -- it is a meta role.
#   role :app, %w{deploy@example.com}
#   role :web, %w{deploy@example.com}
#   role :db,  %w{deploy@example.com}

# Extended server syntax: the second argument (anything that quacks like
# a hash) sets extended properties on the server entry.
server 'harvestdor-prod.stanford.edu', user: 'lyberadmin', roles: %w(web app db)

# Generate a one-time key for this deployment (capistrano-one_time_key).
Capistrano::OneTimeKey.generate_one_time_key!

# Custom SSH options may be set globally via `set :ssh_options, {...}` or
# per server via the `ssh_options:` key; per-server settings override the
# global ones. net/ssh understands only a limited list of options -- see
# [net/ssh documentation](http://net-ssh.github.io/net-ssh/classes/Net/SSH.html#method-c-start)
#
#   set :ssh_options, {
#     keys: %w(/home/rlisowski/.ssh/id_rsa),
#     forward_agent: false,
#     auth_methods: %w(password)
#   }
#
#   server 'example.com',
#     user: 'user_name',
#     roles: %w{web app},
#     ssh_options: {
#       user: 'user_name',              # overrides user setting above
#       keys: %w(/home/user_name/.ssh/id_rsa),
#       forward_agent: false,
#       auth_methods: %w(publickey password)
#       # password: 'please use keys'
#     }
@@ -0,0 +1,41 @@
# Capistrano stage configuration: staging.
#
# Role syntax
# -----------
# Hosts can be bulk-added to roles; the first server in each group is
# considered primary unless a host sets the :primary property.
# Do not declare `role :all` -- it is a meta role.
#   role :app, %w{deploy@example.com}
#   role :web, %w{deploy@example.com}
#   role :db,  %w{deploy@example.com}

# Extended server syntax: the second argument (anything that quacks like
# a hash) sets extended properties on the server entry.
server 'harvestdor-stage.stanford.edu', user: 'lyberadmin', roles: %w(web app db)

# Generate a one-time key for this deployment (capistrano-one_time_key).
Capistrano::OneTimeKey.generate_one_time_key!

# Custom SSH options may be set globally via `set :ssh_options, {...}` or
# per server via the `ssh_options:` key; per-server settings override the
# global ones. net/ssh understands only a limited list of options -- see
# [net/ssh documentation](http://net-ssh.github.io/net-ssh/classes/Net/SSH.html#method-c-start)
#
#   set :ssh_options, {
#     keys: %w(/home/rlisowski/.ssh/id_rsa),
#     forward_agent: false,
#     auth_methods: %w(password)
#   }
#
#   server 'example.com',
#     user: 'user_name',
#     roles: %w{web app},
#     ssh_options: {
#       user: 'user_name',              # overrides user setting above
#       keys: %w(/home/user_name/.ssh/id_rsa),
#       forward_agent: false,
#       auth_methods: %w(publickey password)
#       # password: 'please use keys'
#     }
@@ -0,0 +1,43 @@
# coding: utf-8
# Gem specification for gdor-indexer: Gryphondor Solr indexing logic.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'gdor/indexer/version'

Gem::Specification.new do |spec|
  spec.name          = 'gdor-indexer'
  spec.version       = GDor::Indexer::VERSION
  spec.authors       = ['Naomi Dushay', 'Laney McGlohon', 'Chris Beer']
  spec.email         = ['cabeer@stanford.edu']
  spec.summary       = 'Gryphondor Solr indexing logic'
  spec.homepage      = 'https://github.com/sul-dlss/gdor-indexer'
  spec.license       = 'Apache 2'

  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  # Runtime dependencies (all unversioned).
  %w(
    harvestdor-indexer
    stanford-mods
    nokogiri
    rsolr
    activesupport
    mail
    hooks
    trollop
  ).each { |dep| spec.add_dependency dep }

  # Development-only dependencies; each entry is [name, requirement?].
  [
    ['bundler', '~> 1.5'],
    ['rake'],
    ['rdoc'],
    ['yard'],
    ['rspec', '~> 3.1'],
    ['rspec-rails'],
    ['rubocop'],
    ['rubocop-rspec'],
    ['simplecov'],
    ['equivalent-xml', '~> 0.5'],
    ['capybara'],
    ['poltergeist', '>= 1.5.0'],
    ['vcr'],
    ['jettywrapper'],
    ['webmock']
  ].each { |dep| spec.add_development_dependency(*dep) }
end
@@ -0,0 +1,327 @@
# external gems
require 'confstruct'
require 'harvestdor-indexer'
require 'rsolr'
require 'mail'
require 'dor-fetcher'
require 'hooks'
require 'active_support/core_ext/array/extract_options'

# stdlib
require 'logger'
require 'net/smtp'
require 'set'

# Base class to harvest from DOR via harvestdor gem
module GDor
  # Harvests DOR objects via the harvestdor-indexer gem and writes a
  # SearchWorks-style Solr document for each item and collection record.
  class Indexer
    include Hooks

    # client code can register callbacks via the hooks gem
    define_hooks :before_index, :before_merge

    # local files
    require 'gdor/indexer/version'
    require 'gdor/indexer/solr_doc_hash'
    require 'gdor/indexer/solr_doc_builder'
    require 'gdor/indexer/nokogiri_xml_node_mixin' if defined? JRUBY_VERSION

    attr_accessor :harvestdor
    attr_reader :config, :druids_failed_to_ix

    class <<self
      attr_accessor :config
    end

    # Initialize with configuration files
    # @param yml_path [String] /path/to
    # @param options [Hash]
    # NOTE(review): config layering order is: options hash first, then the
    # YAML file (if it exists) via Confstruct#configure, then a given block
    # may mutate @config, before the Harvestdor::Indexer is constructed.
    def initialize(*args)
      options = args.extract_options!
      yml_path = args.first

      @success_count = 0
      @error_count = 0
      @total_time_to_solr = 0
      @total_time_to_parse = 0
      @retries = 0
      @druids_failed_to_ix = [] # druids that raised during indexing
      @validation_messages = [] # accumulated doc/MODS validation output
      @config ||= Confstruct::Configuration.new options
      @config.configure(YAML.load_file(yml_path)) if yml_path && File.exist?(yml_path)
      yield @config if block_given?
      @harvestdor = Harvestdor::Indexer.new @config
    end

    # @return [Logger] delegated to the harvestdor-indexer's logger
    def logger
      harvestdor.logger
    end

    # @return the Solr client held by the harvestdor-indexer
    def solr_client
      harvestdor.solr
    end

    # @return the harvestdor-indexer metrics object (success/error counts)
    def metrics
      harvestdor.metrics
    end

    # per this Indexer's config options
    # harvest the druids via DorFetcher
    # create a Solr document for each druid suitable for SearchWorks and
    # write the result to the SearchWorks Solr index
    # (all members of the collection + coll rec itself)
    # @param nocommit [Boolean, nil] when nil, falls back to config.nocommit;
    #   when truthy, the final Solr commit is skipped
    def harvest_and_index(nocommit = nil)
      nocommit = config.nocommit if nocommit.nil?

      start_time = Time.now.getlocal
      logger.info("Started harvest_and_index at #{start_time}")

      # resources are processed in 3 threads; per-item errors are recorded
      # (and re-raised) by index_with_exception_handling
      harvestdor.each_resource(in_threads: 3) do |resource|
        index_with_exception_handling resource
      end

      unless nocommit
        logger.info('Beginning Commit.')
        solr_client.commit!
        logger.info('Finished Commit.')
      else
        logger.info('Skipping commit per nocommit flag')
      end

      @total_time = elapsed_time(start_time)
      logger.info("Finished harvest_and_index at #{Time.now.getlocal}")
      logger.info("Total elapsed time for harvest and index: #{(@total_time / 60).round(2)} minutes")

      log_results
      email_results
    end

    # build the Solr doc for a resource, run :before_index hooks on it,
    # then add it to Solr (no commit here)
    def index(resource)
      doc_hash = solr_document resource
      run_hook :before_index, resource, doc_hash
      solr_client.add(doc_hash)
    end

    # @param [Harvestdor::Indexer::Resource] resource
    # @return [Hash] Solr document hash for the resource (collection or item)
    def solr_document(resource)
      if resource.collection?
        collection_solr_document resource
      else
        item_solr_document resource
      end
    end

    # index the resource, recording any failure (count + druid) before
    # re-raising the error to the caller
    def index_with_exception_handling(resource)
      index resource
    rescue => e
      @error_count += 1
      @druids_failed_to_ix << resource.druid
      logger.error "Failed to index item #{resource.druid}: #{e.message} #{e.backtrace}"
      raise e
    end

    # create Solr doc for the druid and add it to Solr, unless it is on the blacklist.
    # NOTE: don't forget to send commit to Solr, either once at end (already in harvest_and_index), or for each add, or ...
    # @param [Harvestdor::Indexer::Resource] resource an item record (a member of a collection)
    # @return [Hash] the completed Solr document hash for the item
    def item_solr_document(resource)
      sdb = GDor::Indexer::SolrDocBuilder.new(resource, logger)

      fields_to_add = GDor::Indexer::SolrDocHash.new(
        druid: resource.bare_druid,
        url_fulltext: "https://purl.stanford.edu/#{resource.bare_druid}",
        access_facet: 'Online',
        display_type: sdb.display_type, # defined in public_xml_fields
        building_facet: 'Stanford Digital Repository' # INDEX-53 add building_facet = Stanford Digital Repository here for item
      )
      fields_to_add[:file_id] = sdb.file_ids if sdb.file_ids # defined in public_xml_fields

      logger.info "indexing item #{resource.bare_druid}"
      doc_hash = sdb.doc_hash
      doc_hash.combine fields_to_add
      add_coll_info doc_hash, resource.collections # defined in public_xml_fields
      # collect validation problems for the summary email rather than failing
      validation_messages = fields_to_add.validate_item(config)
      validation_messages.concat doc_hash.validate_mods(config)
      @validation_messages.concat(validation_messages)
      doc_hash.to_h
    end

    # Create Solr document for the collection druid suitable for SearchWorks
    # and write the result to the SearchWorks Solr Index
    # @param [Harvestdor::Indexer::Resource] resource a collection record
    # @return [Hash] the completed Solr document hash for the collection
    def collection_solr_document(resource)
      coll_sdb = GDor::Indexer::SolrDocBuilder.new(resource, logger)

      fields_to_add = GDor::Indexer::SolrDocHash.new(
        druid: resource.bare_druid,
        url_fulltext: "https://purl.stanford.edu/#{resource.bare_druid}",
        access_facet: 'Online',
        collection_type: 'Digital Collection',
        display_type: coll_display_types_from_items(resource),
        format_main_ssim: 'Archive/Manuscript', # per INDEX-12, add this to all collection records (does not add dups)
        format: 'Manuscript/Archive', # per INDEX-144, add this to all collection records (does not add dups)
        building_facet: 'Stanford Digital Repository' # INDEX-53 add building_facet = Stanford Digital Repository here for collection
      )

      logger.info "Indexing collection object #{resource.druid} (unmerged)"
      doc_hash = coll_sdb.doc_hash
      doc_hash.combine fields_to_add
      validation_messages = doc_hash.validate_collection(config)
      validation_messages.concat doc_hash.validate_mods(config)
      @validation_messages.concat(validation_messages)
      doc_hash.to_h
    end

    # add coll level data to this solr doc and/or cache collection level information
    # @param [Hash] doc_hash representing the Solr document (for an item)
    # @param [Array<Harvestdor::Indexer::Resource>] collections the collections the item is a member of
    def add_coll_info(doc_hash, collections)
      if collections
        doc_hash[:collection] = []
        doc_hash[:collection_with_title] = []

        collections.each do |collection|
          # remember the item's display_type so the collection record can
          # later report all display_types of its members
          cache_display_type_for_collection collection, doc_hash[:display_type]
          doc_hash[:collection] << collection.bare_druid
          doc_hash[:collection_with_title] << "#{collection.bare_druid}-|-#{coll_title(collection)}"
        end
      end
    end

    # cache the coll title so we don't have to look it up more than once
    # @return [String] the collection's identityMetadata object label
    def coll_title(resource)
      @collection_titles ||= {}
      @collection_titles[resource.druid] ||= begin
        resource.identity_md_obj_label
      end
    end

    # cache of display_type from each item so we have this info for indexing collection record
    # @return [Hash<String, Array<String>>] collection druids as keys, array of item display_types as values
    def coll_display_types_from_items(resource)
      @collection_display_types ||= {}
      @collection_display_types[resource.druid] ||= Set.new
    end

    # cache the display_type of this (item) object with a collection, so when the collection rec
    # is being indexed, it can get all of the display_types of the members
    def cache_display_type_for_collection(resource, display_type)
      if display_type && display_type.instance_of?(String)
        coll_display_types_from_items(resource) << display_type
      end
    end

    # count the number of records in solr for this collection (and the collection record itself)
    # and check for a purl in the collection record
    # @param fqs [Hash] field => value pairs, each turned into a quoted Solr fq clause
    # @return [Integer] numFound; when an fq on :collection is present, the
    #   collection record itself is counted too via a recursive call on its id
    def num_found_in_solr(fqs)
      params = { fl: 'id', rows: 1000 }
      params[:fq] = fqs.map { |k, v| "#{k}:\"#{v}\"" }
      params[:start] ||= 0
      resp = solr_client.client.get 'select', params: params
      num_found = resp['response']['numFound'].to_i

      if fqs.key? :collection
        num_found += num_found_in_solr id: fqs[:collection]
      end

      num_found
    end

    # create messages about various record counts
    # @return [Array<String>] Array of messages suitable for notificaiton email and/or logs
    def record_count_msgs
      @record_count_msgs ||= begin
        msgs = []
        msgs << "Successful count (items + coll record indexed w/o error): #{metrics.success_count}"

        harvestdor.resources.select(&:collection?).each do |collection|
          solr_count = num_found_in_solr(collection: collection.bare_druid)
          msgs << "#{config.harvestdor.log_name.chomp('.log')} indexed coll record is: #{collection.druid}\n"
          msgs << "coll title: #{coll_title(collection)}\n"
          msgs << "Solr query for items: #{config[:solr][:url]}/select?fq=collection:#{collection.druid}&fl=id,title_245a_display\n"
          msgs << "Records verified in solr for collection #{collection.druid} (items + coll record): #{num_found_in_solr collection: collection.bare_druid}"
          # NOTE(review): this WARNING is appended unconditionally, even when
          # solr_count matches the expected item count -- looks like it should
          # be guarded by a comparison; confirm intended behavior
          msgs << "WARNING: Expected #{collection.druid} to contain #{collection.items.length} items, but only found #{solr_count}."
        end

        msgs << "Error count (items + coll record w any error; may have indexed on retry if it was a timeout): #{metrics.error_count}"
        # msgs << "Retry count: #{@retries}" # currently useless due to bug in harvestdor-indexer 0.0.12
        msgs << "Total records processed: #{metrics.total}"
        msgs
      end
    end

    # log details about the results of indexing
    # guards avoid division by zero when nothing was processed
    def log_results
      record_count_msgs.each do |msg|
        logger.info msg
      end
      logger.info("Avg solr commit time per object (successful): #{(@total_time_to_solr / metrics.success_count).round(2)} seconds") unless metrics.success_count == 0
      logger.info("Avg solr commit time per object (all): #{(@total_time_to_solr / metrics.total).round(2)} seconds") unless metrics.total == 0
      logger.info("Avg parse time per object (successful): #{(@total_time_to_parse / metrics.success_count).round(2)} seconds") unless metrics.success_count == 0
      logger.info("Avg parse time per object (all): #{(@total_time_to_parse / metrics.total).round(2)} seconds") unless metrics.total == 0
      logger.info("Avg complete index time per object (successful): #{(@total_time / metrics.success_count).round(2)} seconds") unless metrics.success_count == 0
      logger.info("Avg complete index time per object (all): #{(@total_time / metrics.total).round(2)} seconds") unless metrics.total == 0
    end

    # assemble the plain-text body for the results notification email:
    # record counts, failed druids (if any), log location, and any
    # accumulated validation messages
    # @return [String]
    def email_report_body
      body = ''

      body += "\n" + record_count_msgs.join("\n") + "\n"

      if @druids_failed_to_ix.size > 0
        body += "\n"
        body += "records that may have failed to index (merged recs as druids, not ckeys): \n"
        body += @druids_failed_to_ix.join("\n") + "\n"
      end

      body += "\n"
      body += "full log is at gdor_indexer/shared/#{config.harvestdor.log_dir}/#{config.harvestdor.log_name} on #{Socket.gethostname}"
      body += "\n"

      body += @validation_messages.join("\n") + "\n"
    end

    # email the results of indexing if we are on one of the harvestdor boxes
    # email failures are logged but never raised
    def email_results
      if config.notification
        to_email = config.notification

        opts = {}
        opts[:subject] = "#{config.harvestdor.log_name.chomp('.log')} into Solr server #{config[:solr][:url]} is finished"
        opts[:body] = email_report_body
        begin
          send_email(to_email, opts)
        rescue => e
          logger.error('Failed to send email notification!')
          logger.error(e)
        end
      end
    end

    # deliver a message via the mail gem
    # @param to [String] recipient address (captured by the Mail DSL block closure)
    # @param opts [Hash] :server, :from, :from_alias, :subject, :body -- all optional
    def send_email(to, opts = {})
      opts[:server] ||= 'localhost'
      opts[:from] ||= 'gryphondor@stanford.edu'
      opts[:from_alias] ||= 'gryphondor'
      opts[:subject] ||= 'default subject'
      opts[:body] ||= 'default message body'
      mail = Mail.new do
        from opts[:from]
        to to
        subject opts[:subject]
        body opts[:body]
      end
      mail.deliver!
    end

    # @param start_time [Time] when the measured work began
    # @param units [Symbol] :seconds (default), :minutes, or :hours;
    #   anything else returns the raw Float of elapsed seconds
    # @return [Numeric] elapsed wall-clock time in the requested units
    def elapsed_time(start_time, units = :seconds)
      elapsed_seconds = Time.now.getlocal - start_time
      case units
      when :seconds
        return elapsed_seconds.round(2)
      when :minutes
        return (elapsed_seconds / 60.0).round(1)
      when :hours
        return (elapsed_seconds / 3600.0).round(2)
      else
        return elapsed_seconds
      end
    end
  end
end