gdor-indexer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
1
+ # Simple Role Syntax
2
+ # ==================
3
+ # Supports bulk-adding hosts to roles, the primary
4
+ # server in each group is considered to be the first
5
+ # unless any hosts have the primary property set.
6
+ # Don't declare `role :all`, it's a meta role
7
+ # role :app, %w{deploy@example.com}
8
+ # role :web, %w{deploy@example.com}
9
+ # role :db, %w{deploy@example.com}
10
+
11
+ # Extended Server Syntax
12
+ # ======================
13
+ # This can be used to drop a more detailed server
14
+ # definition into the server list. The second argument,
15
+ # something that quacks like a hash, can be used to set
16
+ # extended properties on the server.
17
+ server 'harvestdor-dev.stanford.edu', user: 'lyberadmin', roles: %w(web app db)
18
+
19
+ Capistrano::OneTimeKey.generate_one_time_key!
20
+
21
+ # you can set custom ssh options
22
+ # it's possible to pass any option, but keep in mind that net/ssh understands only a limited list of options
23
+ # you can see them in [net/ssh documentation](http://net-ssh.github.io/net-ssh/classes/Net/SSH.html#method-c-start)
24
+ # set it globally
25
+ # set :ssh_options, {
26
+ # keys: %w(/home/rlisowski/.ssh/id_rsa),
27
+ # forward_agent: false,
28
+ # auth_methods: %w(password)
29
+ # }
30
+ # and/or per server
31
+ # server 'example.com',
32
+ # user: 'user_name',
33
+ # roles: %w{web app},
34
+ # ssh_options: {
35
+ # user: 'user_name', # overrides user setting above
36
+ # keys: %w(/home/user_name/.ssh/id_rsa),
37
+ # forward_agent: false,
38
+ # auth_methods: %w(publickey password),
39
+ # password: 'please use keys'
40
+ # }
41
+ # setting per server overrides global ssh_options
@@ -0,0 +1,6 @@
1
+ # Temporary deployment target for DorFetcher work
2
+ server 'harvestdor-dev.stanford.edu', user: 'lyberadmin', roles: %w(app)
3
+
4
+ Capistrano::OneTimeKey.generate_one_time_key!
5
+
6
+ set :deploy_to, "/home/#{fetch(:user)}/gdor-indexer-fetcher"
@@ -0,0 +1,41 @@
1
+ # Simple Role Syntax
2
+ # ==================
3
+ # Supports bulk-adding hosts to roles, the primary
4
+ # server in each group is considered to be the first
5
+ # unless any hosts have the primary property set.
6
+ # Don't declare `role :all`, it's a meta role
7
+ # role :app, %w{deploy@example.com}
8
+ # role :web, %w{deploy@example.com}
9
+ # role :db, %w{deploy@example.com}
10
+
11
+ # Extended Server Syntax
12
+ # ======================
13
+ # This can be used to drop a more detailed server
14
+ # definition into the server list. The second argument,
15
+ # something that quacks like a hash, can be used to set
16
+ # extended properties on the server.
17
+ server 'harvestdor-prod.stanford.edu', user: 'lyberadmin', roles: %w(web app db)
18
+
19
+ Capistrano::OneTimeKey.generate_one_time_key!
20
+
21
+ # you can set custom ssh options
22
+ # it's possible to pass any option, but keep in mind that net/ssh understands only a limited list of options
23
+ # you can see them in [net/ssh documentation](http://net-ssh.github.io/net-ssh/classes/Net/SSH.html#method-c-start)
24
+ # set it globally
25
+ # set :ssh_options, {
26
+ # keys: %w(/home/rlisowski/.ssh/id_rsa),
27
+ # forward_agent: false,
28
+ # auth_methods: %w(password)
29
+ # }
30
+ # and/or per server
31
+ # server 'example.com',
32
+ # user: 'user_name',
33
+ # roles: %w{web app},
34
+ # ssh_options: {
35
+ # user: 'user_name', # overrides user setting above
36
+ # keys: %w(/home/user_name/.ssh/id_rsa),
37
+ # forward_agent: false,
38
+ # auth_methods: %w(publickey password),
39
+ # password: 'please use keys'
40
+ # }
41
+ # setting per server overrides global ssh_options
@@ -0,0 +1,41 @@
1
+ # Simple Role Syntax
2
+ # ==================
3
+ # Supports bulk-adding hosts to roles, the primary
4
+ # server in each group is considered to be the first
5
+ # unless any hosts have the primary property set.
6
+ # Don't declare `role :all`, it's a meta role
7
+ # role :app, %w{deploy@example.com}
8
+ # role :web, %w{deploy@example.com}
9
+ # role :db, %w{deploy@example.com}
10
+
11
+ # Extended Server Syntax
12
+ # ======================
13
+ # This can be used to drop a more detailed server
14
+ # definition into the server list. The second argument,
15
+ # something that quacks like a hash, can be used to set
16
+ # extended properties on the server.
17
+ server 'harvestdor-stage.stanford.edu', user: 'lyberadmin', roles: %w(web app db)
18
+
19
+ Capistrano::OneTimeKey.generate_one_time_key!
20
+
21
+ # you can set custom ssh options
22
+ # it's possible to pass any option, but keep in mind that net/ssh understands only a limited list of options
23
+ # you can see them in [net/ssh documentation](http://net-ssh.github.io/net-ssh/classes/Net/SSH.html#method-c-start)
24
+ # set it globally
25
+ # set :ssh_options, {
26
+ # keys: %w(/home/rlisowski/.ssh/id_rsa),
27
+ # forward_agent: false,
28
+ # auth_methods: %w(password)
29
+ # }
30
+ # and/or per server
31
+ # server 'example.com',
32
+ # user: 'user_name',
33
+ # roles: %w{web app},
34
+ # ssh_options: {
35
+ # user: 'user_name', # overrides user setting above
36
+ # keys: %w(/home/user_name/.ssh/id_rsa),
37
+ # forward_agent: false,
38
+ # auth_methods: %w(publickey password),
39
+ # password: 'please use keys'
40
+ # }
41
+ # setting per server overrides global ssh_options
@@ -0,0 +1,43 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'gdor/indexer/version'
5
+
6
# Gem specification for gdor-indexer.
# The version comes from GDor::Indexer::VERSION; the packaged file list is
# whatever `git ls-files` reports when the gem is built.
Gem::Specification.new do |spec|
  spec.name          = 'gdor-indexer'
  spec.version       = GDor::Indexer::VERSION
  spec.authors       = ['Naomi Dushay', 'Laney McGlohon', 'Chris Beer']
  spec.email         = ['cabeer@stanford.edu']
  spec.summary       = 'Gryphondor Solr indexing logic'
  spec.homepage      = 'https://github.com/sul-dlss/gdor-indexer'
  # RubyGems expects an SPDX license identifier; 'Apache 2' is not one.
  spec.license       = 'Apache-2.0'

  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  # Runtime dependencies.
  # NOTE(review): lib/gdor/indexer.rb also requires 'confstruct' and
  # 'dor-fetcher' directly; presumably they arrive transitively via
  # harvestdor-indexer -- confirm, or declare them here.
  spec.add_dependency 'harvestdor-indexer'
  spec.add_dependency 'stanford-mods'
  spec.add_dependency 'nokogiri'
  spec.add_dependency 'rsolr'
  spec.add_dependency 'activesupport'
  spec.add_dependency 'mail'
  spec.add_dependency 'hooks'
  spec.add_dependency 'trollop'

  # Development and test dependencies.
  spec.add_development_dependency 'bundler', '~> 1.5'
  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'rdoc'
  spec.add_development_dependency 'yard'
  spec.add_development_dependency 'rspec', '~> 3.1'
  spec.add_development_dependency 'rspec-rails'
  spec.add_development_dependency 'rubocop'
  spec.add_development_dependency 'rubocop-rspec'
  spec.add_development_dependency 'simplecov'
  spec.add_development_dependency 'equivalent-xml', '~> 0.5'
  spec.add_development_dependency 'capybara'
  spec.add_development_dependency 'poltergeist', '>= 1.5.0'
  spec.add_development_dependency 'vcr'
  spec.add_development_dependency 'jettywrapper'
  spec.add_development_dependency 'webmock'
end
@@ -0,0 +1,327 @@
1
+ # external gems
2
+ require 'confstruct'
3
+ require 'harvestdor-indexer'
4
+ require 'rsolr'
5
+ require 'mail'
6
+ require 'dor-fetcher'
7
+ require 'hooks'
8
+ require 'active_support/core_ext/array/extract_options'
9
+
10
+ # stdlib
11
+ require 'logger'
12
+ require 'net/smtp'
13
+ require 'set'
14
+
15
+ # Base class to harvest from DOR via harvestdor gem
16
+ module GDor
17
+ class Indexer
18
+ include Hooks
19
+
20
+ define_hooks :before_index, :before_merge
21
+
22
+ # local files
23
+ require 'gdor/indexer/version'
24
+ require 'gdor/indexer/solr_doc_hash'
25
+ require 'gdor/indexer/solr_doc_builder'
26
+ require 'gdor/indexer/nokogiri_xml_node_mixin' if defined? JRUBY_VERSION
27
+
28
+ attr_accessor :harvestdor
29
+ attr_reader :config, :druids_failed_to_ix
30
+
31
+ class <<self
32
+ attr_accessor :config
33
+ end
34
+
35
# Build an indexer from an optional YAML file, an optional options hash and
# an optional configuration block, applied in that order.
# @param args [Array] an optional leading yml_path [String] followed by an
#   optional trailing options [Hash] used to seed the configuration
# @yield [config] the Confstruct configuration, for further adjustment
def initialize(*args)
  options = args.extract_options!
  yml_path = args.first

  # counters and accumulators reported at the end of a run
  @success_count = 0
  @error_count = 0
  @retries = 0
  @total_time_to_solr = 0
  @total_time_to_parse = 0
  @druids_failed_to_ix = []
  @validation_messages = []

  @config ||= Confstruct::Configuration.new(options)
  # a missing or non-existent YAML path is silently skipped
  if yml_path && File.exist?(yml_path)
    @config.configure(YAML.load_file(yml_path))
  end
  yield @config if block_given?

  @harvestdor = Harvestdor::Indexer.new(@config)
end
54
+
55
# @return the logger of the wrapped harvestdor indexer
def logger
  harvestdor.logger
end
58
+
59
# @return the Solr client object exposed by the wrapped harvestdor indexer
def solr_client
  harvestdor.solr
end
62
+
63
# @return the metrics object kept by the wrapped harvestdor indexer
def metrics
  harvestdor.metrics
end
66
+
67
# Per this Indexer's config options:
# iterate over every harvested resource (all members of the collection plus
# the collection record itself), index each one, optionally commit to Solr,
# then log and email the results.
# @param nocommit [Boolean, nil] truthy skips the Solr commit; when nil the
#   value of config.nocommit is used instead
def harvest_and_index(nocommit = nil)
  nocommit = config.nocommit if nocommit.nil?

  start_time = Time.now.getlocal
  logger.info("Started harvest_and_index at #{start_time}")

  harvestdor.each_resource(in_threads: 3) do |resource|
    index_with_exception_handling resource
  end

  if nocommit
    logger.info('Skipping commit per nocommit flag')
  else
    logger.info('Beginning Commit.')
    solr_client.commit!
    logger.info('Finished Commit.')
  end

  # elapsed_time reports seconds by default; convert to minutes for the log
  @total_time = elapsed_time(start_time)
  logger.info("Finished harvest_and_index at #{Time.now.getlocal}")
  logger.info("Total elapsed time for harvest and index: #{(@total_time / 60).round(2)} minutes")

  log_results
  email_results
end
97
+
98
# Build the Solr document for a resource, run the :before_index hook with
# the resource and the document, then add the document to Solr.
# @param resource the harvested resource to index
# @return whatever solr_client.add returns
def index(resource)
  document = solr_document(resource)
  run_hook(:before_index, resource, document)
  solr_client.add document
end
103
+
104
# Dispatch to the collection or item document builder, depending on what
# resource.collection? reports.
# @param resource the harvested resource
# @return the Solr document produced by the chosen builder
def solr_document(resource)
  return collection_solr_document(resource) if resource.collection?

  item_solr_document(resource)
end
111
+
112
# Index a resource; on any StandardError, count the failure, remember the
# druid, log the message with its backtrace, and re-raise the same error.
# NOTE(review): harvest_and_index invokes this from an each_resource block
# that is given in_threads: 3, and the counters here are updated without any
# synchronization -- confirm this is acceptable.
# @param resource the harvested resource to index
# @raise [StandardError] whatever #index raised
def index_with_exception_handling(resource)
  begin
    index(resource)
  rescue StandardError => error
    @error_count += 1
    @druids_failed_to_ix << resource.druid
    logger.error "Failed to index item #{resource.druid}: #{error.message} #{error.backtrace}"
    raise error
  end
end
120
+
121
# Build the Solr document hash for an item. This only builds the hash:
# adding it to Solr and committing happen elsewhere (see #index and
# #harvest_and_index). Validation messages are appended to
# @validation_messages.
# NOTE(review): validate_item runs on the added fields only, whereas
# collection_solr_document validates the merged doc_hash -- confirm this
# difference is intended.
# @param [Harvestdor::Indexer::Resource] resource an item record (a member of a collection)
# @return the result of doc_hash.to_h
def item_solr_document(resource)
  builder = GDor::Indexer::SolrDocBuilder.new(resource, logger)
  druid = resource.bare_druid

  extra_fields = GDor::Indexer::SolrDocHash.new(
    druid: druid,
    url_fulltext: "https://purl.stanford.edu/#{druid}",
    access_facet: 'Online',
    display_type: builder.display_type, # defined in public_xml_fields
    building_facet: 'Stanford Digital Repository' # INDEX-53 add building_facet = Stanford Digital Repository here for item
  )
  file_ids = builder.file_ids # defined in public_xml_fields
  extra_fields[:file_id] = file_ids if file_ids

  logger.info "indexing item #{druid}"
  document = builder.doc_hash
  document.combine(extra_fields)
  add_coll_info(document, resource.collections) # defined in public_xml_fields

  problems = extra_fields.validate_item(config)
  problems.concat(document.validate_mods(config))
  @validation_messages.concat(problems)

  document.to_h
end
145
+
146
# Build the Solr document hash for a collection record. This only builds the
# hash; writing it to the index happens in #index. Validation messages are
# appended to @validation_messages.
# @param [Harvestdor::Indexer::Resource] resource a collection record
# @return the result of doc_hash.to_h
def collection_solr_document(resource)
  builder = GDor::Indexer::SolrDocBuilder.new(resource, logger)
  druid = resource.bare_druid

  extra_fields = GDor::Indexer::SolrDocHash.new(
    druid: druid,
    url_fulltext: "https://purl.stanford.edu/#{druid}",
    access_facet: 'Online',
    collection_type: 'Digital Collection',
    display_type: coll_display_types_from_items(resource), # display types cached from member items
    format_main_ssim: 'Archive/Manuscript', # per INDEX-12, add this to all collection records (does not add dups)
    format: 'Manuscript/Archive', # per INDEX-144, add this to all collection records (does not add dups)
    building_facet: 'Stanford Digital Repository' # INDEX-53 add building_facet = Stanford Digital Repository here for collection
  )

  logger.info "Indexing collection object #{resource.druid} (unmerged)"
  document = builder.doc_hash
  document.combine(extra_fields)

  problems = document.validate_collection(config)
  problems.concat(document.validate_mods(config))
  @validation_messages.concat(problems)

  document.to_h
end
171
+
172
# Add collection-level fields to an item's Solr doc and cache the item's
# display_type against each collection it belongs to.
# Does nothing when collections is nil/false.
# @param [Hash] doc_hash representing the Solr document (for an item)
# @param [Array<Harvestdor::Indexer::Resource>] collections the collections the item is a member of
def add_coll_info(doc_hash, collections)
  return unless collections

  doc_hash[:collection] = []
  doc_hash[:collection_with_title] = []

  collections.each do |coll|
    cache_display_type_for_collection(coll, doc_hash[:display_type])
    druid = coll.bare_druid
    doc_hash[:collection] << druid
    doc_hash[:collection_with_title] << "#{druid}-|-#{coll_title(coll)}"
  end
end
187
+
188
# Look up a collection's title (its identity_md_obj_label), caching it per
# druid so a title that was found is not looked up again.
# @param resource a collection resource
# @return the cached or freshly fetched label
def coll_title(resource)
  @collection_titles ||= {}
  @collection_titles[resource.druid] ||= resource.identity_md_obj_label
end
195
+
196
# Per-collection cache of the display_types seen on member items, so the
# information is available when the collection record itself is indexed.
# Creates an empty Set for a collection druid on first access.
# @param resource a collection resource (keyed by its druid)
# @return [Set<String>] the display_types cached so far for that collection
def coll_display_types_from_items(resource)
  cache = (@collection_display_types ||= {})
  cache[resource.druid] ||= Set.new
end
202
+
203
# Record an item's display_type against a collection, so that when the
# collection record is indexed it can report the display_types of its
# members. Only values that are exactly a String are cached; anything else
# (nil, arrays, symbols, ...) is ignored.
# @param resource the collection resource
# @param display_type the item's display_type
def cache_display_type_for_collection(resource, display_type)
  return unless display_type.instance_of?(String)

  coll_display_types_from_items(resource) << display_type
end
210
+
211
# Ask Solr how many documents match the given field/value filters.
# When the filters include :collection, the count of documents whose id
# equals that collection value is added as well (items + collection record).
# @param fqs [Hash] field => value pairs, each sent as a quoted fq clause
# @return [Integer] the number of matching Solr documents
def num_found_in_solr(fqs)
  query = {
    fl: 'id',
    rows: 1000,
    fq: fqs.map { |field, value| "#{field}:\"#{value}\"" },
    start: 0
  }
  response = solr_client.client.get('select', params: query)
  found = response['response']['numFound'].to_i

  # also count the collection record itself
  found += num_found_in_solr(id: fqs[:collection]) if fqs.key?(:collection)
  found
end
226
+
227
# Create (and memoize) messages about various record counts.
# For every collection among the harvested resources this reports the
# collection druid and title, a Solr query URL, and the number of records
# found in Solr. A WARNING line is added only when Solr holds fewer records
# than the collection has items.
# NOTE(review): the Solr count includes the collection record itself, so the
# check is conservative: it never warns spuriously, but a single missing
# item is masked by the collection record -- confirm the desired threshold.
# @return [Array<String>] Array of messages suitable for notification email and/or logs
def record_count_msgs
  @record_count_msgs ||= begin
    msgs = []
    msgs << "Successful count (items + coll record indexed w/o error): #{metrics.success_count}"

    harvestdor.resources.select(&:collection?).each do |collection|
      # query Solr once per collection and reuse the count below
      solr_count = num_found_in_solr(collection: collection.bare_druid)
      expected_items = collection.items.length

      msgs << "#{config.harvestdor.log_name.chomp('.log')} indexed coll record is: #{collection.druid}\n"
      msgs << "coll title: #{coll_title(collection)}\n"
      msgs << "Solr query for items: #{config[:solr][:url]}/select?fq=collection:#{collection.druid}&fl=id,title_245a_display\n"
      msgs << "Records verified in solr for collection #{collection.druid} (items + coll record): #{solr_count}"
      next unless solr_count < expected_items

      msgs << "WARNING: Expected #{collection.druid} to contain #{expected_items} items, but only found #{solr_count}."
    end

    msgs << "Error count (items + coll record w any error; may have indexed on retry if it was a timeout): #{metrics.error_count}"
    # msgs << "Retry count: #{@retries}" # currently useless due to bug in harvestdor-indexer 0.0.12
    msgs << "Total records processed: #{metrics.total}"
    msgs
  end
end
249
+
250
# Log the record-count messages followed by average timings per object.
# Each average is skipped when its divisor (successful count or total count)
# is zero.
def log_results
  record_count_msgs.each { |msg| logger.info msg }

  succeeded = metrics.success_count
  processed = metrics.total
  average = ->(sum, count) { (sum / count).round(2) }

  logger.info("Avg solr commit time per object (successful): #{average.call(@total_time_to_solr, succeeded)} seconds") unless succeeded == 0
  logger.info("Avg solr commit time per object (all): #{average.call(@total_time_to_solr, processed)} seconds") unless processed == 0
  logger.info("Avg parse time per object (successful): #{average.call(@total_time_to_parse, succeeded)} seconds") unless succeeded == 0
  logger.info("Avg parse time per object (all): #{average.call(@total_time_to_parse, processed)} seconds") unless processed == 0
  logger.info("Avg complete index time per object (successful): #{average.call(@total_time, succeeded)} seconds") unless succeeded == 0
  logger.info("Avg complete index time per object (all): #{average.call(@total_time, processed)} seconds") unless processed == 0
end
262
+
263
# Assemble the plain-text body of the notification email: the record count
# messages, the druids that failed to index (only when there are any), the
# location of the full log on this host, and the validation messages.
# @return [String] the email body
def email_report_body
  sections = ["\n#{record_count_msgs.join("\n")}\n"]

  unless @druids_failed_to_ix.empty?
    sections << "\nrecords that may have failed to index (merged recs as druids, not ckeys): \n"
    sections << "#{@druids_failed_to_ix.join("\n")}\n"
  end

  sections << "\nfull log is at gdor_indexer/shared/#{config.harvestdor.log_dir}/#{config.harvestdor.log_name} on #{Socket.gethostname}\n"
  sections << "#{@validation_messages.join("\n")}\n"
  sections.join
end
280
+
281
# Email the results of indexing to the address in config.notification.
# Does nothing when no notification address is configured. A failure while
# sending is logged and swallowed; building the subject and body happens
# outside the rescue, so errors there still propagate.
def email_results
  recipient = config.notification
  return unless recipient

  mail_opts = {
    subject: "#{config.harvestdor.log_name.chomp('.log')} into Solr server #{config[:solr][:url]} is finished",
    body: email_report_body
  }

  begin
    send_email(recipient, mail_opts)
  rescue StandardError => error
    logger.error('Failed to send email notification!')
    logger.error(error)
  end
end
297
+
298
# Build a Mail message and deliver it.
# Missing entries in opts are filled in with defaults; the opts hash passed
# by the caller is modified in place. :server and :from_alias receive
# defaults but are not used when building the message.
# @param to the recipient address(es)
# @param opts [Hash] :from, :subject, :body (plus :server, :from_alias)
# @return whatever Mail's deliver! returns
def send_email(to, opts = {})
  {
    server: 'localhost',
    from: 'gryphondor@stanford.edu',
    from_alias: 'gryphondor',
    subject: 'default subject',
    body: 'default message body'
  }.each { |key, fallback| opts[key] ||= fallback }

  # the block is evaluated by Mail.new; `to(to)` passes the local `to`
  # to the message's own `to` method
  message = Mail.new do
    from(opts[:from])
    to(to)
    subject(opts[:subject])
    body(opts[:body])
  end
  message.deliver!
end
312
+
313
# Time elapsed between start_time and now.
# @param start_time [Time] when the measured work began
# @param units [Symbol] :seconds (rounded to 2 places), :minutes (rounded to
#   1 place) or :hours (rounded to 2 places); any other value returns the
#   unrounded number of seconds
# @return [Float] the elapsed time in the requested units
def elapsed_time(start_time, units = :seconds)
  seconds = Time.now.getlocal - start_time

  case units
  when :seconds then seconds.round(2)
  when :minutes then (seconds / 60.0).round(1)
  when :hours then (seconds / 3600.0).round(2)
  else seconds
  end
end
326
+ end
327
+ end