base_indexer 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2d118d2ed75c9c51447b0478817540be6e771d70
4
- data.tar.gz: af50248dadb044c772d0617941c65a5f35089755
3
+ metadata.gz: 835d4deacab7029147d2dd106b1cf6518ddae2a8
4
+ data.tar.gz: 8f44888fb252805cc0695006c6833c41805e582b
5
5
  SHA512:
6
- metadata.gz: 41a51441cf151a7ae1e2c7eac35c21cf87915f70f27a68cb26bc112db87d3f8e56c0d4ce75c090cd9c969696df95135f46be72ba2771b12604bc0adc4a4c6870
7
- data.tar.gz: 7301d83e547234ab2671bcc1bdae8bb008cd4f55c155cb2521d5365a28ba5c1e6a49a10ee573ab43d396daa20f4f9594d4b1d9391aa4510c4216708f13f910bc
6
+ metadata.gz: 9599ad3b0ccff641dd9d09d59cf4a3d676376b197ef48439318699a30afc32684c120e17d1a5f7ae2ec4a24d93765a0086c5034bf86a4c6e78bb9b2a16d7bd60
7
+ data.tar.gz: 9bec426885112b59c5010ede1ab6f7c53b3ae29f1f52c20fc8d7a4eb828cd1f1acf836b0126fb8baa9b6b32ccd073e7926324c3c001322072689617e87eaced1
@@ -1,3 +1,3 @@
1
1
  module BaseIndexer
2
- VERSION = '1.0.1'
2
+ VERSION = '1.0.2'
3
3
  end
data/lib/tasks/index.rake CHANGED
@@ -11,13 +11,13 @@ def log(logger,message,log_type=:info)
11
11
  end
12
12
  puts message
13
13
  $stdout.flush
14
-
14
+
15
15
  end
16
16
 
17
17
  desc 'Index a specific list of druids from a pre-assembly log YAML file, a remediate log file, or a simple CSV. Specify target to index into and log file to index from.'
18
- #Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.yaml log_type=preassembly
19
- #Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1_remediation.yaml log_type=remediate
20
- #Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.csv log_type=csv # csv must contain a heading called "druid" with the druid to index
18
+ #Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.yaml log_type=preassembly
19
+ #Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1_remediation.yaml log_type=remediate
20
+ #Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.csv log_type=csv # csv must contain a heading called "druid" with the druid to index
21
21
 
22
22
  # Examples:
23
23
  task :log_indexer => :environment do |t, args|
@@ -25,55 +25,55 @@ task :log_indexer => :environment do |t, args|
25
25
  target = ENV['target'] # must pass in the target so specify solr core to index into
26
26
  log_file_path = ENV['log_file'] # must specify pre-assembly log file to index from
27
27
  log_type = ENV['log_type'] || 'preassembly' # log type (either preassembly, csv, or remediate), defaults to preassembly
28
-
28
+
29
29
  raise 'You must specify a target and log file.' if target.blank? || log_file_path.blank?
30
30
  raise 'Log type must be preassembly, remediate or csv.' unless ['preassembly','remediate','csv'].include? log_type
31
31
  raise 'Log file not found.' unless File.readable? log_file_path
32
-
32
+
33
33
  target_config=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]
34
-
34
+
35
35
  raise 'Target not found.' if target_config.nil?
36
-
36
+
37
37
  if log_type.blank? || log_type == 'preassembly'
38
38
  log_completed=:pre_assem_finished
39
39
  elsif log_type == 'remediate'
40
40
  log_completed=:remediate_completed
41
41
  end
42
-
42
+
43
43
  output_log_file_name="#{Rails.root}/log/#{File.basename(log_file_path,File.extname(log_file_path))}_indexer_#{Time.now.strftime('%Y%m%d-%H%M%S')}.log"
44
44
  my_logger=Logger.new(output_log_file_name) # set up a new log file
45
-
45
+
46
46
  start_time=Time.now
47
-
47
+
48
48
  errors=0
49
49
  indexed=0
50
50
 
51
51
  druids=[]
52
-
52
+
53
53
  if ['preassembly','remediate'].include? log_type
54
- YAML.load_stream(IO.read(log_file_path)) { |obj| druids << obj[:pid] if obj[log_completed] == true}
54
+ YAML.load_stream(IO.read(log_file_path)) { |obj| druids << obj[:pid] if obj[log_completed] == true}
55
55
  else
56
56
  csv = CSV.parse(IO.read(log_file_path), :headers => true)
57
57
  druids=csv.map { |row| row.to_hash.with_indifferent_access['druid'] }.delete_if {|druid| druid.nil?}
58
58
  end
59
-
59
+
60
60
  solr_server=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]['url']
61
-
61
+
62
62
  log my_logger,"** Indexing #{druids.size} druids from #{log_file_path} into solr server #{solr_server} (target=#{target}). Log file is of type #{log_type}."
63
63
  log my_logger,"Indexing started at #{start_time}"
64
64
 
65
65
  indexer = BaseIndexer.indexer_class.constantize.new
66
66
 
67
67
  counter=0
68
-
68
+
69
69
  druids.each do |druid|
70
-
70
+
71
71
  druid.gsub!('druid:','')
72
72
  counter+=1
73
-
73
+
74
74
  begin
75
75
  with_retries(:max_tries => 5, :base_sleep_seconds => 3, :max_sleep_seconds => 60) do
76
- indexer.index(druid,[target])
76
+ indexer.index(druid,{target=>true})
77
77
  log my_logger,"#{counter} of #{druids.size}: #{druid}"
78
78
  indexed += 1
79
79
  end
@@ -83,34 +83,34 @@ task :log_indexer => :environment do |t, args|
83
83
  end
84
84
 
85
85
  end
86
-
86
+
87
87
  log my_logger,"Objects indexed: #{indexed} out of #{druids.size}"
88
88
  log(my_logger,"ERRORS Encountered, #{errors} objects not indexed") if errors > 0
89
89
  log my_logger,"Completed at #{Time.now}, total time was #{'%.2f' % ((Time.now - start_time)/60.0)} minutes"
90
90
  puts "Logged output at #{output_log_file_name}"
91
-
91
+
92
92
  end
93
-
93
+
94
94
  desc "Delete a single druid. It will be deleted from all targets!"
95
95
  #Run me: rake delete RAILS_ENV=production druid=oo000oo0001
96
96
  # Examples:
97
97
  task :delete => :environment do |t, args|
98
98
 
99
- druid = ENV['druid']
100
-
99
+ druid = ENV['druid']
100
+
101
101
  raise 'You must specify a druid.' if druid.blank?
102
102
 
103
103
  print "Are you sure you wish to delete this druid from all targets? (y/n) "
104
- STDOUT.flush
104
+ STDOUT.flush
105
105
  answer=STDIN.gets.chomp
106
-
106
+
107
107
  raise 'STOP!' unless (answer && ['y','yes'].include?(answer.downcase))
108
-
108
+
109
109
  puts "** Delete #{druid} druid from all targets."
110
110
 
111
111
  indexer = BaseIndexer.indexer_class.constantize.new
112
112
  indexer.delete druid.gsub('druid:','')
113
-
113
+
114
114
  end
115
115
 
116
116
  desc 'Index a single druid. Specify target to index into and druid to index.'
@@ -119,21 +119,21 @@ desc 'Index a single druid. Specify target to index into and druid to index.'
119
119
  task :index => :environment do |t, args|
120
120
 
121
121
  target = ENV['target'] # must pass in the target so specify solr core to index into
122
- druid = ENV['druid']
123
-
122
+ druid = ENV['druid']
123
+
124
124
  raise 'You must specify a target and druid.' if target.blank? || druid.blank?
125
-
125
+
126
126
  target_config=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]
127
-
127
+
128
128
  raise 'Target not found.' if target_config.nil?
129
129
 
130
130
  solr_server=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]['url']
131
-
131
+
132
132
  puts "** Indexing #{druid} druid into solr server #{solr_server} (target=#{target})."
133
133
 
134
134
  indexer = BaseIndexer.indexer_class.constantize.new
135
- indexer.index(druid.gsub('druid:',''),[target])
136
-
135
+ indexer.index(druid.gsub('druid:',''),{target=>true})
136
+
137
137
  end
138
138
 
139
139
  desc 'Index an entire collection, including the collection itself and all of its members. Specify target to index into and collection druid to index.'
@@ -142,21 +142,21 @@ desc 'Index an entire collection, including the collection itself and all of its
142
142
  task :collection_indexer => :environment do |t, args|
143
143
 
144
144
  target = ENV['target'] # must pass in the target so specify solr core to index into
145
- collection_druid = ENV['collection_druid']
146
-
145
+ collection_druid = ENV['collection_druid']
146
+
147
147
  raise 'You must specify a target and collection druid.' if target.blank? || collection_druid.blank?
148
-
148
+
149
149
  target_config=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]
150
-
150
+
151
151
  raise 'Target not found.' if target_config.nil?
152
152
 
153
153
  solr_server=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]['url']
154
154
 
155
155
  output_log_file_name="#{Rails.root}/log/collection_#{collection_druid}_indexer_#{Time.now.strftime('%Y%m%d-%H%M%S')}.log"
156
156
  my_logger=Logger.new(output_log_file_name) # set up a new log file
157
-
157
+
158
158
  log my_logger,"** Indexing collection #{collection_druid} druid and all of its members into solr server #{solr_server} (target=#{target})."
159
-
159
+
160
160
  start_time=Time.now
161
161
  log my_logger,"Indexing started at #{start_time}"
162
162
 
@@ -165,10 +165,10 @@ task :collection_indexer => :environment do |t, args|
165
165
  df = DorFetcher::Client.new({:service_url => Rails.application.config.dor_fetcher_url})
166
166
 
167
167
  collection_druid=collection_druid.gsub('druid:','')
168
-
169
- indexer.index(collection_druid,[target])
168
+
169
+ indexer.index(collection_druid,{target=>true})
170
170
  log my_logger,"Indexed collection: #{collection_druid}"
171
-
171
+
172
172
  druids = df.druid_array(df.get_collection(collection_druid, {}))
173
173
 
174
174
  log my_logger,"** Found #{druids.size} members of the collection"
@@ -176,15 +176,15 @@ task :collection_indexer => :environment do |t, args|
176
176
  counter=0
177
177
  indexed=0
178
178
  errors=0
179
-
179
+
180
180
  druids.each do |druid|
181
-
181
+
182
182
  druid=druid.gsub('druid:','')
183
183
  counter+=1
184
-
184
+
185
185
  begin
186
186
  with_retries(:max_tries => 5, :base_sleep_seconds => 3, :max_sleep_seconds => 60) do
187
- indexer.index(druid,[target])
187
+ indexer.index(druid,{target=>true})
188
188
  log my_logger,"#{counter} of #{druids.size}: #{druid}"
189
189
  indexed += 1
190
190
  end
@@ -194,13 +194,13 @@ task :collection_indexer => :environment do |t, args|
194
194
  end
195
195
 
196
196
  end
197
-
197
+
198
198
  log my_logger,"Objects indexed: #{indexed} out of #{druids.size} + 1 collection druid"
199
199
  log(my_logger,"ERRORS Encountered, #{errors} objects not indexed") if errors > 0
200
200
  log my_logger,"Completed at #{Time.now}, total time was #{'%.2f' % ((Time.now - start_time)/60.0)} minutes"
201
201
  puts "Logged output at #{output_log_file_name}"
202
-
203
- end
202
+
203
+ end
204
204
 
205
205
  desc 'ReIndex just the druids that errored out from a previous batch index run. Specify target to index into and batch errored log file to index from.'
206
206
  #Run me: rake reindexer RAILS_ENV=production target=revs_prod file=./log/index.log
@@ -209,16 +209,16 @@ task :reindexer => :environment do |t, args|
209
209
 
210
210
  target = ENV['target'] # must pass in the target so specify solr core to index into
211
211
  file_path = ENV['file'] # must specify previous indexing log file to index from
212
-
212
+
213
213
  raise 'You must specify a target and file.' if target.blank? || file_path.blank?
214
214
  raise 'File not found.' unless File.readable? file_path
215
-
215
+
216
216
  target_config=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]
217
-
217
+
218
218
  raise 'Target not found.' if target_config.nil?
219
219
 
220
220
  start_time=Time.now
221
-
221
+
222
222
  errors=0
223
223
  indexed=0
224
224
 
@@ -226,7 +226,7 @@ task :reindexer => :environment do |t, args|
226
226
 
227
227
  output_log_file_name="#{Rails.root}/log/#{File.basename(file_path,File.extname(file_path))}_reindex_#{Time.now.strftime('%Y%m%d-%H%M%S')}.log"
228
228
  my_logger=Logger.new(output_log_file_name) # set up a new log file
229
-
229
+
230
230
  log my_logger,"** Indexing errored out druids from #{file_path} into solr server #{solr_server} (target=#{target})."
231
231
  log my_logger,"Indexing started at #{start_time}"
232
232
 
@@ -237,15 +237,15 @@ task :reindexer => :environment do |t, args|
237
237
  IO.readlines(file_path).each do |line|
238
238
 
239
239
  downcased_line=line.downcase
240
-
240
+
241
241
  if downcased_line.include? 'error'
242
242
  druid=downcased_line.scan(/[a-z][a-z][0-9][0-9][0-9][a-z][a-z][0-9][0-9][0-9][0-9]/).first
243
-
244
- unless druid.blank?
243
+
244
+ unless druid.blank?
245
245
  begin
246
246
  counter+=1
247
247
  with_retries(:max_tries => 5, :base_sleep_seconds => 3, :max_sleep_seconds => 60) do
248
- indexer.index(druid,[target])
248
+ indexer.index(druid,{target=>true})
249
249
  log my_logger,"#{counter}: #{druid}"
250
250
  indexed += 1
251
251
  end
@@ -254,16 +254,16 @@ task :reindexer => :environment do |t, args|
254
254
  errors += 1
255
255
  end
256
256
  end
257
-
257
+
258
258
  end
259
-
259
+
260
260
  end
261
-
261
+
262
262
  log my_logger,"Objects indexed: #{indexed}"
263
263
  log(my_logger,"ERRORS Encountered, #{errors} objects not indexed") if errors > 0
264
264
  log my_logger,"Completed at #{Time.now}, total time was #{'%.2f' % ((Time.now - start_time)/60.0)} minutes"
265
265
  puts "Logged output at #{output_log_file_name}"
266
-
266
+
267
267
  end
268
268
 
269
269
  desc 'Delete the druids specified in the supplied text file (one druid per line, header not necessary). Be careful! It will delete from all targets.'
@@ -272,24 +272,24 @@ desc 'Delete the druids specified in the supplied text file (one druid per line,
272
272
  task :delete_druids => :environment do |t, args|
273
273
 
274
274
  file_path = ENV['file'] # must specify previous indexing log file to index from
275
-
275
+
276
276
  raise 'You must specify a druid file.' if file_path.blank?
277
277
  raise 'File not found.' unless File.readable? file_path
278
278
 
279
279
  print "Are you sure you wish to delete all of the druids from all targets specified in #{file_path}? (y/n) "
280
- STDOUT.flush
280
+ STDOUT.flush
281
281
  answer=STDIN.gets.chomp
282
-
282
+
283
283
  raise 'STOP!' unless (answer && ['y','yes'].include?(answer.downcase))
284
-
284
+
285
285
  output_log_file_name="#{Rails.root}/log/#{File.basename(file_path,File.extname(file_path))}_delete_#{Time.now.strftime('%Y%m%d-%H%M%S')}.log"
286
286
  my_logger=Logger.new(output_log_file_name) # set up a new log file
287
-
287
+
288
288
  start_time=Time.now
289
-
289
+
290
290
  errors=0
291
291
  indexed=0
292
-
292
+
293
293
  log my_logger,"** Deleting druids from #{file_path} in all targets."
294
294
  log my_logger,"Deleting started at #{start_time}"
295
295
 
@@ -301,10 +301,10 @@ task :delete_druids => :environment do |t, args|
301
301
 
302
302
  downcased_line=line.downcase
303
303
  druid=downcased_line.scan(/[a-z][a-z][0-9][0-9][0-9][a-z][a-z][0-9][0-9][0-9][0-9]/).first
304
-
304
+
305
305
  unless druid.blank?
306
306
  counter+=1
307
-
307
+
308
308
  begin
309
309
  with_retries(:max_tries => 5, :base_sleep_seconds => 3, :max_sleep_seconds => 60) do
310
310
  indexer.delete druid
@@ -315,11 +315,11 @@ task :delete_druids => :environment do |t, args|
315
315
  log my_logger,"ERROR: Failed to delete #{druid}: #{e.message}",:error
316
316
  errors += 1
317
317
  end
318
- end
318
+ end
319
319
  end
320
-
320
+
321
321
  log my_logger,"Objects deleted: #{indexed}"
322
322
  log(my_logger,"ERRORS Encountered, #{errors} objects not deleted",:error) if errors > 0
323
323
  log my_logger,"Completed at #{Time.now}, total time was #{'%.2f' % ((Time.now - start_time)/60.0)} minutes"
324
-
325
- end
324
+
325
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: base_indexer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ahmed Alsum
@@ -17,20 +17,14 @@ dependencies:
17
17
  requirements:
18
18
  - - "~>"
19
19
  - !ruby/object:Gem::Version
20
- version: '4.1'
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 4.1.9
20
+ version: '4'
24
21
  type: :runtime
25
22
  prerelease: false
26
23
  version_requirements: !ruby/object:Gem::Requirement
27
24
  requirements:
28
25
  - - "~>"
29
26
  - !ruby/object:Gem::Version
30
- version: '4.1'
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: 4.1.9
27
+ version: '4'
34
28
  - !ruby/object:Gem::Dependency
35
29
  name: discovery-indexer
36
30
  requirement: !ruby/object:Gem::Requirement