base_indexer 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/base_indexer/version.rb +1 -1
- data/lib/tasks/index.rake +77 -77
- metadata +3 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 835d4deacab7029147d2dd106b1cf6518ddae2a8
|
4
|
+
data.tar.gz: 8f44888fb252805cc0695006c6833c41805e582b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9599ad3b0ccff641dd9d09d59cf4a3d676376b197ef48439318699a30afc32684c120e17d1a5f7ae2ec4a24d93765a0086c5034bf86a4c6e78bb9b2a16d7bd60
|
7
|
+
data.tar.gz: 9bec426885112b59c5010ede1ab6f7c53b3ae29f1f52c20fc8d7a4eb828cd1f1acf836b0126fb8baa9b6b32ccd073e7926324c3c001322072689617e87eaced1
|
data/lib/base_indexer/version.rb
CHANGED
data/lib/tasks/index.rake
CHANGED
@@ -11,13 +11,13 @@ def log(logger,message,log_type=:info)
|
|
11
11
|
end
|
12
12
|
puts message
|
13
13
|
$stdout.flush
|
14
|
-
|
14
|
+
|
15
15
|
end
|
16
16
|
|
17
17
|
desc 'Index a specific list of druids from a pre-assembly log YAML file, a remediate log file, or a simple CSV. Specify target to index into and log file to index from.'
|
18
|
-
#Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.yaml log_type=preassembly
|
19
|
-
#Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1_remediation.yaml log_type=remediate
|
20
|
-
#Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.csv log_type=csv # csv must contain a heading called "druid" with the druid to index
|
18
|
+
#Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.yaml log_type=preassembly
|
19
|
+
#Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1_remediation.yaml log_type=remediate
|
20
|
+
#Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.csv log_type=csv # csv must contain a heading called "druid" with the druid to index
|
21
21
|
|
22
22
|
# Examples:
|
23
23
|
task :log_indexer => :environment do |t, args|
|
@@ -25,55 +25,55 @@ task :log_indexer => :environment do |t, args|
|
|
25
25
|
target = ENV['target'] # must pass in the target so specify solr core to index into
|
26
26
|
log_file_path = ENV['log_file'] # must specify pre-assembly log file to index from
|
27
27
|
log_type = ENV['log_type'] || 'preassembly' # log type (either preassembly, csv, or remediate), defaults to preassembly
|
28
|
-
|
28
|
+
|
29
29
|
raise 'You must specify a target and log file.' if target.blank? || log_file_path.blank?
|
30
30
|
raise 'Log type must be preassembly, remediate or csv.' unless ['preassembly','remediate','csv'].include? log_type
|
31
31
|
raise 'Log file not found.' unless File.readable? log_file_path
|
32
|
-
|
32
|
+
|
33
33
|
target_config=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]
|
34
|
-
|
34
|
+
|
35
35
|
raise 'Target not found.' if target_config.nil?
|
36
|
-
|
36
|
+
|
37
37
|
if log_type.blank? || log_type == 'preassembly'
|
38
38
|
log_completed=:pre_assem_finished
|
39
39
|
elsif log_type == 'remediate'
|
40
40
|
log_completed=:remediate_completed
|
41
41
|
end
|
42
|
-
|
42
|
+
|
43
43
|
output_log_file_name="#{Rails.root}/log/#{File.basename(log_file_path,File.extname(log_file_path))}_indexer_#{Time.now.strftime('%Y%m%d-%H%M%S')}.log"
|
44
44
|
my_logger=Logger.new(output_log_file_name) # set up a new log file
|
45
|
-
|
45
|
+
|
46
46
|
start_time=Time.now
|
47
|
-
|
47
|
+
|
48
48
|
errors=0
|
49
49
|
indexed=0
|
50
50
|
|
51
51
|
druids=[]
|
52
|
-
|
52
|
+
|
53
53
|
if ['preassembly','remediate'].include? log_type
|
54
|
-
YAML.load_stream(IO.read(log_file_path)) { |obj| druids << obj[:pid] if obj[log_completed] == true}
|
54
|
+
YAML.load_stream(IO.read(log_file_path)) { |obj| druids << obj[:pid] if obj[log_completed] == true}
|
55
55
|
else
|
56
56
|
csv = CSV.parse(IO.read(log_file_path), :headers => true)
|
57
57
|
druids=csv.map { |row| row.to_hash.with_indifferent_access['druid'] }.delete_if {|druid| druid.nil?}
|
58
58
|
end
|
59
|
-
|
59
|
+
|
60
60
|
solr_server=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]['url']
|
61
|
-
|
61
|
+
|
62
62
|
log my_logger,"** Indexing #{druids.size} druids from #{log_file_path} into solr server #{solr_server} (target=#{target}). Log file is of type #{log_type}."
|
63
63
|
log my_logger,"Indexing started at #{start_time}"
|
64
64
|
|
65
65
|
indexer = BaseIndexer.indexer_class.constantize.new
|
66
66
|
|
67
67
|
counter=0
|
68
|
-
|
68
|
+
|
69
69
|
druids.each do |druid|
|
70
|
-
|
70
|
+
|
71
71
|
druid.gsub!('druid:','')
|
72
72
|
counter+=1
|
73
|
-
|
73
|
+
|
74
74
|
begin
|
75
75
|
with_retries(:max_tries => 5, :base_sleep_seconds => 3, :max_sleep_seconds => 60) do
|
76
|
-
indexer.index(druid,
|
76
|
+
indexer.index(druid,{target=>true})
|
77
77
|
log my_logger,"#{counter} of #{druids.size}: #{druid}"
|
78
78
|
indexed += 1
|
79
79
|
end
|
@@ -83,34 +83,34 @@ task :log_indexer => :environment do |t, args|
|
|
83
83
|
end
|
84
84
|
|
85
85
|
end
|
86
|
-
|
86
|
+
|
87
87
|
log my_logger,"Objects indexed: #{indexed} out of #{druids.size}"
|
88
88
|
log(my_logger,"ERRORS Encountered, #{errors} objects not indexed") if errors > 0
|
89
89
|
log my_logger,"Completed at #{Time.now}, total time was #{'%.2f' % ((Time.now - start_time)/60.0)} minutes"
|
90
90
|
puts "Logged output at #{output_log_file_name}"
|
91
|
-
|
91
|
+
|
92
92
|
end
|
93
|
-
|
93
|
+
|
94
94
|
desc "Delete a single druid. It will be deleted from all targets!"
|
95
95
|
#Run me: rake delete RAILS_ENV=production druid=oo000oo0001
|
96
96
|
# Examples:
|
97
97
|
task :delete => :environment do |t, args|
|
98
98
|
|
99
|
-
druid = ENV['druid']
|
100
|
-
|
99
|
+
druid = ENV['druid']
|
100
|
+
|
101
101
|
raise 'You must specify a druid.' if druid.blank?
|
102
102
|
|
103
103
|
print "Are you sure you wish to delete this druid from all targets? (y/n) "
|
104
|
-
STDOUT.flush
|
104
|
+
STDOUT.flush
|
105
105
|
answer=STDIN.gets.chomp
|
106
|
-
|
106
|
+
|
107
107
|
raise 'STOP!' unless (answer && ['y','yes'].include?(answer.downcase))
|
108
|
-
|
108
|
+
|
109
109
|
puts "** Delete #{druid} druid from all targets."
|
110
110
|
|
111
111
|
indexer = BaseIndexer.indexer_class.constantize.new
|
112
112
|
indexer.delete druid.gsub('druid:','')
|
113
|
-
|
113
|
+
|
114
114
|
end
|
115
115
|
|
116
116
|
desc 'Index a single druid. Specify target to index into and druid to index.'
|
@@ -119,21 +119,21 @@ desc 'Index a single druid. Specify target to index into and druid to index.'
|
|
119
119
|
task :index => :environment do |t, args|
|
120
120
|
|
121
121
|
target = ENV['target'] # must pass in the target so specify solr core to index into
|
122
|
-
druid = ENV['druid']
|
123
|
-
|
122
|
+
druid = ENV['druid']
|
123
|
+
|
124
124
|
raise 'You must specify a target and druid.' if target.blank? || druid.blank?
|
125
|
-
|
125
|
+
|
126
126
|
target_config=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]
|
127
|
-
|
127
|
+
|
128
128
|
raise 'Target not found.' if target_config.nil?
|
129
129
|
|
130
130
|
solr_server=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]['url']
|
131
|
-
|
131
|
+
|
132
132
|
puts "** Indexing #{druid} druid into solr server #{solr_server} (target=#{target})."
|
133
133
|
|
134
134
|
indexer = BaseIndexer.indexer_class.constantize.new
|
135
|
-
indexer.index(druid.gsub('druid:',''),
|
136
|
-
|
135
|
+
indexer.index(druid.gsub('druid:',''),{target=>true})
|
136
|
+
|
137
137
|
end
|
138
138
|
|
139
139
|
desc 'Index an entire collection, including the collection itself and all of its members. Specify target to index into and collection druid to index.'
|
@@ -142,21 +142,21 @@ desc 'Index an entire collection, including the collection itself and all of its
|
|
142
142
|
task :collection_indexer => :environment do |t, args|
|
143
143
|
|
144
144
|
target = ENV['target'] # must pass in the target so specify solr core to index into
|
145
|
-
collection_druid = ENV['collection_druid']
|
146
|
-
|
145
|
+
collection_druid = ENV['collection_druid']
|
146
|
+
|
147
147
|
raise 'You must specify a target and collection druid.' if target.blank? || collection_druid.blank?
|
148
|
-
|
148
|
+
|
149
149
|
target_config=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]
|
150
|
-
|
150
|
+
|
151
151
|
raise 'Target not found.' if target_config.nil?
|
152
152
|
|
153
153
|
solr_server=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]['url']
|
154
154
|
|
155
155
|
output_log_file_name="#{Rails.root}/log/collection_#{collection_druid}_indexer_#{Time.now.strftime('%Y%m%d-%H%M%S')}.log"
|
156
156
|
my_logger=Logger.new(output_log_file_name) # set up a new log file
|
157
|
-
|
157
|
+
|
158
158
|
log my_logger,"** Indexing collection #{collection_druid} druid and all of its members into solr server #{solr_server} (target=#{target})."
|
159
|
-
|
159
|
+
|
160
160
|
start_time=Time.now
|
161
161
|
log my_logger,"Indexing started at #{start_time}"
|
162
162
|
|
@@ -165,10 +165,10 @@ task :collection_indexer => :environment do |t, args|
|
|
165
165
|
df = DorFetcher::Client.new({:service_url => Rails.application.config.dor_fetcher_url})
|
166
166
|
|
167
167
|
collection_druid=collection_druid.gsub('druid:','')
|
168
|
-
|
169
|
-
indexer.index(collection_druid,
|
168
|
+
|
169
|
+
indexer.index(collection_druid,{target=>true})
|
170
170
|
log my_logger,"Indexed collection: #{collection_druid}"
|
171
|
-
|
171
|
+
|
172
172
|
druids = df.druid_array(df.get_collection(collection_druid, {}))
|
173
173
|
|
174
174
|
log my_logger,"** Found #{druids.size} members of the collection"
|
@@ -176,15 +176,15 @@ task :collection_indexer => :environment do |t, args|
|
|
176
176
|
counter=0
|
177
177
|
indexed=0
|
178
178
|
errors=0
|
179
|
-
|
179
|
+
|
180
180
|
druids.each do |druid|
|
181
|
-
|
181
|
+
|
182
182
|
druid=druid.gsub('druid:','')
|
183
183
|
counter+=1
|
184
|
-
|
184
|
+
|
185
185
|
begin
|
186
186
|
with_retries(:max_tries => 5, :base_sleep_seconds => 3, :max_sleep_seconds => 60) do
|
187
|
-
indexer.index(druid,
|
187
|
+
indexer.index(druid,{target=>true})
|
188
188
|
log my_logger,"#{counter} of #{druids.size}: #{druid}"
|
189
189
|
indexed += 1
|
190
190
|
end
|
@@ -194,13 +194,13 @@ task :collection_indexer => :environment do |t, args|
|
|
194
194
|
end
|
195
195
|
|
196
196
|
end
|
197
|
-
|
197
|
+
|
198
198
|
log my_logger,"Objects indexed: #{indexed} out of #{druids.size} + 1 collection druid"
|
199
199
|
log(my_logger,"ERRORS Encountered, #{errors} objects not indexed") if errors > 0
|
200
200
|
log my_logger,"Completed at #{Time.now}, total time was #{'%.2f' % ((Time.now - start_time)/60.0)} minutes"
|
201
201
|
puts "Logged output at #{output_log_file_name}"
|
202
|
-
|
203
|
-
end
|
202
|
+
|
203
|
+
end
|
204
204
|
|
205
205
|
desc 'ReIndex just the druids that errored out from a previous batch index run. Specify target to index into and batch errored log file to index from.'
|
206
206
|
#Run me: rake reindexer RAILS_ENV=production target=revs_prod file=./log/index.log
|
@@ -209,16 +209,16 @@ task :reindexer => :environment do |t, args|
|
|
209
209
|
|
210
210
|
target = ENV['target'] # must pass in the target so specify solr core to index into
|
211
211
|
file_path = ENV['file'] # must specify previous indexing log file to index from
|
212
|
-
|
212
|
+
|
213
213
|
raise 'You must specify a target and file.' if target.blank? || file_path.blank?
|
214
214
|
raise 'File not found.' unless File.readable? file_path
|
215
|
-
|
215
|
+
|
216
216
|
target_config=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]
|
217
|
-
|
217
|
+
|
218
218
|
raise 'Target not found.' if target_config.nil?
|
219
219
|
|
220
220
|
start_time=Time.now
|
221
|
-
|
221
|
+
|
222
222
|
errors=0
|
223
223
|
indexed=0
|
224
224
|
|
@@ -226,7 +226,7 @@ task :reindexer => :environment do |t, args|
|
|
226
226
|
|
227
227
|
output_log_file_name="#{Rails.root}/log/#{File.basename(file_path,File.extname(file_path))}_reindex_#{Time.now.strftime('%Y%m%d-%H%M%S')}.log"
|
228
228
|
my_logger=Logger.new(output_log_file_name) # set up a new log file
|
229
|
-
|
229
|
+
|
230
230
|
log my_logger,"** Indexing errored out druids from #{file_path} into solr server #{solr_server} (target=#{target})."
|
231
231
|
log my_logger,"Indexing started at #{start_time}"
|
232
232
|
|
@@ -237,15 +237,15 @@ task :reindexer => :environment do |t, args|
|
|
237
237
|
IO.readlines(file_path).each do |line|
|
238
238
|
|
239
239
|
downcased_line=line.downcase
|
240
|
-
|
240
|
+
|
241
241
|
if downcased_line.include? 'error'
|
242
242
|
druid=downcased_line.scan(/[a-z][a-z][0-9][0-9][0-9][a-z][a-z][0-9][0-9][0-9][0-9]/).first
|
243
|
-
|
244
|
-
unless druid.blank?
|
243
|
+
|
244
|
+
unless druid.blank?
|
245
245
|
begin
|
246
246
|
counter+=1
|
247
247
|
with_retries(:max_tries => 5, :base_sleep_seconds => 3, :max_sleep_seconds => 60) do
|
248
|
-
indexer.index(druid,
|
248
|
+
indexer.index(druid,{target=>true})
|
249
249
|
log my_logger,"#{counter}: #{druid}"
|
250
250
|
indexed += 1
|
251
251
|
end
|
@@ -254,16 +254,16 @@ task :reindexer => :environment do |t, args|
|
|
254
254
|
errors += 1
|
255
255
|
end
|
256
256
|
end
|
257
|
-
|
257
|
+
|
258
258
|
end
|
259
|
-
|
259
|
+
|
260
260
|
end
|
261
|
-
|
261
|
+
|
262
262
|
log my_logger,"Objects indexed: #{indexed}"
|
263
263
|
log(my_logger,"ERRORS Encountered, #{errors} objects not indexed") if errors > 0
|
264
264
|
log my_logger,"Completed at #{Time.now}, total time was #{'%.2f' % ((Time.now - start_time)/60.0)} minutes"
|
265
265
|
puts "Logged output at #{output_log_file_name}"
|
266
|
-
|
266
|
+
|
267
267
|
end
|
268
268
|
|
269
269
|
desc 'Delete the druids specified in the supplied text file (one druid per line, header not necessary). Be careful! It will delete from all targets.'
|
@@ -272,24 +272,24 @@ desc 'Delete the druids specified in the supplied text file (one druid per line,
|
|
272
272
|
task :delete_druids => :environment do |t, args|
|
273
273
|
|
274
274
|
file_path = ENV['file'] # must specify previous indexing log file to index from
|
275
|
-
|
275
|
+
|
276
276
|
raise 'You must specify a druid file.' if file_path.blank?
|
277
277
|
raise 'File not found.' unless File.readable? file_path
|
278
278
|
|
279
279
|
print "Are you sure you wish to delete all of the druids from all targets specified in #{file_path}? (y/n) "
|
280
|
-
STDOUT.flush
|
280
|
+
STDOUT.flush
|
281
281
|
answer=STDIN.gets.chomp
|
282
|
-
|
282
|
+
|
283
283
|
raise 'STOP!' unless (answer && ['y','yes'].include?(answer.downcase))
|
284
|
-
|
284
|
+
|
285
285
|
output_log_file_name="#{Rails.root}/log/#{File.basename(file_path,File.extname(file_path))}_delete_#{Time.now.strftime('%Y%m%d-%H%M%S')}.log"
|
286
286
|
my_logger=Logger.new(output_log_file_name) # set up a new log file
|
287
|
-
|
287
|
+
|
288
288
|
start_time=Time.now
|
289
|
-
|
289
|
+
|
290
290
|
errors=0
|
291
291
|
indexed=0
|
292
|
-
|
292
|
+
|
293
293
|
log my_logger,"** Deleting druids from #{file_path} in all targets."
|
294
294
|
log my_logger,"Deleting started at #{start_time}"
|
295
295
|
|
@@ -301,10 +301,10 @@ task :delete_druids => :environment do |t, args|
|
|
301
301
|
|
302
302
|
downcased_line=line.downcase
|
303
303
|
druid=downcased_line.scan(/[a-z][a-z][0-9][0-9][0-9][a-z][a-z][0-9][0-9][0-9][0-9]/).first
|
304
|
-
|
304
|
+
|
305
305
|
unless druid.blank?
|
306
306
|
counter+=1
|
307
|
-
|
307
|
+
|
308
308
|
begin
|
309
309
|
with_retries(:max_tries => 5, :base_sleep_seconds => 3, :max_sleep_seconds => 60) do
|
310
310
|
indexer.delete druid
|
@@ -315,11 +315,11 @@ task :delete_druids => :environment do |t, args|
|
|
315
315
|
log my_logger,"ERROR: Failed to delete #{druid}: #{e.message}",:error
|
316
316
|
errors += 1
|
317
317
|
end
|
318
|
-
end
|
318
|
+
end
|
319
319
|
end
|
320
|
-
|
320
|
+
|
321
321
|
log my_logger,"Objects deleted: #{indexed}"
|
322
322
|
log(my_logger,"ERRORS Encountered, #{errors} objects not deleted",:error) if errors > 0
|
323
323
|
log my_logger,"Completed at #{Time.now}, total time was #{'%.2f' % ((Time.now - start_time)/60.0)} minutes"
|
324
|
-
|
325
|
-
end
|
324
|
+
|
325
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: base_indexer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ahmed Alsum
|
@@ -17,20 +17,14 @@ dependencies:
|
|
17
17
|
requirements:
|
18
18
|
- - "~>"
|
19
19
|
- !ruby/object:Gem::Version
|
20
|
-
version: '4
|
21
|
-
- - ">="
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 4.1.9
|
20
|
+
version: '4'
|
24
21
|
type: :runtime
|
25
22
|
prerelease: false
|
26
23
|
version_requirements: !ruby/object:Gem::Requirement
|
27
24
|
requirements:
|
28
25
|
- - "~>"
|
29
26
|
- !ruby/object:Gem::Version
|
30
|
-
version: '4
|
31
|
-
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: 4.1.9
|
27
|
+
version: '4'
|
34
28
|
- !ruby/object:Gem::Dependency
|
35
29
|
name: discovery-indexer
|
36
30
|
requirement: !ruby/object:Gem::Requirement
|