base_indexer 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/base_indexer/version.rb +1 -1
- data/lib/tasks/index.rake +77 -77
- metadata +3 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 835d4deacab7029147d2dd106b1cf6518ddae2a8
|
4
|
+
data.tar.gz: 8f44888fb252805cc0695006c6833c41805e582b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9599ad3b0ccff641dd9d09d59cf4a3d676376b197ef48439318699a30afc32684c120e17d1a5f7ae2ec4a24d93765a0086c5034bf86a4c6e78bb9b2a16d7bd60
|
7
|
+
data.tar.gz: 9bec426885112b59c5010ede1ab6f7c53b3ae29f1f52c20fc8d7a4eb828cd1f1acf836b0126fb8baa9b6b32ccd073e7926324c3c001322072689617e87eaced1
|
data/lib/base_indexer/version.rb
CHANGED
data/lib/tasks/index.rake
CHANGED
@@ -11,13 +11,13 @@ def log(logger,message,log_type=:info)
|
|
11
11
|
end
|
12
12
|
puts message
|
13
13
|
$stdout.flush
|
14
|
-
|
14
|
+
|
15
15
|
end
|
16
16
|
|
17
17
|
desc 'Index a specific list of druids from a pre-assembly log YAML file, a remediate log file, or a simple CSV. Specify target to index into and log file to index from.'
|
18
|
-
#Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.yaml log_type=preassembly
|
19
|
-
#Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1_remediation.yaml log_type=remediate
|
20
|
-
#Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.csv log_type=csv # csv must contain a heading called "druid" with the druid to index
|
18
|
+
#Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.yaml log_type=preassembly
|
19
|
+
#Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1_remediation.yaml log_type=remediate
|
20
|
+
#Run me: rake log_indexer RAILS_ENV=production target=revs_prod log_file=/tmp/mailander_1.csv log_type=csv # csv must contain a heading called "druid" with the druid to index
|
21
21
|
|
22
22
|
# Examples:
|
23
23
|
task :log_indexer => :environment do |t, args|
|
@@ -25,55 +25,55 @@ task :log_indexer => :environment do |t, args|
|
|
25
25
|
target = ENV['target'] # must pass in the target so specify solr core to index into
|
26
26
|
log_file_path = ENV['log_file'] # must specify pre-assembly log file to index from
|
27
27
|
log_type = ENV['log_type'] || 'preassembly' # log type (either preassembly, csv, or remediate), defaults to preassembly
|
28
|
-
|
28
|
+
|
29
29
|
raise 'You must specify a target and log file.' if target.blank? || log_file_path.blank?
|
30
30
|
raise 'Log type must be preassembly, remediate or csv.' unless ['preassembly','remediate','csv'].include? log_type
|
31
31
|
raise 'Log file not found.' unless File.readable? log_file_path
|
32
|
-
|
32
|
+
|
33
33
|
target_config=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]
|
34
|
-
|
34
|
+
|
35
35
|
raise 'Target not found.' if target_config.nil?
|
36
|
-
|
36
|
+
|
37
37
|
if log_type.blank? || log_type == 'preassembly'
|
38
38
|
log_completed=:pre_assem_finished
|
39
39
|
elsif log_type == 'remediate'
|
40
40
|
log_completed=:remediate_completed
|
41
41
|
end
|
42
|
-
|
42
|
+
|
43
43
|
output_log_file_name="#{Rails.root}/log/#{File.basename(log_file_path,File.extname(log_file_path))}_indexer_#{Time.now.strftime('%Y%m%d-%H%M%S')}.log"
|
44
44
|
my_logger=Logger.new(output_log_file_name) # set up a new log file
|
45
|
-
|
45
|
+
|
46
46
|
start_time=Time.now
|
47
|
-
|
47
|
+
|
48
48
|
errors=0
|
49
49
|
indexed=0
|
50
50
|
|
51
51
|
druids=[]
|
52
|
-
|
52
|
+
|
53
53
|
if ['preassembly','remediate'].include? log_type
|
54
|
-
YAML.load_stream(IO.read(log_file_path)) { |obj| druids << obj[:pid] if obj[log_completed] == true}
|
54
|
+
YAML.load_stream(IO.read(log_file_path)) { |obj| druids << obj[:pid] if obj[log_completed] == true}
|
55
55
|
else
|
56
56
|
csv = CSV.parse(IO.read(log_file_path), :headers => true)
|
57
57
|
druids=csv.map { |row| row.to_hash.with_indifferent_access['druid'] }.delete_if {|druid| druid.nil?}
|
58
58
|
end
|
59
|
-
|
59
|
+
|
60
60
|
solr_server=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]['url']
|
61
|
-
|
61
|
+
|
62
62
|
log my_logger,"** Indexing #{druids.size} druids from #{log_file_path} into solr server #{solr_server} (target=#{target}). Log file is of type #{log_type}."
|
63
63
|
log my_logger,"Indexing started at #{start_time}"
|
64
64
|
|
65
65
|
indexer = BaseIndexer.indexer_class.constantize.new
|
66
66
|
|
67
67
|
counter=0
|
68
|
-
|
68
|
+
|
69
69
|
druids.each do |druid|
|
70
|
-
|
70
|
+
|
71
71
|
druid.gsub!('druid:','')
|
72
72
|
counter+=1
|
73
|
-
|
73
|
+
|
74
74
|
begin
|
75
75
|
with_retries(:max_tries => 5, :base_sleep_seconds => 3, :max_sleep_seconds => 60) do
|
76
|
-
indexer.index(druid,
|
76
|
+
indexer.index(druid,{target=>true})
|
77
77
|
log my_logger,"#{counter} of #{druids.size}: #{druid}"
|
78
78
|
indexed += 1
|
79
79
|
end
|
@@ -83,34 +83,34 @@ task :log_indexer => :environment do |t, args|
|
|
83
83
|
end
|
84
84
|
|
85
85
|
end
|
86
|
-
|
86
|
+
|
87
87
|
log my_logger,"Objects indexed: #{indexed} out of #{druids.size}"
|
88
88
|
log(my_logger,"ERRORS Encountered, #{errors} objects not indexed") if errors > 0
|
89
89
|
log my_logger,"Completed at #{Time.now}, total time was #{'%.2f' % ((Time.now - start_time)/60.0)} minutes"
|
90
90
|
puts "Logged output at #{output_log_file_name}"
|
91
|
-
|
91
|
+
|
92
92
|
end
|
93
|
-
|
93
|
+
|
94
94
|
desc "Delete a single druid. It will be deleted from all targets!"
|
95
95
|
#Run me: rake delete RAILS_ENV=production druid=oo000oo0001
|
96
96
|
# Examples:
|
97
97
|
task :delete => :environment do |t, args|
|
98
98
|
|
99
|
-
druid = ENV['druid']
|
100
|
-
|
99
|
+
druid = ENV['druid']
|
100
|
+
|
101
101
|
raise 'You must specify a druid.' if druid.blank?
|
102
102
|
|
103
103
|
print "Are you sure you wish to delete this druid from all targets? (y/n) "
|
104
|
-
STDOUT.flush
|
104
|
+
STDOUT.flush
|
105
105
|
answer=STDIN.gets.chomp
|
106
|
-
|
106
|
+
|
107
107
|
raise 'STOP!' unless (answer && ['y','yes'].include?(answer.downcase))
|
108
|
-
|
108
|
+
|
109
109
|
puts "** Delete #{druid} druid from all targets."
|
110
110
|
|
111
111
|
indexer = BaseIndexer.indexer_class.constantize.new
|
112
112
|
indexer.delete druid.gsub('druid:','')
|
113
|
-
|
113
|
+
|
114
114
|
end
|
115
115
|
|
116
116
|
desc 'Index a single druid. Specify target to index into and druid to index.'
|
@@ -119,21 +119,21 @@ desc 'Index a single druid. Specify target to index into and druid to index.'
|
|
119
119
|
task :index => :environment do |t, args|
|
120
120
|
|
121
121
|
target = ENV['target'] # must pass in the target so specify solr core to index into
|
122
|
-
druid = ENV['druid']
|
123
|
-
|
122
|
+
druid = ENV['druid']
|
123
|
+
|
124
124
|
raise 'You must specify a target and druid.' if target.blank? || druid.blank?
|
125
|
-
|
125
|
+
|
126
126
|
target_config=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]
|
127
|
-
|
127
|
+
|
128
128
|
raise 'Target not found.' if target_config.nil?
|
129
129
|
|
130
130
|
solr_server=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]['url']
|
131
|
-
|
131
|
+
|
132
132
|
puts "** Indexing #{druid} druid into solr server #{solr_server} (target=#{target})."
|
133
133
|
|
134
134
|
indexer = BaseIndexer.indexer_class.constantize.new
|
135
|
-
indexer.index(druid.gsub('druid:',''),
|
136
|
-
|
135
|
+
indexer.index(druid.gsub('druid:',''),{target=>true})
|
136
|
+
|
137
137
|
end
|
138
138
|
|
139
139
|
desc 'Index an entire collection, including the collection itself and all of its members. Specify target to index into and collection druid to index.'
|
@@ -142,21 +142,21 @@ desc 'Index an entire collection, including the collection itself and all of its
|
|
142
142
|
task :collection_indexer => :environment do |t, args|
|
143
143
|
|
144
144
|
target = ENV['target'] # must pass in the target so specify solr core to index into
|
145
|
-
collection_druid = ENV['collection_druid']
|
146
|
-
|
145
|
+
collection_druid = ENV['collection_druid']
|
146
|
+
|
147
147
|
raise 'You must specify a target and collection druid.' if target.blank? || collection_druid.blank?
|
148
|
-
|
148
|
+
|
149
149
|
target_config=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]
|
150
|
-
|
150
|
+
|
151
151
|
raise 'Target not found.' if target_config.nil?
|
152
152
|
|
153
153
|
solr_server=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]['url']
|
154
154
|
|
155
155
|
output_log_file_name="#{Rails.root}/log/collection_#{collection_druid}_indexer_#{Time.now.strftime('%Y%m%d-%H%M%S')}.log"
|
156
156
|
my_logger=Logger.new(output_log_file_name) # set up a new log file
|
157
|
-
|
157
|
+
|
158
158
|
log my_logger,"** Indexing collection #{collection_druid} druid and all of its members into solr server #{solr_server} (target=#{target})."
|
159
|
-
|
159
|
+
|
160
160
|
start_time=Time.now
|
161
161
|
log my_logger,"Indexing started at #{start_time}"
|
162
162
|
|
@@ -165,10 +165,10 @@ task :collection_indexer => :environment do |t, args|
|
|
165
165
|
df = DorFetcher::Client.new({:service_url => Rails.application.config.dor_fetcher_url})
|
166
166
|
|
167
167
|
collection_druid=collection_druid.gsub('druid:','')
|
168
|
-
|
169
|
-
indexer.index(collection_druid,
|
168
|
+
|
169
|
+
indexer.index(collection_druid,{target=>true})
|
170
170
|
log my_logger,"Indexed collection: #{collection_druid}"
|
171
|
-
|
171
|
+
|
172
172
|
druids = df.druid_array(df.get_collection(collection_druid, {}))
|
173
173
|
|
174
174
|
log my_logger,"** Found #{druids.size} members of the collection"
|
@@ -176,15 +176,15 @@ task :collection_indexer => :environment do |t, args|
|
|
176
176
|
counter=0
|
177
177
|
indexed=0
|
178
178
|
errors=0
|
179
|
-
|
179
|
+
|
180
180
|
druids.each do |druid|
|
181
|
-
|
181
|
+
|
182
182
|
druid=druid.gsub('druid:','')
|
183
183
|
counter+=1
|
184
|
-
|
184
|
+
|
185
185
|
begin
|
186
186
|
with_retries(:max_tries => 5, :base_sleep_seconds => 3, :max_sleep_seconds => 60) do
|
187
|
-
indexer.index(druid,
|
187
|
+
indexer.index(druid,{target=>true})
|
188
188
|
log my_logger,"#{counter} of #{druids.size}: #{druid}"
|
189
189
|
indexed += 1
|
190
190
|
end
|
@@ -194,13 +194,13 @@ task :collection_indexer => :environment do |t, args|
|
|
194
194
|
end
|
195
195
|
|
196
196
|
end
|
197
|
-
|
197
|
+
|
198
198
|
log my_logger,"Objects indexed: #{indexed} out of #{druids.size} + 1 collection druid"
|
199
199
|
log(my_logger,"ERRORS Encountered, #{errors} objects not indexed") if errors > 0
|
200
200
|
log my_logger,"Completed at #{Time.now}, total time was #{'%.2f' % ((Time.now - start_time)/60.0)} minutes"
|
201
201
|
puts "Logged output at #{output_log_file_name}"
|
202
|
-
|
203
|
-
end
|
202
|
+
|
203
|
+
end
|
204
204
|
|
205
205
|
desc 'ReIndex just the druids that errored out from a previous batch index run. Specify target to index into and batch errored log file to index from.'
|
206
206
|
#Run me: rake reindexer RAILS_ENV=production target=revs_prod file=./log/index.log
|
@@ -209,16 +209,16 @@ task :reindexer => :environment do |t, args|
|
|
209
209
|
|
210
210
|
target = ENV['target'] # must pass in the target so specify solr core to index into
|
211
211
|
file_path = ENV['file'] # must specify previous indexing log file to index from
|
212
|
-
|
212
|
+
|
213
213
|
raise 'You must specify a target and file.' if target.blank? || file_path.blank?
|
214
214
|
raise 'File not found.' unless File.readable? file_path
|
215
|
-
|
215
|
+
|
216
216
|
target_config=BaseIndexer.solr_configuration_class_name.constantize.instance.get_configuration_hash[target]
|
217
|
-
|
217
|
+
|
218
218
|
raise 'Target not found.' if target_config.nil?
|
219
219
|
|
220
220
|
start_time=Time.now
|
221
|
-
|
221
|
+
|
222
222
|
errors=0
|
223
223
|
indexed=0
|
224
224
|
|
@@ -226,7 +226,7 @@ task :reindexer => :environment do |t, args|
|
|
226
226
|
|
227
227
|
output_log_file_name="#{Rails.root}/log/#{File.basename(file_path,File.extname(file_path))}_reindex_#{Time.now.strftime('%Y%m%d-%H%M%S')}.log"
|
228
228
|
my_logger=Logger.new(output_log_file_name) # set up a new log file
|
229
|
-
|
229
|
+
|
230
230
|
log my_logger,"** Indexing errored out druids from #{file_path} into solr server #{solr_server} (target=#{target})."
|
231
231
|
log my_logger,"Indexing started at #{start_time}"
|
232
232
|
|
@@ -237,15 +237,15 @@ task :reindexer => :environment do |t, args|
|
|
237
237
|
IO.readlines(file_path).each do |line|
|
238
238
|
|
239
239
|
downcased_line=line.downcase
|
240
|
-
|
240
|
+
|
241
241
|
if downcased_line.include? 'error'
|
242
242
|
druid=downcased_line.scan(/[a-z][a-z][0-9][0-9][0-9][a-z][a-z][0-9][0-9][0-9][0-9]/).first
|
243
|
-
|
244
|
-
unless druid.blank?
|
243
|
+
|
244
|
+
unless druid.blank?
|
245
245
|
begin
|
246
246
|
counter+=1
|
247
247
|
with_retries(:max_tries => 5, :base_sleep_seconds => 3, :max_sleep_seconds => 60) do
|
248
|
-
indexer.index(druid,
|
248
|
+
indexer.index(druid,{target=>true})
|
249
249
|
log my_logger,"#{counter}: #{druid}"
|
250
250
|
indexed += 1
|
251
251
|
end
|
@@ -254,16 +254,16 @@ task :reindexer => :environment do |t, args|
|
|
254
254
|
errors += 1
|
255
255
|
end
|
256
256
|
end
|
257
|
-
|
257
|
+
|
258
258
|
end
|
259
|
-
|
259
|
+
|
260
260
|
end
|
261
|
-
|
261
|
+
|
262
262
|
log my_logger,"Objects indexed: #{indexed}"
|
263
263
|
log(my_logger,"ERRORS Encountered, #{errors} objects not indexed") if errors > 0
|
264
264
|
log my_logger,"Completed at #{Time.now}, total time was #{'%.2f' % ((Time.now - start_time)/60.0)} minutes"
|
265
265
|
puts "Logged output at #{output_log_file_name}"
|
266
|
-
|
266
|
+
|
267
267
|
end
|
268
268
|
|
269
269
|
desc 'Delete the druids specified in the supplied text file (one druid per line, header not necessary). Be careful! It will delete from all targets.'
|
@@ -272,24 +272,24 @@ desc 'Delete the druids specified in the supplied text file (one druid per line,
|
|
272
272
|
task :delete_druids => :environment do |t, args|
|
273
273
|
|
274
274
|
file_path = ENV['file'] # must specify previous indexing log file to index from
|
275
|
-
|
275
|
+
|
276
276
|
raise 'You must specify a druid file.' if file_path.blank?
|
277
277
|
raise 'File not found.' unless File.readable? file_path
|
278
278
|
|
279
279
|
print "Are you sure you wish to delete all of the druids from all targets specified in #{file_path}? (y/n) "
|
280
|
-
STDOUT.flush
|
280
|
+
STDOUT.flush
|
281
281
|
answer=STDIN.gets.chomp
|
282
|
-
|
282
|
+
|
283
283
|
raise 'STOP!' unless (answer && ['y','yes'].include?(answer.downcase))
|
284
|
-
|
284
|
+
|
285
285
|
output_log_file_name="#{Rails.root}/log/#{File.basename(file_path,File.extname(file_path))}_delete_#{Time.now.strftime('%Y%m%d-%H%M%S')}.log"
|
286
286
|
my_logger=Logger.new(output_log_file_name) # set up a new log file
|
287
|
-
|
287
|
+
|
288
288
|
start_time=Time.now
|
289
|
-
|
289
|
+
|
290
290
|
errors=0
|
291
291
|
indexed=0
|
292
|
-
|
292
|
+
|
293
293
|
log my_logger,"** Deleting druids from #{file_path} in all targets."
|
294
294
|
log my_logger,"Deleting started at #{start_time}"
|
295
295
|
|
@@ -301,10 +301,10 @@ task :delete_druids => :environment do |t, args|
|
|
301
301
|
|
302
302
|
downcased_line=line.downcase
|
303
303
|
druid=downcased_line.scan(/[a-z][a-z][0-9][0-9][0-9][a-z][a-z][0-9][0-9][0-9][0-9]/).first
|
304
|
-
|
304
|
+
|
305
305
|
unless druid.blank?
|
306
306
|
counter+=1
|
307
|
-
|
307
|
+
|
308
308
|
begin
|
309
309
|
with_retries(:max_tries => 5, :base_sleep_seconds => 3, :max_sleep_seconds => 60) do
|
310
310
|
indexer.delete druid
|
@@ -315,11 +315,11 @@ task :delete_druids => :environment do |t, args|
|
|
315
315
|
log my_logger,"ERROR: Failed to delete #{druid}: #{e.message}",:error
|
316
316
|
errors += 1
|
317
317
|
end
|
318
|
-
end
|
318
|
+
end
|
319
319
|
end
|
320
|
-
|
320
|
+
|
321
321
|
log my_logger,"Objects deleted: #{indexed}"
|
322
322
|
log(my_logger,"ERRORS Encountered, #{errors} objects not deleted",:error) if errors > 0
|
323
323
|
log my_logger,"Completed at #{Time.now}, total time was #{'%.2f' % ((Time.now - start_time)/60.0)} minutes"
|
324
|
-
|
325
|
-
end
|
324
|
+
|
325
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: base_indexer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ahmed Alsum
|
@@ -17,20 +17,14 @@ dependencies:
|
|
17
17
|
requirements:
|
18
18
|
- - "~>"
|
19
19
|
- !ruby/object:Gem::Version
|
20
|
-
version: '4
|
21
|
-
- - ">="
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 4.1.9
|
20
|
+
version: '4'
|
24
21
|
type: :runtime
|
25
22
|
prerelease: false
|
26
23
|
version_requirements: !ruby/object:Gem::Requirement
|
27
24
|
requirements:
|
28
25
|
- - "~>"
|
29
26
|
- !ruby/object:Gem::Version
|
30
|
-
version: '4
|
31
|
-
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: 4.1.9
|
27
|
+
version: '4'
|
34
28
|
- !ruby/object:Gem::Dependency
|
35
29
|
name: discovery-indexer
|
36
30
|
requirement: !ruby/object:Gem::Requirement
|