traject 0.13.2 → 0.14.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +15 -9
- data/lib/traject/indexer.rb +158 -157
- data/lib/traject/macros/marc21.rb +30 -0
- data/lib/traject/marc_extractor.rb +39 -21
- data/lib/traject/solrj_writer.rb +1 -1
- data/lib/traject/version.rb +1 -1
- data/test/indexer/each_record_test.rb +4 -4
- data/test/indexer/macros_marc21_test.rb +9 -0
- data/test/indexer/read_write_test.rb +5 -1
- data/test/indexer/to_field_test.rb +10 -23
- data/test/marc_extractor_test.rb +73 -20
- data/traject.gemspec +1 -1
- metadata +2 -1
data/README.md
CHANGED
@@ -13,19 +13,21 @@ them somewhere.
|
|
13
13
|
|
14
14
|
## Background/Goals
|
15
15
|
|
16
|
-
Existing tools for indexing Marc to Solr
|
16
|
+
Existing tools for indexing Marc to Solr served us well for many years, and have many features.
|
17
|
+
But we were having more and more difficulty with them, including in extending/customizing in maintainable ways.
|
18
|
+
We realized that to create a tool with the API (internal and external) we wanted, we could do a better
|
19
|
+
job with jruby (ruby on the JVM).
|
17
20
|
|
18
|
-
*
|
19
|
-
*
|
21
|
+
* **Easy to use**, getting started with standard use cases should be easy, even for non-rubyists.
|
22
|
+
* **Support customization and flexibility**, common customization use cases, including simple local
|
20
23
|
logic, should be very easy. More sophisticated and even complex customization use cases should still be possible,
|
21
24
|
changing just the parts of traject you want to change.
|
22
|
-
*
|
23
|
-
*
|
25
|
+
* **Maintainable local logic**, supporting sharing of reusable logic via ruby gems.
|
26
|
+
* **Comprehensible internal logic**; well-covered by tests, well-factored separation of concerns,
|
24
27
|
easy for newcomer developers who know ruby to understand the codebase.
|
25
|
-
*
|
26
|
-
|
27
|
-
|
28
|
-
* *Well-behaved shell script*, for painless integration in batch processes and cronjobs, with
|
28
|
+
* **High performance**, using multi-threaded concurrency where appropriate to maximize throughput.
|
29
|
+
traject likely will provide higher throughput than other similar solutions.
|
30
|
+
* **Well-behaved shell script**, for painless integration in batch processes and cronjobs, with
|
29
31
|
exit codes, sufficiently flexible control of logging, proper use of stderr, etc.
|
30
32
|
|
31
33
|
|
@@ -167,6 +169,10 @@ Other examples of the specification string, which can include multiple tag menti
|
|
167
169
|
# each in separate strings:
|
168
170
|
to_field "isbn", extract_marc("020az", :separator => nil)
|
169
171
|
|
172
|
+
# Same thing, but more explicit
|
173
|
+
to_field "isbn", extract_marc("020a:020z")
|
174
|
+
|
175
|
+
|
170
176
|
# Make sure that you don't get any duplicates
|
171
177
|
# by passing in ":deduplicate => true"
|
172
178
|
to_field 'language008', extract_marc('008[35-37]', :deduplicate=>true)
|
data/lib/traject/indexer.rb
CHANGED
@@ -50,13 +50,13 @@ require 'traject/macros/basic'
|
|
50
50
|
# with a String name of class meeting the Writer contract.
|
51
51
|
#
|
52
52
|
class Traject::Indexer
|
53
|
-
|
53
|
+
|
54
54
|
# Arity error on a passed block
|
55
55
|
class ArityError < ArgumentError; end
|
56
56
|
class NamingError < ArgumentError; end
|
57
57
|
|
58
|
-
|
59
|
-
|
58
|
+
|
59
|
+
|
60
60
|
include Traject::QualifiedConstGet
|
61
61
|
|
62
62
|
attr_writer :reader_class, :writer_class
|
@@ -155,26 +155,11 @@ class Traject::Indexer
|
|
155
155
|
|
156
156
|
# Used to define an indexing mapping.
|
157
157
|
def to_field(field_name, aLambda = nil, &block)
|
158
|
-
|
159
|
-
verify_to_field_arguments(field_name, aLambda, block)
|
160
|
-
|
161
|
-
@index_steps << {
|
162
|
-
:field_name => field_name.to_s,
|
163
|
-
:lambda => aLambda,
|
164
|
-
:block => block,
|
165
|
-
:type => :to_field,
|
166
|
-
:source_location => Traject::Util.extract_caller_location(caller.first)
|
167
|
-
}
|
158
|
+
@index_steps << ToFieldStep.new(field_name, aLambda, block, Traject::Util.extract_caller_location(caller.first) )
|
168
159
|
end
|
169
160
|
|
170
161
|
def each_record(aLambda = nil, &block)
|
171
|
-
|
172
|
-
@index_steps << {
|
173
|
-
:lambda => aLambda,
|
174
|
-
:block => block,
|
175
|
-
:type => :each_record,
|
176
|
-
:source_location => Traject::Util.extract_caller_location(caller.first)
|
177
|
-
}
|
162
|
+
@index_steps << EachRecordStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first) )
|
178
163
|
end
|
179
164
|
|
180
165
|
|
@@ -203,51 +188,24 @@ class Traject::Indexer
|
|
203
188
|
# to mapping routines.
|
204
189
|
#
|
205
190
|
# Returns the context passed in as second arg, as a convenience for chaining etc.
|
191
|
+
|
206
192
|
def map_to_context!(context)
|
207
193
|
@index_steps.each do |index_step|
|
208
194
|
# Don't bother if we're skipping this record
|
209
195
|
break if context.skip?
|
210
|
-
if index_step[:type] == :to_field
|
211
|
-
|
212
|
-
accumulator = []
|
213
|
-
context.field_name = index_step[:field_name]
|
214
|
-
|
215
|
-
# Might have a lambda arg AND a block, we execute in order,
|
216
|
-
# with same accumulator.
|
217
|
-
|
218
|
-
[index_step[:lambda], index_step[:block]].each do |aProc|
|
219
|
-
if aProc
|
220
|
-
log_mapping_errors(context, index_step, aProc) do
|
221
|
-
if aProc.arity == 2
|
222
|
-
aProc.call(context.source_record, accumulator)
|
223
|
-
else
|
224
|
-
aProc.call(context.source_record, accumulator, context)
|
225
|
-
end
|
226
|
-
end
|
227
|
-
end
|
228
|
-
end
|
229
|
-
accumulator.compact!
|
230
|
-
(context.output_hash[context.field_name] ||= []).concat accumulator unless accumulator.empty?
|
231
|
-
context.field_name = nil
|
232
|
-
|
233
|
-
elsif index_step[:type] == :each_record
|
234
|
-
|
235
|
-
# one or two arg
|
236
|
-
[index_step[:lambda], index_step[:block]].each do |aProc|
|
237
|
-
if aProc
|
238
|
-
log_mapping_errors(context, index_step, aProc) do
|
239
|
-
if aProc.arity == 1
|
240
|
-
aProc.call(context.source_record)
|
241
|
-
else
|
242
|
-
aProc.call(context.source_record, context)
|
243
|
-
end
|
244
|
-
end
|
245
|
-
end
|
246
|
-
end
|
247
196
|
|
248
|
-
|
249
|
-
|
197
|
+
context.index_step = index_step
|
198
|
+
accumulator = log_mapping_errors(context, index_step) do
|
199
|
+
index_step.execute(context) # will always return [] for an each_record step
|
250
200
|
end
|
201
|
+
context.index_step =
|
202
|
+
|
203
|
+
accumulator.compact!
|
204
|
+
if accumulator.size > 0
|
205
|
+
(context.output_hash[index_step.field_name] ||= []).concat accumulator
|
206
|
+
end
|
207
|
+
|
208
|
+
context.index_step = index_step
|
251
209
|
end
|
252
210
|
|
253
211
|
return context
|
@@ -255,22 +213,19 @@ class Traject::Indexer
|
|
255
213
|
|
256
214
|
# just a wrapper that captures and records any unexpected
|
257
215
|
# errors raised in mapping, along with contextual information
|
258
|
-
# on record and location in source file of mapping rule.
|
216
|
+
# on record and location in source file of mapping rule.
|
259
217
|
#
|
260
|
-
# Re-raises error at the moment.
|
218
|
+
# Re-raises error at the moment.
|
261
219
|
#
|
262
|
-
#
|
220
|
+
# log_mapping_errors(context, index_step) do
|
263
221
|
# all_sorts_of_stuff # that will have errors logged
|
264
222
|
# end
|
265
|
-
def log_mapping_errors(context, index_step
|
223
|
+
def log_mapping_errors(context, index_step)
|
266
224
|
begin
|
267
225
|
yield
|
268
226
|
rescue Exception => e
|
269
227
|
msg = "Unexpected error on record id `#{id_string(context.source_record)}` at file position #{context.position}\n"
|
270
|
-
|
271
|
-
conf = context.field_name ? "to_field '#{context.field_name}'" : "each_record"
|
272
|
-
|
273
|
-
msg += " while executing #{conf} defined at #{index_step[:source_location]}\n"
|
228
|
+
msg += " while executing #{index_step.inspect}\n"
|
274
229
|
msg += Traject::Util.exception_to_log_message(e)
|
275
230
|
|
276
231
|
logger.error msg
|
@@ -284,6 +239,12 @@ class Traject::Indexer
|
|
284
239
|
end
|
285
240
|
end
|
286
241
|
|
242
|
+
# get a printable id from record for error logging.
|
243
|
+
# Maybe override this for a future XML version.
|
244
|
+
def id_string(record)
|
245
|
+
record && record['001'] && record['001'].value.to_s
|
246
|
+
end
|
247
|
+
|
287
248
|
# Processes a stream of records, reading from the configured Reader,
|
288
249
|
# mapping according to configured mapping rules, and then writing
|
289
250
|
# to configured Writer.
|
@@ -335,13 +296,14 @@ class Traject::Indexer
|
|
335
296
|
#thread_pool.maybe_in_thread_pool &make_lambda(count, record, writer)
|
336
297
|
thread_pool.maybe_in_thread_pool do
|
337
298
|
context = Context.new(:source_record => record, :settings => settings, :position => position)
|
299
|
+
context.logger = logger
|
338
300
|
map_to_context!(context)
|
339
301
|
if context.skip?
|
340
302
|
log_skip(context)
|
341
303
|
else
|
342
304
|
writer.put context
|
343
305
|
end
|
344
|
-
|
306
|
+
|
345
307
|
end
|
346
308
|
|
347
309
|
end
|
@@ -353,7 +315,7 @@ class Traject::Indexer
|
|
353
315
|
|
354
316
|
thread_pool.raise_collected_exception!
|
355
317
|
|
356
|
-
|
318
|
+
|
357
319
|
writer.close if writer.respond_to?(:close)
|
358
320
|
|
359
321
|
elapsed = Time.now - start_time
|
@@ -367,7 +329,7 @@ class Traject::Indexer
|
|
367
329
|
|
368
330
|
return true
|
369
331
|
end
|
370
|
-
|
332
|
+
|
371
333
|
# Log that the current record is being skipped, using
|
372
334
|
# data in context.position and context.skipmessage
|
373
335
|
def log_skip(context)
|
@@ -399,89 +361,10 @@ class Traject::Indexer
|
|
399
361
|
return writer_class.new(settings.merge("logger" => logger))
|
400
362
|
end
|
401
363
|
|
402
|
-
# get a printable id from record for error logging.
|
403
|
-
# Maybe override this for a future XML version.
|
404
|
-
def id_string(record)
|
405
|
-
record && record['001'] && record['001'].value.to_s
|
406
|
-
end
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
# Verify that the field name is good, and throw a useful error if not
|
412
|
-
def verify_field_name(field_name)
|
413
|
-
if field_name.nil? || !field_name.is_a?(String) || field_name.empty?
|
414
|
-
raise NamingError.new("to_field requires the field name (String) as the first argument (#{last_named_step.message})")
|
415
|
-
end
|
416
|
-
end
|
417
|
-
|
418
|
-
|
419
|
-
# Verify the various, increasingly-complex things that can be sent to to_field
|
420
|
-
# to make sure it's all kosher.
|
421
|
-
#
|
422
|
-
# "Modification" takes place for zero-argument blocks that return a lambda
|
423
|
-
|
424
|
-
def verify_to_field_arguments(field_name, aLambda, block)
|
425
|
-
|
426
|
-
verify_field_name(field_name)
|
427
|
-
|
428
|
-
[aLambda, block].each do |proc|
|
429
|
-
# allow negative arity, meaning variable/optional, trust em on that.
|
430
|
-
# but for positive arity, we need 2 or 3 args
|
431
|
-
if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
|
432
|
-
raise ArityError.new("error parsing field '#{field_name}': block/proc given to to_field needs 2 or 3 (or variable) arguments: #{proc} (#{last_named_step.message})")
|
433
|
-
end
|
434
|
-
end
|
435
|
-
|
436
|
-
end
|
437
|
-
|
438
|
-
# Verify the procs sent to each_record to make sure it's all kosher.
|
439
|
-
|
440
|
-
def verify_each_record_arguments(aLambda, block)
|
441
|
-
unless aLambda or block
|
442
|
-
raise ArgumentError.new("Missing Argument: each_record must take a block/lambda as an argument (#{last_named_step.message})")
|
443
|
-
end
|
444
|
-
|
445
|
-
[aLambda, block].each do |proc|
|
446
|
-
# allow negative arity, meaning variable/optional, trust em on that.
|
447
|
-
# but for positive arity, we need 1 or 2 args
|
448
|
-
if proc
|
449
|
-
unless proc.is_a?(Proc)
|
450
|
-
raise NamingError.new("argument to each_record must be a block/lambda, not a #{proc.class} (#{last_named_step.message})")
|
451
|
-
end
|
452
|
-
if (proc.arity == 0 || proc.arity > 2)
|
453
|
-
raise ArityError.new("block/proc given to each_record needs 1 or 2 arguments: #{proc} (#{last_named_step.message})")
|
454
|
-
end
|
455
|
-
end
|
456
|
-
end
|
457
|
-
end
|
458
|
-
|
459
|
-
def last_named_step
|
460
|
-
return LastNamedStep.new(@index_steps)
|
461
|
-
end
|
462
|
-
|
463
|
-
|
464
|
-
# A convenient way to find, and generate error messages for, the last named step (for helping locate parse errors)
|
465
|
-
class LastNamedStep
|
466
|
-
attr_accessor :step, :message
|
467
|
-
|
468
|
-
# Get the last step for which we have a field_name (e.g., the last to_field, skipping over each_record)
|
469
|
-
def initialize(index_steps)
|
470
|
-
@step = index_steps.reverse_each.find{|step| step[:field_name]}
|
471
|
-
if @step
|
472
|
-
@message = "last successfully parsed field was '#{@step[:field_name]}'"
|
473
|
-
else
|
474
|
-
@message = "there were no previous named fields successfully parsed"
|
475
|
-
end
|
476
|
-
end
|
477
|
-
end
|
478
|
-
|
479
|
-
|
480
|
-
|
481
364
|
# Represents the context of a specific record being indexed, passed
|
482
365
|
# to indexing logic blocks
|
483
366
|
#
|
484
|
-
class
|
367
|
+
class Context
|
485
368
|
def initialize(hash_init = {})
|
486
369
|
# TODO, argument checking for required args?
|
487
370
|
|
@@ -491,29 +374,147 @@ class Traject::Indexer
|
|
491
374
|
hash_init.each_pair do |key, value|
|
492
375
|
self.send("#{key}=", value)
|
493
376
|
end
|
494
|
-
|
377
|
+
|
495
378
|
@skip = false
|
496
379
|
end
|
497
380
|
|
498
|
-
attr_accessor :clipboard, :output_hash
|
499
|
-
attr_accessor :
|
381
|
+
attr_accessor :clipboard, :output_hash, :logger
|
382
|
+
attr_accessor :index_step, :source_record, :settings
|
500
383
|
# 1-based position in stream of processed records.
|
501
384
|
attr_accessor :position
|
502
|
-
|
385
|
+
|
503
386
|
# Should we be skipping this record?
|
504
387
|
attr_accessor :skipmessage
|
505
|
-
|
388
|
+
|
506
389
|
# Set the fact that this record should be skipped, with an
|
507
390
|
# optional message
|
508
391
|
def skip!(msg = '(no message given)')
|
509
392
|
@skipmessage = msg
|
510
393
|
@skip = true
|
511
394
|
end
|
512
|
-
|
395
|
+
|
513
396
|
# Should we skip this record?
|
514
397
|
def skip?
|
515
398
|
@skip
|
516
399
|
end
|
517
|
-
|
400
|
+
|
401
|
+
end
|
402
|
+
|
403
|
+
|
404
|
+
|
405
|
+
# An indexing step definition, including its source location
|
406
|
+
# for logging
|
407
|
+
#
|
408
|
+
# This one represents an "each_record" step, a subclass below
|
409
|
+
# for "to_field"
|
410
|
+
#
|
411
|
+
# source_location is just a string with filename and line number for
|
412
|
+
# showing to devs in debugging.
|
413
|
+
class EachRecordStep
|
414
|
+
attr_accessor :source_location, :lambda, :block
|
415
|
+
|
416
|
+
def initialize(lambda, block, source_location)
|
417
|
+
self.lambda = lambda
|
418
|
+
self.block = block
|
419
|
+
self.source_location = source_location
|
420
|
+
|
421
|
+
self.validate!
|
422
|
+
end
|
423
|
+
|
424
|
+
# raises if bad data
|
425
|
+
def validate!
|
426
|
+
unless self.lambda or self.block
|
427
|
+
raise ArgumentError.new("Missing Argument: each_record must take a block/lambda as an argument (#{self.inspect})")
|
428
|
+
end
|
429
|
+
|
430
|
+
[self.lambda, self.block].each do |proc|
|
431
|
+
# allow negative arity, meaning variable/optional, trust em on that.
|
432
|
+
# but for positive arity, we need 1 or 2 args
|
433
|
+
if proc
|
434
|
+
unless proc.is_a?(Proc)
|
435
|
+
raise NamingError.new("argument to each_record must be a block/lambda, not a #{proc.class} (#{self.inspect})")
|
436
|
+
end
|
437
|
+
if (proc.arity == 0 || proc.arity > 2)
|
438
|
+
raise ArityError.new("block/proc given to each_record needs 1 or 2 arguments: (#{self.inspect})")
|
439
|
+
end
|
440
|
+
end
|
441
|
+
end
|
442
|
+
end
|
443
|
+
|
444
|
+
# For each_record, always return an empty array as the
|
445
|
+
# accumulator, since it doesn't have those kinds of side effects
|
446
|
+
def execute(context)
|
447
|
+
[@lambda, @block].each do |aProc|
|
448
|
+
next unless aProc
|
449
|
+
|
450
|
+
if aProc.arity == 1
|
451
|
+
aProc.call(context.source_record)
|
452
|
+
else
|
453
|
+
aProc.call(context.source_record, context)
|
454
|
+
end
|
455
|
+
|
456
|
+
end
|
457
|
+
return [] # empty -- no accumulator for each_record
|
458
|
+
end
|
459
|
+
|
460
|
+
# Over-ride inspect for outputting error messages etc.
|
461
|
+
def inspect
|
462
|
+
"<each_record at #{source_location}>"
|
463
|
+
end
|
518
464
|
end
|
465
|
+
|
466
|
+
|
467
|
+
# An indexing step definition for a "to_field" step to specific
|
468
|
+
# field.
|
469
|
+
class ToFieldStep
|
470
|
+
attr_accessor :field_name, :lambda, :block, :source_location
|
471
|
+
def initialize(fieldname, lambda, block, source_location)
|
472
|
+
self.field_name = fieldname
|
473
|
+
self.lambda = lambda
|
474
|
+
self.block = block
|
475
|
+
self.source_location = source_location
|
476
|
+
|
477
|
+
validate!
|
478
|
+
end
|
479
|
+
|
480
|
+
def validate!
|
481
|
+
|
482
|
+
if self.field_name.nil? || !self.field_name.is_a?(String) || self.field_name.empty?
|
483
|
+
raise NamingError.new("to_field requires the field name (as a string) as the first argument at #{self.source_location})")
|
484
|
+
end
|
485
|
+
|
486
|
+
[self.lambda, self.block].each do |proc|
|
487
|
+
# allow negative arity, meaning variable/optional, trust em on that.
|
488
|
+
# but for positive arity, we need 2 or 3 args
|
489
|
+
if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
|
490
|
+
raise ArityError.new("error parsing field '#{self.field_name}': block/proc given to to_field needs 2 or 3 (or variable) arguments: #{proc} (#{self.inspect})")
|
491
|
+
end
|
492
|
+
end
|
493
|
+
end
|
494
|
+
|
495
|
+
# Override inspect for developer debug messages
|
496
|
+
def inspect
|
497
|
+
"<to_field #{self.field_name} at #{self.source_location}>"
|
498
|
+
end
|
499
|
+
|
500
|
+
def execute(context)
|
501
|
+
accumulator = []
|
502
|
+
[@lambda, @block].each do |aProc|
|
503
|
+
next unless aProc
|
504
|
+
|
505
|
+
if aProc.arity == 2
|
506
|
+
aProc.call(context.source_record, accumulator)
|
507
|
+
else
|
508
|
+
aProc.call(context.source_record, accumulator, context)
|
509
|
+
end
|
510
|
+
|
511
|
+
end
|
512
|
+
return accumulator
|
513
|
+
end
|
514
|
+
|
515
|
+
end
|
516
|
+
|
517
|
+
|
518
|
+
|
519
|
+
|
519
520
|
end
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'traject/marc_extractor'
|
2
2
|
require 'traject/translation_map'
|
3
|
+
require 'traject/util'
|
3
4
|
require 'base64'
|
4
5
|
require 'json'
|
5
6
|
|
@@ -30,7 +31,22 @@ module Traject::Macros
|
|
30
31
|
# to_field("title"), extract_marc("245abcd", :trim_punctuation => true)
|
31
32
|
# to_field("id"), extract_marc("001", :first => true)
|
32
33
|
# to_field("geo"), extract_marc("040a", :separator => nil, :translation_map => "marc040")
|
34
|
+
|
35
|
+
|
36
|
+
# A list of symbols that are valid keys in the options hash
|
37
|
+
EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,
|
38
|
+
:deduplicate, :uniq, :separator, :translation_map,
|
39
|
+
:alternate_script]
|
40
|
+
|
33
41
|
def extract_marc(spec, options = {})
|
42
|
+
|
43
|
+
# Raise an error if there are any invalid options, indicating a
|
44
|
+
# misspelled or illegal option, using a string instead of a symbol, etc.
|
45
|
+
|
46
|
+
unless (options.keys - EXTRACT_MARC_VALID_OPTIONS).empty?
|
47
|
+
raise RuntimeError.new("Illegal/Unknown argument '#{options.keys.join(', ')}' in extract_marc at #{Traject::Util.extract_caller_location(caller.first)}")
|
48
|
+
end
|
49
|
+
|
34
50
|
only_first = options.delete(:first)
|
35
51
|
trim_punctuation = options.delete(:trim_punctuation)
|
36
52
|
default_value = options.delete(:default)
|
@@ -46,6 +62,7 @@ module Traject::Macros
|
|
46
62
|
if translation_map_arg = options.delete(:translation_map)
|
47
63
|
translation_map = Traject::TranslationMap.new(translation_map_arg)
|
48
64
|
end
|
65
|
+
|
49
66
|
|
50
67
|
extractor = Traject::MarcExtractor.new(spec, options)
|
51
68
|
|
@@ -93,7 +110,14 @@ module Traject::Macros
|
|
93
110
|
# serialized, with certain header bytes filled with ascii 0's
|
94
111
|
# -- technically illegal MARC, but can still be read by
|
95
112
|
# ruby MARC::Reader in permissive mode.
|
113
|
+
|
114
|
+
SERIALZED_MARC_VALID_OPTIONS = [:format, :binary_escape, :allow_oversized, :format]
|
115
|
+
|
96
116
|
def serialized_marc(options)
|
117
|
+
unless (options.keys - SERIALZED_MARC_VALID_OPTIONS).empty?
|
118
|
+
raise RuntimeError.new("Illegal/Unknown argument '#{options.keys.join(', ')}' in seralized_marc at #{Traject::Util.extract_caller_location(caller.first)}")
|
119
|
+
end
|
120
|
+
|
97
121
|
format = options[:format].to_s
|
98
122
|
binary_escape = (options[:binary_escape] != false)
|
99
123
|
allow_oversized = (options[:allow_oversized] == true)
|
@@ -129,7 +153,13 @@ module Traject::Macros
|
|
129
153
|
#
|
130
154
|
# Can always run this thing multiple times on the same field if you need
|
131
155
|
# non-contiguous ranges of fields.
|
156
|
+
|
157
|
+
EXTRACT_ALL_MARC_VALID_OPTIONS = [:separator, :from, :to]
|
158
|
+
|
132
159
|
def extract_all_marc_values(options = {})
|
160
|
+
unless (options.keys - EXTRACT_ALL_MARC_VALID_OPTIONS).empty?
|
161
|
+
raise RuntimeError.new("Illegal/Unknown argument '#{options.keys.join(', ')}' in extract_all_marc at #{Traject::Util.extract_caller_location(caller.first)}")
|
162
|
+
end
|
133
163
|
options = {:from => "100", :to => "899", :separator => ' '}.merge(options)
|
134
164
|
|
135
165
|
lambda do |record, accumulator, context|
|
@@ -135,13 +135,23 @@ module Traject
|
|
135
135
|
# "008[35-37]:LDR[5]"
|
136
136
|
# => bytes 35-37 inclusive of field 008, and byte 5 of the marc leader.
|
137
137
|
#
|
138
|
-
# Returns a nested hash
|
139
|
-
#
|
140
|
-
#
|
141
|
-
#
|
142
|
-
#
|
143
|
-
#
|
144
|
-
#
|
138
|
+
# Returns a nested hash whose keys are tags and whose value is an array
|
139
|
+
# of hash structures indicating what indicators and subfields (or
|
140
|
+
# byte-offsets for control fields) are needed, e.g.
|
141
|
+
#
|
142
|
+
# '245|1*|a:245ab:110:008[15-17]:008[17]' would give us
|
143
|
+
#
|
144
|
+
# {
|
145
|
+
# '245' => [
|
146
|
+
# {:indicators => ['1', nil], :subfields=>['a']},
|
147
|
+
# {:subfields => ['a', 'b']}
|
148
|
+
# ]
|
149
|
+
# '110' => [{}] # all subfields, indicators don't matter
|
150
|
+
# '008' => [
|
151
|
+
# {:bytes => (15..17)}
|
152
|
+
# {:bytes => 17}
|
153
|
+
# ]
|
154
|
+
# }
|
145
155
|
#
|
146
156
|
# * subfields and indicators can only be provided for marc data/variable fields
|
147
157
|
# * byte slice can only be provided for marc control fields (generally tags less than 010)
|
@@ -156,26 +166,31 @@ module Traject
|
|
156
166
|
# variable field
|
157
167
|
tag, indicators, subfields = $1, $3, $4
|
158
168
|
|
159
|
-
hash[tag] ||=
|
169
|
+
hash[tag] ||= []
|
170
|
+
spec = {}
|
160
171
|
|
161
|
-
if subfields
|
162
|
-
subfields
|
163
|
-
hash[tag][:subfields] ||= Array.new
|
164
|
-
hash[tag][:subfields] << subfield
|
165
|
-
end
|
172
|
+
if subfields and !subfields.empty?
|
173
|
+
spec[:subfields] = subfields.split('')
|
166
174
|
end
|
175
|
+
|
167
176
|
if indicators
|
168
|
-
|
177
|
+
spec[:indicators] = [ (indicators[0] if indicators[0] != "*"), (indicators[1] if indicators[1] != "*") ]
|
169
178
|
end
|
179
|
+
|
180
|
+
hash[tag] << spec
|
181
|
+
|
170
182
|
elsif (part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/) # "005[4-5]"
|
171
183
|
tag, byte1, byte2 = $1, $3, $5
|
172
|
-
hash[tag] ||=
|
184
|
+
hash[tag] ||= []
|
185
|
+
spec = {}
|
173
186
|
|
174
187
|
if byte1 && byte2
|
175
|
-
|
188
|
+
spec[:bytes] = ((byte1.to_i)..(byte2.to_i))
|
176
189
|
elsif byte1
|
177
|
-
|
190
|
+
spec[:bytes] = byte1.to_i
|
178
191
|
end
|
192
|
+
|
193
|
+
hash[tag] << spec
|
179
194
|
else
|
180
195
|
raise ArgumentError.new("Unrecognized marc extract specification: #{part}")
|
181
196
|
end
|
@@ -210,15 +225,18 @@ module Traject
|
|
210
225
|
def each_matching_line(marc_record)
|
211
226
|
marc_record.fields(@interesting_tags_hash.keys).each do |field|
|
212
227
|
|
213
|
-
|
228
|
+
specs = spec_covering_field(field)
|
214
229
|
|
215
230
|
# Don't have a spec that addresses this field? Move on.
|
216
|
-
next unless
|
231
|
+
next unless specs
|
217
232
|
|
218
233
|
# Make sure it matches indicators too, spec_covering_field
|
219
234
|
# doesn't check that.
|
220
|
-
|
221
|
-
|
235
|
+
|
236
|
+
specs.each do |spec|
|
237
|
+
if matches_indicators(field, spec)
|
238
|
+
yield(field, spec, self)
|
239
|
+
end
|
222
240
|
end
|
223
241
|
end
|
224
242
|
end
|
data/lib/traject/solrj_writer.rb
CHANGED
@@ -109,7 +109,7 @@ class Traject::SolrJWriter
|
|
109
109
|
|
110
110
|
@debug_ascii_progress = (@settings["debug_ascii_progress"].to_s == "true")
|
111
111
|
|
112
|
-
logger.info("
|
112
|
+
logger.info(" #{self.class.name} writing to '#{settings['solr.url']}'")
|
113
113
|
end
|
114
114
|
|
115
115
|
# Loads solrj if not already loaded. By loading all jars found
|
data/lib/traject/version.rb
CHANGED
@@ -31,14 +31,14 @@ describe "Traject::Indexer#each_record" do
|
|
31
31
|
end
|
32
32
|
end
|
33
33
|
|
34
|
-
it "
|
34
|
+
it "outputs error with source location" do
|
35
35
|
begin
|
36
36
|
@indexer.to_field('foo') {|one, two| }
|
37
37
|
@indexer.each_record {|one, two, three| } # bad arity
|
38
38
|
flunk("Should have rejected bad arity ")
|
39
39
|
rescue Traject::Indexer::ArityError => e
|
40
|
-
assert_match(/
|
41
|
-
rescue
|
40
|
+
assert_match(/each_record at .*\/.*:\d+/, e.message)
|
41
|
+
rescue
|
42
42
|
flunk("Should only fail with a ArityError")
|
43
43
|
end
|
44
44
|
end
|
@@ -53,7 +53,7 @@ describe "Traject::Indexer#each_record" do
|
|
53
53
|
assert_raises(ArgumentError) do
|
54
54
|
@indexer.each_record()
|
55
55
|
end
|
56
|
-
end
|
56
|
+
end
|
57
57
|
|
58
58
|
end
|
59
59
|
end
|
@@ -75,6 +75,15 @@ describe "Traject::Macros::Marc21" do
|
|
75
75
|
|
76
76
|
end
|
77
77
|
|
78
|
+
it "fails on an extra/misspelled argument to extract_marc" do
|
79
|
+
assert_raises(RuntimeError) do
|
80
|
+
@indexer.instance_eval do
|
81
|
+
to_field "foo", extract_marc("9999", :misspelled => "Who cares")
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
|
78
87
|
|
79
88
|
|
80
89
|
it "Marc21::trim_punctuation class method" do
|
@@ -34,7 +34,11 @@ describe "Traject::Indexer#process" do
|
|
34
34
|
@indexer.to_field("title") do |record, accumulator, context|
|
35
35
|
times_called += 1
|
36
36
|
accumulator << "ADDED TITLE"
|
37
|
-
|
37
|
+
|
38
|
+
assert context.index_step, "Context has #index_step set"
|
39
|
+
assert_equal "title", context.index_step.field_name
|
40
|
+
|
41
|
+
assert context.logger, "Context knows #logger"
|
38
42
|
|
39
43
|
assert_equal times_called, context.position
|
40
44
|
end
|
@@ -40,30 +40,17 @@ describe "Traject::Indexer.to_field" do
|
|
40
40
|
end
|
41
41
|
end
|
42
42
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
flunk("Should only fail with a NamingError")
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
it "finds first (only) field on error" do
|
57
|
-
begin
|
58
|
-
@indexer.to_field('foo') {|one, two| }
|
59
|
-
@indexer.to_field('') {|one, two| } # bad field name
|
60
|
-
flunk("Should have rejected empty field name")
|
61
|
-
rescue Traject::Indexer::NamingError => e
|
62
|
-
assert_match(/foo/, e.message)
|
63
|
-
rescue
|
64
|
-
flunk("Should only fail with a NamingError")
|
65
|
-
end
|
43
|
+
it "outputs error with source location" do
|
44
|
+
begin
|
45
|
+
@indexer.to_field('foo') {|one, two| }
|
46
|
+
@indexer.to_field('') {|one, two| } # bad field name
|
47
|
+
flunk("Should have rejected empty field name")
|
48
|
+
rescue Traject::Indexer::NamingError => e
|
49
|
+
assert_match(/at .*\/.*:\d+/, e.message)
|
50
|
+
rescue
|
51
|
+
flunk("Should only fail with a NamingError")
|
66
52
|
end
|
67
53
|
end
|
54
|
+
|
68
55
|
|
69
56
|
end
|
data/test/marc_extractor_test.rb
CHANGED
@@ -12,43 +12,47 @@ describe "Traject::MarcExtractor" do
|
|
12
12
|
|
13
13
|
assert_kind_of Hash, parsed
|
14
14
|
assert_equal 1, parsed.keys.length
|
15
|
-
|
15
|
+
spec = parsed['245'].first
|
16
|
+
assert_kind_of Hash, spec
|
16
17
|
|
17
|
-
assert_kind_of Array,
|
18
|
-
assert_equal 2,
|
19
|
-
assert_equal "1",
|
20
|
-
assert_nil
|
18
|
+
assert_kind_of Array, spec[:indicators]
|
19
|
+
assert_equal 2, spec[:indicators].length
|
20
|
+
assert_equal "1", spec[:indicators][0]
|
21
|
+
assert_nil spec[:indicators][1]
|
21
22
|
|
22
|
-
assert_kind_of Array,
|
23
|
+
assert_kind_of Array, spec[:subfields]
|
23
24
|
|
24
25
|
end
|
25
26
|
|
26
27
|
it "parses a mixed bag" do
|
27
28
|
parsed = Traject::MarcExtractor.parse_string_spec("245abcde:810:700|*4|bcd")
|
29
|
+
spec245 = parsed['245'].first
|
30
|
+
spec810 = parsed['810'].first
|
31
|
+
spec700 = parsed['700'].first
|
28
32
|
|
29
33
|
assert_length 3, parsed
|
30
34
|
|
31
35
|
#245abcde
|
32
|
-
assert
|
33
|
-
assert_nil
|
34
|
-
assert_equal %w{a b c d e},
|
36
|
+
assert spec245
|
37
|
+
assert_nil spec245[:indicators]
|
38
|
+
assert_equal %w{a b c d e}, spec245[:subfields]
|
35
39
|
|
36
40
|
#810
|
37
|
-
assert
|
38
|
-
assert_nil
|
39
|
-
assert_nil
|
41
|
+
assert spec810
|
42
|
+
assert_nil spec810[:indicators]
|
43
|
+
assert_nil spec810[:subfields], "No subfields"
|
40
44
|
|
41
45
|
#700-*4bcd
|
42
|
-
assert
|
43
|
-
assert_equal [nil, "4"],
|
44
|
-
assert_equal %w{b c d},
|
46
|
+
assert spec700
|
47
|
+
assert_equal [nil, "4"], spec700[:indicators]
|
48
|
+
assert_equal %w{b c d}, spec700[:subfields]
|
45
49
|
end
|
46
50
|
|
47
51
|
it "parses fixed field byte offsets" do
|
48
52
|
parsed = Traject::MarcExtractor.parse_string_spec("005[5]:008[7-10]")
|
49
53
|
|
50
|
-
assert_equal 5, parsed["005"][:bytes]
|
51
|
-
assert_equal 7..10, parsed["008"][:bytes]
|
54
|
+
assert_equal 5, parsed["005"].first[:bytes]
|
55
|
+
assert_equal 7..10, parsed["008"].first[:bytes]
|
52
56
|
end
|
53
57
|
|
54
58
|
it "allows arrays of specs" do
|
@@ -98,7 +102,7 @@ describe "Traject::MarcExtractor" do
|
|
98
102
|
assert ! @a880_100.nil?, "Found an 880-100 to test"
|
99
103
|
end
|
100
104
|
it "finds spec for relevant 880" do
|
101
|
-
assert_equal( {}, @extractor.spec_covering_field(@a880_245) )
|
105
|
+
assert_equal( [{}], @extractor.spec_covering_field(@a880_245) )
|
102
106
|
assert_nil @extractor.spec_covering_field(@a880_100)
|
103
107
|
end
|
104
108
|
it "does not find spec for 880 if disabled" do
|
@@ -108,7 +112,7 @@ describe "Traject::MarcExtractor" do
|
|
108
112
|
it "finds only 880 if so configured" do
|
109
113
|
@extractor = Traject::MarcExtractor.new("245", :alternate_script => :only)
|
110
114
|
assert_nil @extractor.spec_covering_field(@a245)
|
111
|
-
assert_equal({}, @extractor.spec_covering_field(@a880_245))
|
115
|
+
assert_equal([{}], @extractor.spec_covering_field(@a880_245))
|
112
116
|
end
|
113
117
|
end
|
114
118
|
end
|
@@ -289,7 +293,7 @@ describe "Traject::MarcExtractor" do
|
|
289
293
|
describe "MarcExtractor.cached" do
|
290
294
|
it "creates" do
|
291
295
|
ext = Traject::MarcExtractor.cached("245abc", :separator => nil)
|
292
|
-
assert_equal({"245"=>{:subfields=>["a", "b", "c"]}}, ext.spec_hash)
|
296
|
+
assert_equal({"245"=>[{:subfields=>["a", "b", "c"]}]}, ext.spec_hash)
|
293
297
|
assert ext.options[:separator].nil?, "extractor options[:separator] is nil"
|
294
298
|
end
|
295
299
|
it "caches" do
|
@@ -301,4 +305,53 @@ describe "Traject::MarcExtractor" do
|
|
301
305
|
end
|
302
306
|
|
303
307
|
|
308
|
+
describe "Allows multiple uses of the same tag" do
|
309
|
+
before do
|
310
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
311
|
+
end
|
312
|
+
|
313
|
+
it "allows repated tags for a variable field" do
|
314
|
+
extractor = Traject::MarcExtractor.new("245a:245b")
|
315
|
+
values = extractor.extract(@record)
|
316
|
+
assert_equal ['Manufacturing consent :', 'the political economy of the mass media /'], values
|
317
|
+
end
|
318
|
+
|
319
|
+
it "allows repeated tags with indicators specs" do
|
320
|
+
extractor = Traject::MarcExtractor.new("245|1*|a:245|2*|b")
|
321
|
+
@record.append(MARC::DataField.new('245', '2', '0', ['a', 'Subfield A Value'], ['b', 'Subfield B Value']))
|
322
|
+
results = extractor.extract(@record)
|
323
|
+
assert_equal ['Manufacturing consent :', 'Subfield B Value'], results
|
324
|
+
end
|
325
|
+
|
326
|
+
|
327
|
+
|
328
|
+
|
329
|
+
it "works the same as ::separator=>nil" do
|
330
|
+
ex1 = Traject::MarcExtractor.new("245a:245b")
|
331
|
+
ex2 = Traject::MarcExtractor.new("245ab", :separator=>nil)
|
332
|
+
assert_equal ex1.extract(@record), ex2.extract(@record)
|
333
|
+
end
|
334
|
+
|
335
|
+
|
336
|
+
it "allows repeated tags for a control field" do
|
337
|
+
extractor = Traject::MarcExtractor.new("001[0-1]:001[0-3]")
|
338
|
+
values = extractor.extract(@record)
|
339
|
+
assert_equal ["27", "2710"], values
|
340
|
+
end
|
341
|
+
|
342
|
+
it "associates indicators properly with repeated tags" do
|
343
|
+
@record = MARC::Record.new
|
344
|
+
@record.append MARC::DataField.new("100", '1', ' ', ['a', '100a first indicator 1'], ['b', 'should not include 100|1|b'])
|
345
|
+
@record.append MARC::DataField.new("100", '2', ' ', ['b', '100b first indicator 2'], ['a', 'should not include 100|2|a'])
|
346
|
+
|
347
|
+
extractor = Traject::MarcExtractor.new("100|1*|a:100|2*|b")
|
348
|
+
|
349
|
+
values = extractor.extract(@record)
|
350
|
+
|
351
|
+
assert_equal ['100a first indicator 1', '100b first indicator 2'], values
|
352
|
+
end
|
353
|
+
|
354
|
+
end
|
355
|
+
|
356
|
+
|
304
357
|
end
|
data/traject.gemspec
CHANGED
@@ -6,7 +6,7 @@ require 'traject/version'
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
7
|
spec.name = "traject"
|
8
8
|
spec.version = Traject::VERSION
|
9
|
-
spec.authors = ["Jonathan Rochkind"]
|
9
|
+
spec.authors = ["Jonathan Rochkind", "Bill Dueber"]
|
10
10
|
spec.email = ["none@nowhere.org"]
|
11
11
|
spec.summary = %q{Index MARC to Solr; or generally process source records to hash-like structures}
|
12
12
|
spec.homepage = "http://github.com/jrochkind/traject"
|