traject 0.13.2 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +15 -9
- data/lib/traject/indexer.rb +158 -157
- data/lib/traject/macros/marc21.rb +30 -0
- data/lib/traject/marc_extractor.rb +39 -21
- data/lib/traject/solrj_writer.rb +1 -1
- data/lib/traject/version.rb +1 -1
- data/test/indexer/each_record_test.rb +4 -4
- data/test/indexer/macros_marc21_test.rb +9 -0
- data/test/indexer/read_write_test.rb +5 -1
- data/test/indexer/to_field_test.rb +10 -23
- data/test/marc_extractor_test.rb +73 -20
- data/traject.gemspec +1 -1
- metadata +2 -1
data/README.md
CHANGED

@@ -13,19 +13,21 @@ them somewhere.
 
 ## Background/Goals
 
-Existing tools for indexing Marc to Solr
+Existing tools for indexing Marc to Solr served us well for many years, and have many features.
+But we were having more and more difficulty with them, including in extending/customizing in maintainable ways.
+We realized that to create a tool with the API (internal and external) we wanted, we could do a better
+job with jruby (ruby on the JVM).
 
-*
-*
+* **Easy to use**, getting started with standard use cases should be easy, even for non-rubyists.
+* **Support customization and flexiblity**, common customization use cases, including simple local
   logic, should be very easy. More sophisticated and even complex customization use cases should still be possible,
   changing just the parts of traject you want to change.
-*
-*
+* **Maintainable local logic**, supporting sharing of reusable logic via ruby gems.
+* **Comprehensible internal logic**; well-covered by tests, well-factored separation of concerns,
   easy for newcomer developers who know ruby to understand the codebase.
-*
-
-
-* *Well-behaved shell script*, for painless integration in batch processes and cronjobs, with
+* **High performance**, using multi-threaded concurrency where appropriate to maximize throughput.
+  traject likely will provide higher throughput than other similar solutions.
+* **Well-behaved shell script**, for painless integration in batch processes and cronjobs, with
   exit codes, sufficiently flexible control of logging, proper use of stderr, etc.
 
 

@@ -167,6 +169,10 @@ Other examples of the specification string, which can include multiple tag mentions
     # each in separate strings:
     to_field "isbn", extract_marc("020az", :separator => nil)
 
+    # Same thing, but more explicit
+    to_field "isbn", extract_marc("020a:020z")
+
+
     # Make sure that you don't get any duplicates
     # by passing in ":deduplicate => true"
     to_field 'language008', extract_marc('008[35-37]', :deduplicate=>true)
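Taken together, the README lines above amount to a configuration fragment like the following. This is an illustrative sketch in the same style as the README (field names are arbitrary), written as lines that would sit in a traject configuration file, which traject evaluates inside an Indexer instance:

    # one value per subfield; "020a:020z" is the more explicit equivalent
    # of "020az" with :separator => nil
    to_field "isbn", extract_marc("020a:020z")

    # fixed-field bytes, de-duplicated across repeated 008 occurrences
    to_field "language008", extract_marc("008[35-37]", :deduplicate => true)
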
data/lib/traject/indexer.rb
CHANGED

@@ -50,13 +50,13 @@ require 'traject/macros/basic'
 # with a String name of class meeting the Writer contract.
 #
 class Traject::Indexer
-
+
   # Arity error on a passed block
   class ArityError < ArgumentError; end
   class NamingError < ArgumentError; end
 
-
-
+
+
   include Traject::QualifiedConstGet
 
   attr_writer :reader_class, :writer_class

@@ -155,26 +155,11 @@ class Traject::Indexer
 
   # Used to define an indexing mapping.
   def to_field(field_name, aLambda = nil, &block)
-
-    verify_to_field_arguments(field_name, aLambda, block)
-
-    @index_steps << {
-      :field_name => field_name.to_s,
-      :lambda => aLambda,
-      :block => block,
-      :type => :to_field,
-      :source_location => Traject::Util.extract_caller_location(caller.first)
-    }
+    @index_steps << ToFieldStep.new(field_name, aLambda, block, Traject::Util.extract_caller_location(caller.first) )
  end
 
  def each_record(aLambda = nil, &block)
-
-    @index_steps << {
-      :lambda => aLambda,
-      :block => block,
-      :type => :each_record,
-      :source_location => Traject::Util.extract_caller_location(caller.first)
-    }
+    @index_steps << EachRecordStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first) )
  end
 
 
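The two methods above now simply wrap the caller's logic in step objects (the ToFieldStep and EachRecordStep classes appear further down in this diff). As a rough standalone illustration of that pattern only -- the class and method names below are not traject's -- a step captures the field name, the logic, and where it was defined, and can later run itself:

    # Illustrative sketch; not traject's actual internals.
    class FieldStepSketch
      attr_reader :field_name, :logic, :source_location

      def initialize(field_name, logic, source_location)
        raise ArgumentError, "field name required (defined at #{source_location})" if field_name.to_s.empty?
        @field_name      = field_name
        @logic           = logic
        @source_location = source_location
      end

      # run the captured logic, returning whatever it accumulated
      def execute(record)
        accumulator = []
        @logic.call(record, accumulator)
        accumulator
      end
    end

    steps = []
    steps << FieldStepSketch.new("title", ->(rec, acc) { acc << rec[:title] }, "#{__FILE__}:#{__LINE__}")
    p steps.first.execute(:title => "Manufacturing Consent")  # => ["Manufacturing Consent"]
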
@@ -203,51 +188,24 @@ class Traject::Indexer
   # to mapping routines.
   #
   # Returns the context passed in as second arg, as a convenience for chaining etc.
+
   def map_to_context!(context)
     @index_steps.each do |index_step|
       # Don't bother if we're skipping this record
       break if context.skip?
-      if index_step[:type] == :to_field
-
-        accumulator = []
-        context.field_name = index_step[:field_name]
-
-        # Might have a lambda arg AND a block, we execute in order,
-        # with same accumulator.
-
-        [index_step[:lambda], index_step[:block]].each do |aProc|
-          if aProc
-            log_mapping_errors(context, index_step, aProc) do
-              if aProc.arity == 2
-                aProc.call(context.source_record, accumulator)
-              else
-                aProc.call(context.source_record, accumulator, context)
-              end
-            end
-          end
-        end
-        accumulator.compact!
-        (context.output_hash[context.field_name] ||= []).concat accumulator unless accumulator.empty?
-        context.field_name = nil
-
-      elsif index_step[:type] == :each_record
-
-        # one or two arg
-        [index_step[:lambda], index_step[:block]].each do |aProc|
-          if aProc
-            log_mapping_errors(context, index_step, aProc) do
-              if aProc.arity == 1
-                aProc.call(context.source_record)
-              else
-                aProc.call(context.source_record, context)
-              end
-            end
-          end
-        end
 
-
-
+      context.index_step = index_step
+      accumulator = log_mapping_errors(context, index_step) do
+        index_step.execute(context) # will always return [] for an each_record step
       end
+      context.index_step =
+
+      accumulator.compact!
+      if accumulator.size > 0
+        (context.output_hash[index_step.field_name] ||= []).concat accumulator
+      end
+
+      context.index_step = index_step
     end
 
     return context
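The rewritten loop above treats every step the same way: ask the step to execute, then merge whatever came back into output_hash under the step's field name, skipping empty results. A tiny standalone sketch of that merge behavior (hypothetical method name, plain hashes instead of traject's Context):

    def merge_step_output(output_hash, field_name, accumulator)
      accumulator.compact!
      return output_hash if accumulator.empty?
      (output_hash[field_name] ||= []).concat(accumulator)
      output_hash
    end

    output_hash = {}
    merge_step_output(output_hash, "isbn",  ["9780394549002", nil])
    merge_step_output(output_hash, "title", [])   # empty result: no "title" key is created
    p output_hash                                 # => {"isbn"=>["9780394549002"]}
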
@@ -255,22 +213,19 @@ class Traject::Indexer
 
   # just a wrapper that captures and records any unexpected
   # errors raised in mapping, along with contextual information
-  # on record and location in source file of mapping rule.
+  # on record and location in source file of mapping rule.
   #
-  # Re-raises error at the moment.
+  # Re-raises error at the moment.
   #
-  #
+  #     log_mapping_errors(context, index_step) do
   #       all_sorts_of_stuff # that will have errors logged
   #     end
-  def log_mapping_errors(context, index_step
+  def log_mapping_errors(context, index_step)
    begin
      yield
    rescue Exception => e
      msg = "Unexpected error on record id `#{id_string(context.source_record)}` at file position #{context.position}\n"
-
-      conf = context.field_name ? "to_field '#{context.field_name}'" : "each_record"
-
-      msg += "    while executing #{conf} defined at #{index_step[:source_location]}\n"
+      msg += "    while executing #{index_step.inspect}\n"
      msg += Traject::Util.exception_to_log_message(e)
 
      logger.error msg

@@ -284,6 +239,12 @@ class Traject::Indexer
    end
  end
 
+  # get a printable id from record for error logging.
+  # Maybe override this for a future XML version.
+  def id_string(record)
+    record && record['001'] && record['001'].value.to_s
+  end
+
  # Processes a stream of records, reading from the configured Reader,
  # mapping according to configured mapping rules, and then writing
  # to configured Writer.
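The updated comment and signature describe a wrapper that yields to the step and, on any exception, logs which record and which step was involved before re-raising. A self-contained sketch of that behavior (stand-in arguments, not traject's Context or step objects):

    require 'logger'

    def log_mapping_errors(logger, record_id, position, step_description)
      yield
    rescue => e
      msg  = "Unexpected error on record id `#{record_id}` at file position #{position}\n"
      msg += "    while executing #{step_description}\n"
      msg += "#{e.class}: #{e.message}"
      logger.error(msg)
      raise  # re-raised "at the moment", as the comment above says
    end

    logger = Logger.new($stderr)
    begin
      log_mapping_errors(logger, "ocm30584226", 42, "<to_field title at my_config.rb:10>") { raise "boom" }
    rescue RuntimeError
      # the error was logged with its context, then propagated
    end
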
@@ -335,13 +296,14 @@ class Traject::Indexer
        #thread_pool.maybe_in_thread_pool &make_lambda(count, record, writer)
        thread_pool.maybe_in_thread_pool do
          context = Context.new(:source_record => record, :settings => settings, :position => position)
+          context.logger = logger
          map_to_context!(context)
          if context.skip?
            log_skip(context)
          else
            writer.put context
          end
-
+
        end
 
      end

@@ -353,7 +315,7 @@ class Traject::Indexer
 
    thread_pool.raise_collected_exception!
 
-
+
    writer.close if writer.respond_to?(:close)
 
    elapsed = Time.now - start_time

@@ -367,7 +329,7 @@ class Traject::Indexer
 
    return true
  end
-
+
  # Log that the current record is being skipped, using
  # data in context.position and context.skipmessage
  def log_skip(context)
@@ -399,89 +361,10 @@ class Traject::Indexer
    return writer_class.new(settings.merge("logger" => logger))
  end
 
-  # get a printable id from record for error logging.
-  # Maybe override this for a future XML version.
-  def id_string(record)
-    record && record['001'] && record['001'].value.to_s
-  end
-
-
-
-
-  # Verify that the field name is good, and throw a useful error if not
-  def verify_field_name(field_name)
-    if field_name.nil? || !field_name.is_a?(String) || field_name.empty?
-      raise NamingError.new("to_field requires the field name (String) as the first argument (#{last_named_step.message})")
-    end
-  end
-
-
-  # Verify the various, increasingly-complex things that can be sent to to_field
-  # to make sure it's all kosher.
-  #
-  # "Modification" takes place for zero-argument blocks that return a lambda
-
-  def verify_to_field_arguments(field_name, aLambda, block)
-
-    verify_field_name(field_name)
-
-    [aLambda, block].each do |proc|
-      # allow negative arity, meaning variable/optional, trust em on that.
-      # but for positive arrity, we need 2 or 3 args
-      if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
-        raise ArityError.new("error parsing field '#{field_name}': block/proc given to to_field needs 2 or 3 (or variable) arguments: #{proc} (#{last_named_step.message})")
-      end
-    end
-
-  end
-
-  # Verify the procs sent to each_record to make sure it's all kosher.
-
-  def verify_each_record_arguments(aLambda, block)
-    unless aLambda or block
-      raise ArgumentError.new("Missing Argument: each_record must take a block/lambda as an argument (#{last_named_step.message})")
-    end
-
-    [aLambda, block].each do |proc|
-      # allow negative arity, meaning variable/optional, trust em on that.
-      # but for positive arrity, we need 1 or 2 args
-      if proc
-        unless proc.is_a?(Proc)
-          raise NamingError.new("argument to each_record must be a block/lambda, not a #{proc.class} (#{last_named_step.message})")
-        end
-        if (proc.arity == 0 || proc.arity > 2)
-          raise ArityError.new("block/proc given to each_record needs 1 or 2 arguments: #{proc} (#{last_named_step.message})")
-        end
-      end
-    end
-  end
-
-  def last_named_step
-    return LastNamedStep.new(@index_steps)
-  end
-
-
-  # A convenient way to find, and generate error messages for, the last named step (for helping locate parse errors)
-  class LastNamedStep
-    attr_accessor :step, :message
-
-    # Get the last step for which we have a field_name (e.g., the last to_field, skipping over each_record)
-    def initialize(index_steps)
-      @step = index_steps.reverse_each.find{|step| step[:field_name]}
-      if @step
-        @message = "last successfully parsed field was '#{@step[:field_name]}'"
-      else
-        @message = "there were no previous named fields successfully parsed"
-      end
-    end
-  end
-
-
-
  # Represents the context of a specific record being indexed, passed
  # to indexing logic blocks
  #
-  class
+  class Context
    def initialize(hash_init = {})
      # TODO, argument checking for required args?
 
@@ -491,29 +374,147 @@ class Traject::Indexer
      hash_init.each_pair do |key, value|
        self.send("#{key}=", value)
      end
-
+
      @skip = false
    end
 
-    attr_accessor :clipboard, :output_hash
-    attr_accessor :
+    attr_accessor :clipboard, :output_hash, :logger
+    attr_accessor :index_step, :source_record, :settings
    # 1-based position in stream of processed records.
    attr_accessor :position
-
+
    # Should we be skipping this record?
    attr_accessor :skipmessage
-
+
    # Set the fact that this record should be skipped, with an
    # optional message
    def skip!(msg = '(no message given)')
      @skipmessage = msg
      @skip = true
    end
-
+
    # Should we skip this record?
    def skip?
      @skip
    end
-
+
+  end
+
+
+
+  # An indexing step definition, including it's source location
+  # for logging
+  #
+  # This one represents an "each_record" step, a subclass below
+  # for "to_field"
+  #
+  # source_location is just a string with filename and line number for
+  # showing to devs in debugging.
+  class EachRecordStep
+    attr_accessor :source_location, :lambda, :block
+
+    def initialize(lambda, block, source_location)
+      self.lambda = lambda
+      self.block = block
+      self.source_location = source_location
+
+      self.validate!
+    end
+
+    # raises if bad data
+    def validate!
+      unless self.lambda or self.block
+        raise ArgumentError.new("Missing Argument: each_record must take a block/lambda as an argument (#{self.inspect})")
+      end
+
+      [self.lambda, self.block].each do |proc|
+        # allow negative arity, meaning variable/optional, trust em on that.
+        # but for positive arrity, we need 1 or 2 args
+        if proc
+          unless proc.is_a?(Proc)
+            raise NamingError.new("argument to each_record must be a block/lambda, not a #{proc.class} (#{self.inspect})")
+          end
+          if (proc.arity == 0 || proc.arity > 2)
+            raise ArityError.new("block/proc given to each_record needs 1 or 2 arguments: (#{self.inspect})")
+          end
+        end
+      end
+    end
+
+    # For each_record, always return an empty array as the
+    # accumulator, since it doesn't have those kinds of side effects
+    def execute(context)
+      [@lambda, @block].each do |aProc|
+        next unless aProc
+
+        if aProc.arity == 1
+          aProc.call(context.source_record)
+        else
+          aProc.call(context.source_record, context)
+        end
+
+      end
+      return [] # empty -- no accumulator for each_record
+    end
+
+    # Over-ride inspect for outputting error messages etc.
+    def inspect
+      "<each_record at #{source_location}>"
+    end
  end
+
+
+  # An indexing step definition for a "to_field" step to specific
+  # field.
+  class ToFieldStep
+    attr_accessor :field_name, :lambda, :block, :source_location
+    def initialize(fieldname, lambda, block, source_location)
+      self.field_name = fieldname
+      self.lambda = lambda
+      self.block = block
+      self.source_location = source_location
+
+      validate!
+    end
+
+    def validate!
+
+      if self.field_name.nil? || !self.field_name.is_a?(String) || self.field_name.empty?
+        raise NamingError.new("to_field requires the field name (as a string) as the first argument at #{self.source_location})")
+      end
+
+      [self.lambda, self.block].each do |proc|
+        # allow negative arity, meaning variable/optional, trust em on that.
+        # but for positive arrity, we need 2 or 3 args
+        if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
+          raise ArityError.new("error parsing field '#{self.field_name}': block/proc given to to_field needs 2 or 3 (or variable) arguments: #{proc} (#{self.inspect})")
+        end
+      end
+    end
+
+    # Override inspect for developer debug messages
+    def inspect
+      "<to_field #{self.field_name} at #{self.source_location}>"
+    end
+
+    def execute(context)
+      accumulator = []
+      [@lambda, @block].each do |aProc|
+        next unless aProc
+
+        if aProc.arity == 2
+          aProc.call(context.source_record, accumulator)
+        else
+          aProc.call(context.source_record, accumulator, context)
+        end
+
+      end
+      return accumulator
+    end
+
+  end
+
+
+
+
 end
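Both execute methods above pick the call form from the proc's arity (2 vs. 3 arguments for to_field logic, 1 vs. 2 for each_record). A minimal standalone illustration of that dispatch, using plain procs and a hash in place of traject's Context:

    two_arg   = lambda { |record, accumulator| accumulator << record.upcase }
    three_arg = lambda { |record, accumulator, context| accumulator << "#{record} (position #{context[:position]})" }

    def run_to_field_logic(a_proc, record, accumulator, context)
      if a_proc.arity == 2
        a_proc.call(record, accumulator)
      else
        a_proc.call(record, accumulator, context)
      end
    end

    acc = []
    run_to_field_logic(two_arg,   "some title", acc, { :position => 1 })
    run_to_field_logic(three_arg, "some title", acc, { :position => 1 })
    p acc  # => ["SOME TITLE", "some title (position 1)"]
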
data/lib/traject/macros/marc21.rb
CHANGED

@@ -1,5 +1,6 @@
 require 'traject/marc_extractor'
 require 'traject/translation_map'
+require 'traject/util'
 require 'base64'
 require 'json'
 

@@ -30,7 +31,22 @@ module Traject::Macros
    # to_field("title"), extract_marc("245abcd", :trim_punctuation => true)
    # to_field("id"), extract_marc("001", :first => true)
    # to_field("geo"), extract_marc("040a", :separator => nil, :translation_map => "marc040")
+
+
+    # A list of symbols that are valid keys in the options hash
+    EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,
+                                  :deduplicate, :uniq, :separator, :translation_map,
+                                  :alternate_script]
+
    def extract_marc(spec, options = {})
+
+      # Raise an error if there are any invalid options, indicating a
+      # misspelled or illegal option, using a string instead of a symbol, etc.
+
+      unless (options.keys - EXTRACT_MARC_VALID_OPTIONS).empty?
+        raise RuntimeError.new("Illegal/Unknown argument '#{options.keys.join(', ')}' in extract_marc at #{Traject::Util.extract_caller_location(caller.first)}")
+      end
+
      only_first = options.delete(:first)
      trim_punctuation = options.delete(:trim_punctuation)
      default_value = options.delete(:default)

@@ -46,6 +62,7 @@ module Traject::Macros
      if translation_map_arg = options.delete(:translation_map)
        translation_map = Traject::TranslationMap.new(translation_map_arg)
      end
+
 
      extractor = Traject::MarcExtractor.new(spec, options)
 

@@ -93,7 +110,14 @@ module Traject::Macros
    # serialized, with certain header bytes filled with ascii 0's
    # -- technically illegal MARC, but can still be read by
    # ruby MARC::Reader in permissive mode.
+
+    SERIALZED_MARC_VALID_OPTIONS = [:format, :binary_escape, :allow_oversized, :format]
+
    def serialized_marc(options)
+      unless (options.keys - SERIALZED_MARC_VALID_OPTIONS).empty?
+        raise RuntimeError.new("Illegal/Unknown argument '#{options.keys.join(', ')}' in seralized_marc at #{Traject::Util.extract_caller_location(caller.first)}")
+      end
+
      format = options[:format].to_s
      binary_escape = (options[:binary_escape] != false)
      allow_oversized = (options[:allow_oversized] == true)

@@ -129,7 +153,13 @@ module Traject::Macros
    #
    # Can always run this thing multiple times on the same field if you need
    # non-contiguous ranges of fields.
+
+    EXTRACT_ALL_MARC_VALID_OPTIONS = [:separator, :from, :to]
+
    def extract_all_marc_values(options = {})
+      unless (options.keys - EXTRACT_ALL_MARC_VALID_OPTIONS).empty?
+        raise RuntimeError.new("Illegal/Unknown argument '#{options.keys.join(', ')}' in extract_all_marc at #{Traject::Util.extract_caller_location(caller.first)}")
+      end
      options = {:from => "100", :to => "899", :separator => ' '}.merge(options)
 
      lambda do |record, accumulator, context|
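Each macro above now checks its options hash against a whitelist of known keys and raises with the caller's file and line, so a typo in a config file fails fast. A standalone sketch of that check (constant and method names here are illustrative, not traject's):

    VALID_OPTIONS = [:first, :trim_punctuation, :default, :deduplicate, :uniq,
                     :separator, :translation_map, :alternate_script]

    def check_options!(options, caller_location)
      unknown = options.keys - VALID_OPTIONS
      unless unknown.empty?
        raise "Illegal/Unknown argument '#{unknown.join(', ')}' in extract_marc at #{caller_location}"
      end
    end

    check_options!({ :first => true }, "my_config.rb:12")   # passes silently

    begin
      check_options!({ :misspelled => "Who cares" }, "my_config.rb:13")
    rescue RuntimeError => e
      puts e.message   # points back at the offending config line
    end
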
data/lib/traject/marc_extractor.rb
CHANGED

@@ -135,13 +135,23 @@ module Traject
  #  "008[35-37]:LDR[5]"
  #  => bytes 35-37 inclusive of field 008, and byte 5 of the marc leader.
  #
-  # Returns a nested hash
-  #
-  #
-  #
-  #
-  #
-  #
+  # Returns a nested hash whose keys are tags and whose value is an array
+  # of hash structures indicating what indicators and subfields (or
+  # byte-offsets for control fields) are needed, e.g.
+  #
+  #  '245|1*|a:245ab:110:008[15-17]:008[17]' would give us
+  #
+  #  {
+  #    '245' => [
+  #          {:indicators => ['1', nil], :subfields=>['a']},
+  #          {:subfields => ['a', 'b']}
+  #    ]
+  #    '110' => [{}] # all subfields, indicators don't matter
+  #    '008' => [
+  #          {:bytes => (15..17)}
+  #          {:bytes => 17}
+  #    ]
+  #  }
  #
  # * subfields and indicators can only be provided for marc data/variable fields
  # * byte slice can only be provided for marc control fields (generally tags less than 010)

@@ -156,26 +166,31 @@ module Traject
        # variable field
        tag, indicators, subfields = $1, $3, $4
 
-        hash[tag] ||=
+        hash[tag] ||= []
+        spec = {}
 
-        if subfields
-          subfields
-          hash[tag][:subfields] ||= Array.new
-          hash[tag][:subfields] << subfield
-        end
+        if subfields and !subfields.empty?
+          spec[:subfields] = subfields.split('')
        end
+
        if indicators
-
+          spec[:indicators] = [ (indicators[0] if indicators[0] != "*"), (indicators[1] if indicators[1] != "*") ]
        end
+
+        hash[tag] << spec
+
      elsif (part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/) # "005[4-5]"
        tag, byte1, byte2 = $1, $3, $5
-        hash[tag] ||=
+        hash[tag] ||= []
+        spec = {}
 
        if byte1 && byte2
-
+          spec[:bytes] = ((byte1.to_i)..(byte2.to_i))
        elsif byte1
-
+          spec[:bytes] = byte1.to_i
        end
+
+        hash[tag] << spec
      else
        raise ArgumentError.new("Unrecognized marc extract specification: #{part}")
      end

@@ -210,15 +225,18 @@ module Traject
  def each_matching_line(marc_record)
    marc_record.fields(@interesting_tags_hash.keys).each do |field|
 
-
+      specs = spec_covering_field(field)
 
      # Don't have a spec that addresses this field? Move on.
-      next unless
+      next unless specs
 
      # Make sure it matches indicators too, spec_covering_field
      # doens't check that.
-
-
+
+      specs.each do |spec|
+        if matches_indicators(field, spec)
+          yield(field, spec, self)
+        end
      end
    end
  end
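The new comment block documents the shape parse_string_spec now returns: every tag maps to an array of small spec hashes, which is what makes repeated tags in one specification string possible. Writing the documented example out as a plain Ruby literal:

    # The structure described above for '245|1*|a:245ab:110:008[15-17]:008[17]'.
    parsed = {
      '245' => [
        { :indicators => ['1', nil], :subfields => ['a'] },
        { :subfields  => ['a', 'b'] }
      ],
      '110' => [ {} ],   # all subfields, indicators don't matter
      '008' => [
        { :bytes => (15..17) },
        { :bytes => 17 }
      ]
    }

    # Because each tag keys an array, a repeated tag simply contributes another spec:
    parsed['245'].each_with_index do |spec, i|
      puts "245 spec #{i}: subfields=#{spec[:subfields].inspect} indicators=#{spec[:indicators].inspect}"
    end
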
data/lib/traject/solrj_writer.rb
CHANGED

@@ -109,7 +109,7 @@ class Traject::SolrJWriter
 
    @debug_ascii_progress = (@settings["debug_ascii_progress"].to_s == "true")
 
-    logger.info("
+    logger.info("   #{self.class.name} writing to '#{settings['solr.url']}'")
  end
 
  # Loads solrj if not already loaded. By loading all jars found
data/lib/traject/version.rb
CHANGED

data/test/indexer/each_record_test.rb
CHANGED

@@ -31,14 +31,14 @@ describe "Traject::Indexer#each_record" do
    end
  end
 
-  it "
+  it "outputs error with source location" do
    begin
      @indexer.to_field('foo') {|one, two| }
      @indexer.each_record {|one, two, three| } # bad arity
      flunk("Should have rejected bad arity ")
    rescue Traject::Indexer::ArityError => e
-      assert_match(/
-      rescue
+      assert_match(/each_record at .*\/.*:\d+/, e.message)
+    rescue
      flunk("Should only fail with a ArityError")
    end
  end
@@ -53,7 +53,7 @@ describe "Traject::Indexer#each_record" do
    assert_raises(ArgumentError) do
      @indexer.each_record()
    end
-  end
+  end
 
  end
end
data/test/indexer/macros_marc21_test.rb
CHANGED

@@ -75,6 +75,15 @@ describe "Traject::Macros::Marc21" do
 
  end
 
+  it "fails on an extra/misspelled argument to extract_marc" do
+    assert_raises(RuntimeError) do
+      @indexer.instance_eval do
+        to_field "foo", extract_marc("9999", :misspelled => "Who cares")
+      end
+    end
+  end
+
+
 
 
  it "Marc21::trim_punctuation class method" do
data/test/indexer/read_write_test.rb
CHANGED

@@ -34,7 +34,11 @@ describe "Traject::Indexer#process" do
    @indexer.to_field("title") do |record, accumulator, context|
      times_called += 1
      accumulator << "ADDED TITLE"
-
+
+      assert context.index_step, "Context has #index_step set"
+      assert_equal "title", context.index_step.field_name
+
+      assert context.logger, "Context knows #logger"
 
      assert_equal times_called, context.position
    end
data/test/indexer/to_field_test.rb
CHANGED

@@ -40,30 +40,17 @@ describe "Traject::Indexer.to_field" do
    end
  end
 
-
-
-
-
-
-
-
-
-
-      flunk("Should only fail with a NamingError")
-    end
-  end
-
-  it "finds first (only) field on error" do
-    begin
-      @indexer.to_field('foo') {|one, two| }
-      @indexer.to_field('') {|one, two| } # bad field name
-      flunk("Should have rejected empty field name")
-    rescue Traject::Indexer::NamingError => e
-      assert_match(/foo/, e.message)
-    rescue
-      flunk("Should only fail with a NamingError")
-    end
+  it "outputs error with source location" do
+    begin
+      @indexer.to_field('foo') {|one, two| }
+      @indexer.to_field('') {|one, two| } # bad field name
+      flunk("Should have rejected empty field name")
+    rescue Traject::Indexer::NamingError => e
+      assert_match(/at .*\/.*:\d+/, e.message)
+    rescue
+      flunk("Should only fail with a NamingError")
    end
  end
+
 
end
data/test/marc_extractor_test.rb
CHANGED

@@ -12,43 +12,47 @@ describe "Traject::MarcExtractor" do
 
    assert_kind_of Hash, parsed
    assert_equal 1, parsed.keys.length
-
+    spec = parsed['245'].first
+    assert_kind_of Hash, spec
 
-    assert_kind_of Array,
-    assert_equal 2,
-    assert_equal "1",
-    assert_nil
+    assert_kind_of Array, spec[:indicators]
+    assert_equal 2, spec[:indicators].length
+    assert_equal "1", spec[:indicators][0]
+    assert_nil spec[:indicators][1]
 
-    assert_kind_of Array,
+    assert_kind_of Array, spec[:subfields]
 
  end
 
  it "parses a mixed bag" do
    parsed = Traject::MarcExtractor.parse_string_spec("245abcde:810:700|*4|bcd")
+    spec245 = parsed['245'].first
+    spec810 = parsed['810'].first
+    spec700 = parsed['700'].first
 
    assert_length 3, parsed
 
    #245abcde
-    assert
-    assert_nil
-    assert_equal %w{a b c d e},
+    assert spec245
+    assert_nil spec245[:indicators]
+    assert_equal %w{a b c d e}, spec245[:subfields]
 
    #810
-    assert
-    assert_nil
-    assert_nil
+    assert spec810
+    assert_nil spec810[:indicators]
+    assert_nil spec810[:subfields], "No subfields"
 
    #700-*4bcd
-    assert
-    assert_equal [nil, "4"],
-    assert_equal %w{b c d},
+    assert spec700
+    assert_equal [nil, "4"], spec700[:indicators]
+    assert_equal %w{b c d}, spec700[:subfields]
  end
 
  it "parses fixed field byte offsets" do
    parsed = Traject::MarcExtractor.parse_string_spec("005[5]:008[7-10]")
 
-    assert_equal 5, parsed["005"][:bytes]
-    assert_equal 7..10, parsed["008"][:bytes]
+    assert_equal 5, parsed["005"].first[:bytes]
+    assert_equal 7..10, parsed["008"].first[:bytes]
  end
 
  it "allows arrays of specs" do
@@ -98,7 +102,7 @@ describe "Traject::MarcExtractor" do
      assert ! @a880_100.nil?, "Found an 880-100 to test"
    end
    it "finds spec for relevant 880" do
-      assert_equal( {}, @extractor.spec_covering_field(@a880_245) )
+      assert_equal( [{}], @extractor.spec_covering_field(@a880_245) )
      assert_nil @extractor.spec_covering_field(@a880_100)
    end
    it "does not find spec for 880 if disabled" do

@@ -108,7 +112,7 @@ describe "Traject::MarcExtractor" do
    it "finds only 880 if so configured" do
      @extractor = Traject::MarcExtractor.new("245", :alternate_script => :only)
      assert_nil @extractor.spec_covering_field(@a245)
-      assert_equal({}, @extractor.spec_covering_field(@a880_245))
+      assert_equal([{}], @extractor.spec_covering_field(@a880_245))
    end
  end
end

@@ -289,7 +293,7 @@ describe "Traject::MarcExtractor" do
  describe "MarcExtractor.cached" do
    it "creates" do
      ext = Traject::MarcExtractor.cached("245abc", :separator => nil)
-      assert_equal({"245"=>{:subfields=>["a", "b", "c"]}}, ext.spec_hash)
+      assert_equal({"245"=>[{:subfields=>["a", "b", "c"]}]}, ext.spec_hash)
      assert ext.options[:separator].nil?, "extractor options[:separator] is nil"
    end
    it "caches" do
@@ -301,4 +305,53 @@ describe "Traject::MarcExtractor" do
    end
  end
 
+  describe "Allows multiple uses of the same tag" do
+    before do
+      @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
+    end
+
+    it "allows repated tags for a variable field" do
+      extractor = Traject::MarcExtractor.new("245a:245b")
+      values = extractor.extract(@record)
+      assert_equal ['Manufacturing consent :', 'the political economy of the mass media /'], values
+    end
+
+    it "allows repeated tags with indicators specs" do
+      extractor = Traject::MarcExtractor.new("245|1*|a:245|2*|b")
+      @record.append(MARC::DataField.new('245', '2', '0', ['a', 'Subfield A Value'], ['b', 'Subfield B Value']))
+      results = extractor.extract(@record)
+      assert_equal ['Manufacturing consent :', 'Subfield B Value'], results
+    end
+
+
+
+
+    it "works the same as ::separator=>nil" do
+      ex1 = Traject::MarcExtractor.new("245a:245b")
+      ex2 = Traject::MarcExtractor.new("245ab", :separator=>nil)
+      assert_equal ex1.extract(@record), ex2.extract(@record)
+    end
+
+
+    it "allows repeated tags for a control field" do
+      extractor = Traject::MarcExtractor.new("001[0-1]:001[0-3]")
+      values = extractor.extract(@record)
+      assert_equal ["27", "2710"], values
+    end
+
+    it "associates indicators properly with repeated tags" do
+      @record = MARC::Record.new
+      @record.append MARC::DataField.new("100", '1', ' ', ['a', '100a first indicator 1'], ['b', 'should not include 100|1|b'])
+      @record.append MARC::DataField.new("100", '2', ' ', ['b', '100b first indicator 2'], ['a', 'should not include 100|2|a'])
+
+      extractor = Traject::MarcExtractor.new("100|1*|a:100|2*|b")
+
+      values = extractor.extract(@record)
+
+      assert_equal ['100a first indicator 1', '100b first indicator 2'], values
+    end
+
+  end
+
+
 end
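The new tests above exercise repeated tags directly through the public extractor. A small usage sketch assembled from those tests -- it assumes the traject and marc gems are installed, and builds a synthetic record instead of the fixture file the tests read:

    require 'marc'
    require 'traject/marc_extractor'

    record = MARC::Record.new
    record.append MARC::DataField.new('245', '1', '0',
      ['a', 'Manufacturing consent :'], ['b', 'the political economy of the mass media /'])

    # same tag twice in one spec string: one value per subfield, in spec order
    extractor = Traject::MarcExtractor.new("245a:245b")
    p extractor.extract(record)
    # => ["Manufacturing consent :", "the political economy of the mass media /"]
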
data/traject.gemspec
CHANGED

@@ -6,7 +6,7 @@ require 'traject/version'
 Gem::Specification.new do |spec|
   spec.name = "traject"
   spec.version = Traject::VERSION
-  spec.authors = ["Jonathan Rochkind"]
+  spec.authors = ["Jonathan Rochkind", "Bill Dueber"]
   spec.email = ["none@nowhere.org"]
   spec.summary = %q{Index MARC to Solr; or generally process source records to hash-like structures}
   spec.homepage = "http://github.com/jrochkind/traject"