traject 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ require 'traject/line_writer'
2
+
3
+ # A writer for Traject::Indexer that outputs each record as a series of
4
+ # lines, prefixed by the id, one for each field and its values.
5
+ # Multiple values are separated by pipes
6
+ #
7
+ # Applicable settings:
8
+ #
9
+ # - 'output_file' -- the name of the file to output to
10
+ # - 'output_stream' -- alternately, the IO stream
11
+ # - 'debug_writer.idfield' -- the solr field from which to pull the record ID (default: 'id')
12
+ # - 'debug_writer.format' -- How to format the id/solr field/values (default: '%-12s %-25s %s')
13
+
14
+
15
# Writer that renders each record as human-readable lines, one line per
# output field, each prefixed with the record id. Multiple values of a
# field are joined with ' | '. Output destination handling is inherited
# from Traject::LineWriter.
class Traject::DebugWriter < Traject::LineWriter
  DEFAULT_FORMAT  = '%-12s %-25s %s'
  DEFAULT_IDFIELD = 'id'

  # Shown in the id column when the record has no value in the id field.
  MISSING_ID = '<no id>'

  # Builds the text block for one record.
  #
  # context - object responding to #output_hash, a Hash of field name to
  #           an Array of values.
  #
  # Returns a String: one formatted line per field (fields sorted by name),
  # followed by a blank separator line.
  def serialize(context)
    idfield = settings["debug_writer.idfield"] || DEFAULT_IDFIELD
    fmt     = settings['debug_writer.format'] || DEFAULT_FORMAT
    h       = context.output_hash

    # Resolve the id once, not per field. Array() tolerates a missing id
    # field (nil) or a scalar id instead of raising NoMethodError on #first.
    record_id = Array(h[idfield]).first || MISSING_ID

    lines = h.keys.sort.map { |k| fmt % [record_id, k, h[k].join(' | ')] }
    # A trailing "\n" element makes the joined string end in a blank line.
    lines.push "\n"
    lines.join("\n")
  end
end
@@ -50,6 +50,13 @@ require 'traject/macros/basic'
50
50
  # with a String name of class meeting the Writer contract.
51
51
  #
52
52
  class Traject::Indexer
53
+
54
+ # Arity error on a passed block
55
+ class ArityError < ArgumentError; end
56
+ class NamingError < ArgumentError; end
57
+
58
+
59
+
53
60
  include Traject::QualifiedConstGet
54
61
 
55
62
  attr_writer :reader_class, :writer_class
@@ -143,20 +150,13 @@ class Traject::Indexer
143
150
  end
144
151
 
145
152
 
153
+
154
+
155
+
146
156
  # Used to define an indexing mapping.
147
157
  def to_field(field_name, aLambda = nil, &block)
148
158
 
149
- if field_name.nil? || field_name.empty?
150
- raise ArgumentError.new("to_field requires a non-blank first argument, field name")
151
- end
152
- [aLambda, block].each do |proc|
153
- # allow negative arity, meaning variable/optional, trust em on that.
154
- # but for positive arrity, we need 2 or 3 args
155
- if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
156
- raise ArgumentError.new("block/proc given to to_field needs 2 or 3 arguments: #{proc}")
157
- end
158
- end
159
-
159
+ verify_to_field_arguments(field_name, aLambda, block)
160
160
 
161
161
  @index_steps << {
162
162
  :field_name => field_name.to_s,
@@ -168,15 +168,7 @@ class Traject::Indexer
168
168
  end
169
169
 
170
170
  def each_record(aLambda = nil, &block)
171
- # arity check
172
- [aLambda, block].each do |proc|
173
- # allow negative arity, meaning variable/optional, trust em on that.
174
- # but for positive arrity, we need 1 or 2 args
175
- if proc && (proc.arity == 0 || proc.arity > 2)
176
- raise ArgumentError.new("block/proc given to to_field needs 1 or 2 arguments: #{proc}")
177
- end
178
- end
179
-
171
+ verify_each_record_arguments(aLambda, block)
180
172
  @index_steps << {
181
173
  :lambda => aLambda,
182
174
  :block => block,
@@ -394,6 +386,78 @@ class Traject::Indexer
394
386
  end
395
387
 
396
388
 
389
+
390
+
391
+ # Verify that the field name is good, and throw a useful error if not
392
# Ensures the field name handed to to_field is a non-empty String.
#
# Raises NamingError (with a hint naming the last successfully parsed
# field) for nil, non-String, or empty names. Returns nil otherwise.
def verify_field_name(field_name)
  return if field_name.is_a?(String) && !field_name.empty?

  raise NamingError, "to_field requires the field name (String) as the first argument (#{last_named_step.message})"
end
397
+
398
+
399
+ # Verify the various, increasingly-complex things that can be sent to to_field
400
+ # to make sure it's all kosher.
401
+ #
402
+ # "Modification" takes place for zero-argument blocks that return a lambda
403
+
404
# Validates everything passed to to_field: the field name, then the
# optional lambda and block.
#
# field_name - checked via verify_field_name.
# aLambda    - optional callable given as a positional argument.
# block      - optional block given to to_field.
#
# Raises NamingError if a supplied argument is not callable-like (does not
# respond to #arity), e.g. a String passed where a lambda was expected.
# Raises ArityError if a callable has a fixed arity other than 2 or 3.
def verify_to_field_arguments(field_name, aLambda, block)
  verify_field_name(field_name)

  [aLambda, block].each do |callable|
    next unless callable

    # Without this guard a stray String/Symbol argument fails with an
    # opaque NoMethodError on #arity instead of a parse-location hint.
    unless callable.respond_to?(:arity)
      raise NamingError, "error parsing field '#{field_name}': argument to to_field must be a block/lambda, not a #{callable.class} (#{last_named_step.message})"
    end

    # Negative arity means variable/optional arguments; trust the caller.
    # For fixed arity, only 2 or 3 arguments are accepted.
    arity = callable.arity
    if arity == 0 || arity == 1 || arity > 3
      raise ArityError, "error parsing field '#{field_name}': block/proc given to to_field needs 2 or 3 (or variable) arguments: #{callable} (#{last_named_step.message})"
    end
  end
end
417
+
418
+ # Verify the procs sent to each_record to make sure it's all kosher.
419
+
420
# Validates the lambda and/or block handed to each_record.
#
# Raises ArgumentError when neither is supplied, NamingError when a
# supplied argument is not a Proc, and ArityError when a Proc has a fixed
# arity other than 1 or 2.
def verify_each_record_arguments(aLambda, block)
  if !aLambda && !block
    raise ArgumentError, "Missing Argument: each_record must take a block/lambda as an argument (#{last_named_step.message})"
  end

  [aLambda, block].each do |candidate|
    next unless candidate

    unless candidate.is_a?(Proc)
      raise NamingError, "argument to each_record must be a block/lambda, not a #{candidate.class} (#{last_named_step.message})"
    end

    # Negative arity means variable/optional arguments and is accepted;
    # a fixed arity must be 1 or 2.
    arg_count = candidate.arity
    if arg_count == 0 || arg_count > 2
      raise ArityError, "block/proc given to each_record needs 1 or 2 arguments: #{candidate} (#{last_named_step.message})"
    end
  end
end
438
+
439
# Wraps the current @index_steps in a LastNamedStep, used to build
# "last successfully parsed field" hints for error messages.
def last_named_step
  LastNamedStep.new(@index_steps)
end
442
+
443
+
444
+ # A convenient way to find, and generate error messages for, the last named step (for helping locate parse errors)
445
# Finds the most recent index step that carries a :field_name and exposes
# a ready-made message fragment describing it.
class LastNamedStep
  attr_accessor :step, :message

  # index_steps - Array of step Hashes; steps without a :field_name
  #               (e.g. each_record steps) are skipped.
  def initialize(index_steps)
    @step = index_steps.reverse_each.detect { |candidate| candidate[:field_name] }
    @message =
      if @step
        "last successfully parsed field was '#{@step[:field_name]}'"
      else
        "there were no previous named fields successfully parsed"
      end
  end
end
458
+
459
+
460
+
397
461
  # Represents the context of a specific record being indexed, passed
398
462
  # to indexing logic blocks
399
463
  #
@@ -1,7 +1,7 @@
1
1
  require 'hashie'
2
2
 
3
3
  # A Hash of settings for a Traject::Indexer, which also ends up passed along
4
- # to other objects Traject::Indexer interacts with.
4
+ # to other objects Traject::Indexer interacts with.
5
5
  #
6
6
  # Enhanced with a few features from Hashie, to make it for
7
7
  # instance string/symbol indifferent
@@ -71,5 +71,13 @@ class Traject::Indexer
71
71
  "processing_thread_pool" => 3
72
72
  }
73
73
  end
74
+
75
# Returns the inspect string of a copy of these settings in which the
# value of every key ending in "password" is replaced by "[hidden]", so
# that secrets do not show up in inspect output. The receiver itself is
# not modified.
def inspect
  each_with_object({}) do |(key, value), visible|
    visible[key] = (key =~ /password\Z/) ? "[hidden]" : value
  end.inspect
end
74
82
  end
75
83
  end
@@ -1,53 +1,30 @@
1
1
  require 'json'
2
+ require 'traject/line_writer'
2
3
 
3
4
  # A writer for Traject::Indexer, that just writes out
4
5
  # all the output as Json. It's newline delimited json, but
5
6
  # right now no checks to make sure there is no internal newlines
6
- # as whitespace in the json. TODO, add that.
7
+ # as whitespace in the json. TODO, add that.
7
8
  #
8
- # Not currently thread-safe (have to make sure whole object and newline
9
- # get written without context switch. Can be made so.)
9
+ # Should be thread-safe (ie, multiple worker threads can be calling #put
10
+ # concurrently), by wrapping write to actual output file in a mutex synchronize.
11
+ # This does not seem to affect performance much, as far as I could tell
12
+ # benchmarking.
10
13
  #
11
14
  # You can force pretty-printing with setting 'json_writer.pretty_print' of boolean
12
- # true or string 'true'. Useful mostly for human checking of output.
15
+ # true or string 'true'. Useful mostly for human checking of output.
13
16
  #
14
17
  # Output will be sent to settings["output_file"] string path, or else
15
- # settings["output_stream"] (ruby IO object), or else stdout.
16
- class Traject::JsonWriter
17
- attr_reader :settings
18
+ # settings["output_stream"] (ruby IO object), or else stdout.
19
+ class Traject::JsonWriter < Traject::LineWriter
18
20
 
19
- def initialize(argSettings)
20
- @settings = argSettings
21
- end
22
-
23
- def put(context)
21
+ def serialize(context)
24
22
  hash = context.output_hash
25
-
26
- serialized =
27
- if settings["json_writer.pretty_print"]
28
- JSON.pretty_generate(hash)
29
- else
30
- JSON.generate(hash)
31
- end
32
- output_file.puts(serialized)
33
- end
34
-
35
- def output_file
36
- unless defined? @output_file
37
- @output_file =
38
- if settings["output_file"]
39
- File.open(settings["output_file"], 'w:UTF-8')
40
- elsif settings["output_stream"]
41
- settings["output_stream"]
42
- else
43
- $stdout
44
- end
23
+ if settings["json_writer.pretty_print"]
24
+ JSON.pretty_generate(hash)
25
+ else
26
+ JSON.generate(hash)
45
27
  end
46
- return @output_file
47
- end
48
-
49
- def close
50
- @output_file.close unless (@output_file.nil? || @output_file.tty?)
51
- end
28
+ end
52
29
 
53
30
  end
@@ -0,0 +1,59 @@
1
+ require 'thread'
2
+
3
+ # A writer for Traject::Indexer, that just writes out
4
+ # all the output as serialized text with #puts.
5
+ #
6
+ # Should be thread-safe (ie, multiple worker threads can be calling #put
7
+ # concurrently), by wrapping write to actual output file in a mutex synchronize.
8
+ # This does not seem to affect performance much, as far as I could tell
9
+ # benchmarking.
10
+ #
11
+ # Output will be sent to settings["output_file"] string path, or else
12
+ # settings["output_stream"] (ruby IO object), or else stdout.
13
+ #
14
+ # This class can be sub-classed to write out different serialized
15
+ # representations -- subclasses will just override the #serialize
16
+ # method. For instance, see JsonWriter.
17
# Base writer that emits one serialized record per #puts call.
#
# Output goes to the path in settings["output_file"], else the stream in
# settings["output_stream"], else $stdout. Writes are wrapped in a mutex.
# Subclasses override #serialize to choose the textual representation.
class Traject::LineWriter
  attr_reader :settings
  attr_reader :write_mutex

  # argSettings - settings object supporting [] lookups by String key.
  def initialize(argSettings)
    @settings    = argSettings
    @write_mutex = Mutex.new

    # Resolve the output stream now, so the lazy initialization in
    # #output_file never runs for the first time inside a concurrent #put.
    output_file
  end

  # Default serialization: the record's output hash itself. Subclasses
  # are expected to override this.
  def serialize(context)
    context.output_hash
  end

  # Serializes the record outside the lock, then writes it under the
  # mutex so a record is written without interleaving from another #put.
  def put(context)
    serialized = serialize(context)
    write_mutex.synchronize do
      output_file.puts(serialized)
    end
  end

  # Returns the output stream, choosing it on first use: a file opened
  # for writing as UTF-8, the configured stream, or $stdout.
  def output_file
    unless defined? @output_file
      @output_file =
        if settings["output_file"]
          File.open(settings["output_file"], 'w:UTF-8')
        elsif settings["output_stream"]
          settings["output_stream"]
        else
          $stdout
        end
    end
    @output_file
  end

  # Closes the output stream, except for terminals and the process-wide
  # standard streams. Closing $stdout/$stderr when they are piped or
  # redirected would break every later write by the host program, so
  # those are only flushed.
  def close
    return if @output_file.nil?

    if standard_stream?(@output_file)
      @output_file.flush if @output_file.respond_to?(:flush)
      return
    end

    @output_file.close unless @output_file.tty?
  end

  private

  # True when io is one of the process's standard output/error streams.
  def standard_stream?(io)
    [$stdout, $stderr, STDOUT, STDERR].any? { |std| std.equal?(io) }
  end
end
@@ -35,16 +35,21 @@ module Traject::Macros
35
35
  trim_punctuation = options.delete(:trim_punctuation)
36
36
  default_value = options.delete(:default)
37
37
 
38
- # We create the TranslationMap here on load, not inside the closure
39
- # where it'll be called for every record. Since TranslationMap is supposed
40
- # to cache, prob doesn't matter, but doens't hurt. Also causes any syntax
41
- # exceptions to raise on load.
38
+ # We create the TranslationMap and the MarcExtractor here
39
+ # on load, so the lambda can just refer to already created
40
+ # ones, and not have to create a new one per-execution.
41
+ #
42
+ # Benchmarking shows for MarcExtractor at least, there is
43
+ # significant performance advantage.
44
+
42
45
  if translation_map_arg = options.delete(:translation_map)
43
46
  translation_map = Traject::TranslationMap.new(translation_map_arg)
44
47
  end
45
48
 
49
+ extractor = Traject::MarcExtractor.new(spec, options)
50
+
46
51
  lambda do |record, accumulator, context|
47
- accumulator.concat Traject::MarcExtractor.extract_by_spec(record, spec, options)
52
+ accumulator.concat extractor.extract(record)
48
53
 
49
54
  if only_first
50
55
  Marc21.first! accumulator
@@ -11,19 +11,30 @@ module Traject::Macros
11
11
  # shortcut
12
12
  MarcExtractor = Traject::MarcExtractor
13
13
 
14
- # Extract OCLC numbers from, by default 035a's, then strip known prefixes to get
14
+ # Extract OCLC numbers from, by default, 035a's: values with a known OCLC prefix are kept, stripped to
15
15
  # just the num, and de-dup.
16
16
  def oclcnum(extract_fields = "035a")
17
+ extractor = MarcExtractor.new(extract_fields, :seperator => nil)
18
+
17
19
  lambda do |record, accumulator|
18
- list = MarcExtractor.extract_by_spec(record, extract_fields, :seperator => nil).collect! do |o|
19
- Marc21Semantics.oclcnum_trim(o)
20
- end
20
+ list = extractor.extract(record).collect! do |o|
21
+ Marc21Semantics.oclcnum_extract(o)
22
+ end.compact
21
23
 
22
24
  accumulator.concat list.uniq if list
23
25
  end
24
26
  end
25
- def self.oclcnum_trim(num)
26
- num.gsub(/\A(ocm)|(ocn)|(on)|(\(OCoLC\))/, '')
27
+ # If a num begins with a known OCLC prefix, return it without the prefix.
28
+ # otherwise nil.
29
def self.oclcnum_extract(num)
  # The whole alternation is grouped so \A applies to every prefix, and
  # sub (not gsub) removes only the leading one. Ungrouped, only "ocm" was
  # anchored, so an "on" anywhere in the value (e.g. "London") was removed
  # and the value was wrongly treated as prefixed.
  stripped = num.sub(/\A(?:ocm|ocn|on|\(OCoLC\))/, '')

  # Unchanged means there was no known prefix: signal that with nil.
  stripped == num ? nil : stripped
end
28
39
 
29
40
 
@@ -47,12 +58,13 @@ module Traject::Macros
47
58
  accumulator << Marc21Semantics.get_sortable_author(record)
48
59
  end
49
60
  end
61
+
50
62
  def self.get_sortable_author(record)
51
- onexx = MarcExtractor.extract_by_spec(record, "100:110:111", :first => true).first
63
+ onexx = MarcExtractor.cached("100:110:111", :first => true).extract(record).first
52
64
  onexx = onexx.strip if onexx
53
65
 
54
66
  titles = []
55
- MarcExtractor.new(record, "240:245", :first => true).each_matching_line do |field, spec|
67
+ MarcExtractor.cached("240:245", :first => true).each_matching_line(record) do |field, spec|
56
68
  non_filing = field.indicator2.to_i
57
69
 
58
70
  str = field.subfields.collect {|sf| sf.value}.join(" ")
@@ -72,8 +84,9 @@ module Traject::Macros
72
84
  accumulator << Marc21Semantics.get_sortable_title(record)
73
85
  end
74
86
  end
87
+
75
88
  def self.get_sortable_title(record)
76
- MarcExtractor.new(record, "245ab").collect_matching_lines do |field, spec, extractor|
89
+ MarcExtractor.cached("245ab").collect_matching_lines(record) do |field, spec, extractor|
77
90
  str = extractor.collect_subfields(field, spec).first
78
91
 
79
92
  if str.nil?
@@ -105,8 +118,10 @@ module Traject::Macros
105
118
  def marc_languages(spec = "008[35-37]:041a:041d")
106
119
  translation_map = Traject::TranslationMap.new("marc_languages")
107
120
 
121
+ extractor = MarcExtractor.new(spec, :seperator => nil)
122
+
108
123
  lambda do |record, accumulator|
109
- codes = MarcExtractor.new(record, spec, :seperator => "nil").collect_matching_lines do |field, spec, extractor|
124
+ codes = extractor.collect_matching_lines(record) do |field, spec, extractor|
110
125
  if extractor.control_field?(field)
111
126
  (spec[:bytes] ? field.value.byteslice(spec[:bytes]) : field.value)
112
127
  else
@@ -134,10 +149,12 @@ module Traject::Macros
134
149
  # already covered by another field we're including, so we don't want to double count it, possibly
135
150
  # with slight variation.
136
151
  def marc_series_facet(spec = "440a:490a:800abcdt:810abcdt:811acdeft:830adfgklmnoprst")
152
+ extractor = MarcExtractor.new(spec)
153
+
137
154
  lambda do |record, accumulator|
138
- MarcExtractor.new(record, spec).collect_matching_lines do |field, spec, extractor|
155
+ accumulator.concat( extractor.collect_matching_lines(record) do |field, spec, extractor|
139
156
  extractor.collect_subfields(field, spec) unless (field.tag == "490" && field.indicator1 == "1")
140
- end
157
+ end.compact)
141
158
  end
142
159
  end
143
160
 
@@ -149,8 +166,10 @@ module Traject::Macros
149
166
  def marc_instrumentation_humanized(spec = "048ab", options = {})
150
167
  translation_map = Traject::TranslationMap.new(options[:translation_map] || "marc_instruments")
151
168
 
169
+ extractor = MarcExtractor.new(spec, :seperator => nil)
170
+
152
171
  lambda do |record, accumulator|
153
- values = Traject::MarcExtractor.extract_by_spec(record, spec, :seperator => nil)
172
+ values = extractor.extract(record)
154
173
  human = values.collect do |value|
155
174
  translation_map[ value.slice(0, 2) ]
156
175
  end.uniq
@@ -169,9 +188,12 @@ module Traject::Macros
169
188
  # codes.
170
189
  def marc_instrument_codes_normalized(spec = "048")
171
190
  soloist_suffix = ".s"
191
+
192
+ extractor = MarcExtractor.new("048", :seperator => nil)
193
+
172
194
  return lambda do |record, accumulator|
173
195
  accumulator.concat(
174
- MarcExtractor.new(record, "048", :seperator => nil).collect_matching_lines do |field, spec, extractor|
196
+ extractor.collect_matching_lines(record) do |field, spec, extractor|
175
197
  values = []
176
198
 
177
199
  field.subfields.each do |sf|
@@ -219,7 +241,7 @@ module Traject::Macros
219
241
  # See #marc_publication_date. Yeah, this is a holy mess.
220
242
  # Maybe it should actually be extracted to its own class!
221
243
  def self.publication_date(record, estimate_tolerance = 15, min_year = 500, max_year = (Time.new.year + 6))
222
- field008 = MarcExtractor.extract_by_spec(record, "008").first
244
+ field008 = MarcExtractor.cached("008").extract(record).first
223
245
  found_date = nil
224
246
 
225
247
  if field008 && field008.length >= 11
@@ -264,7 +286,7 @@ module Traject::Macros
264
286
  end
265
287
  # Okay, nothing from 008, try 260
266
288
  if found_date.nil?
267
- v260c = MarcExtractor.extract_by_spec(record, "260c", :seperator => nil).first
289
+ v260c = MarcExtractor.cached("260c", :seperator => nil).extract(record).first
268
290
  # just try to take the first four digits out of there, we're not going to try
269
291
  # anything crazy.
270
292
  if v260c =~ /(\d{4})/
@@ -298,8 +320,10 @@ module Traject::Macros
298
320
  default_value = options.has_key?(:default) ? options[:default] : "Unknown"
299
321
  translation_map = Traject::TranslationMap.new("lcc_top_level")
300
322
 
323
+ extractor = MarcExtractor.new(spec, :seperator => nil)
324
+
301
325
  lambda do |record, accumulator|
302
- candidates = MarcExtractor.extract_by_spec(record, spec, :seperator => nil)
326
+ candidates = extractor.extract(record)
303
327
 
304
328
  candidates.reject! do |candidate|
305
329
  !(candidate =~ lcc_regex)
@@ -328,10 +352,14 @@ module Traject::Macros
328
352
  a_fields_spec = options[:geo_a_fields] || "651a:691a"
329
353
  z_fields_spec = options[:geo_z_fields] || "600:610:611:630:648:650:654:655:656:690:651:691"
330
354
 
355
+ extractor_043a = MarcExtractor.new("043a", :seperator => nil)
356
+ extractor_a_fields = MarcExtractor.new(a_fields_spec, :seperator => nil)
357
+ extractor_z_fields = MarcExtractor.new(z_fields_spec)
358
+
331
359
  lambda do |record, accumulator|
332
360
 
333
361
  accumulator.concat(
334
- MarcExtractor.extract_by_spec(record, "043a", :seperator => nil).collect do |code|
362
+ extractor_043a.extract(record).collect do |code|
335
363
  # remove any trailing hyphens, then map
336
364
  marc_geo_map[code.gsub(/\-+\Z/, '')]
337
365
  end.compact
@@ -339,15 +367,15 @@ module Traject::Macros
339
367
 
340
368
  #LCSH 651a and 691a go in more or less normally.
341
369
  accumulator.concat(
342
- MarcExtractor.extract_by_spec(record, a_fields_spec, :seperator => nil).collect do |s|
370
+ extractor_a_fields.extract(record).collect do |s|
343
371
  # remove trailing periods, which they sometimes have if they were
344
372
  # at end of LCSH.
345
373
  s.sub(/\. */, '')
346
374
  end
347
375
  )
348
376
 
349
- # fields we take z's from have a bit more normalization
350
- MarcExtractor.new(record, z_fields_spec).each_matching_line do |field, spec, extractor|
377
+ # fields we take z's from have a bit more normalization
378
+ extractor_z_fields.each_matching_line(record) do |field, spec, extractor|
351
379
  z_fields = field.subfields.find_all {|sf| sf.code == "z"}.collect {|sf| sf.value }
352
380
  # depending on position in total field, may be a period on the end
353
381
  # we want to remove.
@@ -376,17 +404,21 @@ module Traject::Macros
376
404
  ordinary_fields_spec = "600y:610y:611y:630y:648ay:650y:654y:656y:690y"
377
405
  special_fields_spec = "651:691"
378
406
  seperator = ": "
407
+
408
+ extractor_ordinary_fields = MarcExtractor.new(ordinary_fields_spec)
409
+ extractor_special_fields = MarcExtractor.new(special_fields_spec)
410
+
379
411
  lambda do |record, accumulator|
380
412
  # straightforward ones
381
413
 
382
414
 
383
- accumulator.concat( MarcExtractor.extract_by_spec(record, ordinary_fields_spec).collect do |v|
415
+ accumulator.concat( extractor_ordinary_fields.extract(record).collect do |v|
384
416
  # May have a period we have to remove, if it was at end of tag
385
417
  v.sub(/\. *\Z/, '')
386
418
  end)
387
419
 
388
- # weird ones
389
- MarcExtractor.new(record, special_fields_spec).each_matching_line do |field, spec, extractor|
420
+ # weird ones
421
+ extractor_special_fields.each_matching_line(record) do |field, spec, extractor|
390
422
  field.subfields.each do |sf|
391
423
  next unless sf.code == 'y'
392
424
  if sf.value =~ /\A\s*.+,\s+(ca.\s+)?\d\d\d\d?(-\d\d\d\d?)?( B\.C\.)?[.,; ]*\Z/
@@ -396,7 +428,7 @@ module Traject::Macros
396
428
  accumulator << sf.value.sub(/\. *\Z/, '')
397
429
  end
398
430
  end
399
- end
431
+ end
400
432
  end
401
433
  end
402
434