traject 0.9.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +7 -0
- data/Gemfile +5 -1
- data/README.md +65 -17
- data/bench/bench.rb +30 -0
- data/bin/traject +4 -169
- data/doc/batch_execution.md +177 -0
- data/doc/extending.md +182 -0
- data/doc/other_commands.md +49 -0
- data/doc/settings.md +6 -2
- data/lib/traject.rb +1 -0
- data/lib/traject/command_line.rb +296 -0
- data/lib/traject/debug_writer.rb +28 -0
- data/lib/traject/indexer.rb +84 -20
- data/lib/traject/indexer/settings.rb +9 -1
- data/lib/traject/json_writer.rb +15 -38
- data/lib/traject/line_writer.rb +59 -0
- data/lib/traject/macros/marc21.rb +10 -5
- data/lib/traject/macros/marc21_semantics.rb +57 -25
- data/lib/traject/marc4j_reader.rb +9 -26
- data/lib/traject/marc_extractor.rb +121 -48
- data/lib/traject/mock_reader.rb +87 -0
- data/lib/traject/mock_writer.rb +34 -0
- data/lib/traject/solrj_writer.rb +1 -22
- data/lib/traject/util.rb +107 -1
- data/lib/traject/version.rb +1 -1
- data/lib/traject/yaml_writer.rb +9 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/indexer/each_record_test.rb +27 -2
- data/test/indexer/macros_marc21_semantics_test.rb +12 -1
- data/test/indexer/settings_test.rb +9 -2
- data/test/indexer/to_field_test.rb +35 -5
- data/test/marc4j_reader_test.rb +3 -0
- data/test/marc_extractor_test.rb +94 -20
- data/test/test_support/demo_config.rb +6 -3
- data/traject.gemspec +1 -2
- metadata +17 -20
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'traject/line_writer'
|
2
|
+
|
3
|
+
# A writer for Traject::Indexer that outputs each record as a series of
|
4
|
+
# lines, prefixed by the id, one for each field and its values.
|
5
|
+
# Multiple values are separated by pipes
|
6
|
+
#
|
7
|
+
# Applicable settings:
|
8
|
+
#
|
9
|
+
# - 'output_file' -- the name of the file to output to
|
10
|
+
# - 'output_stream' -- alternately, the IO stream
|
11
|
+
# - 'debug_writer.idfield' -- the solr field from which to pull the record ID (default: 'id')
|
12
|
+
# - 'debug_writer.format' -- How to format the id/solr field/values (default: '%-12s %-25s %s')
|
13
|
+
|
14
|
+
|
15
|
+
class Traject::DebugWriter < Traject::LineWriter
  # Default sprintf-style template; it is fed the record id, the field name
  # and the joined field values, in that order.
  DEFAULT_FORMAT = '%-12s %-25s %s'
  # Output-hash key the record id is read from when no setting overrides it.
  DEFAULT_IDFIELD = 'id'

  # Render one record as text: one line per output field, sorted by field
  # name, each built from the record id, the field name and that field's
  # values joined with ' | '. A final "\n" element is pushed before joining,
  # so the returned string ends with a blank line.
  #
  # context - object whose #output_hash returns the field => values hash
  #           (values are expected to respond to #join).
  #
  # Returns the formatted String.
  def serialize(context)
    idfield = settings["debug_writer.idfield"] || DEFAULT_IDFIELD
    format = settings['debug_writer.format'] || DEFAULT_FORMAT
    h = context.output_hash

    # A record may lack the id field entirely. Array() turns a missing value
    # into [], so a blank id is printed instead of raising on nil.first.
    # Looked up once, since it is the same for every line of the record.
    record_id = Array(h[idfield]).first

    lines = h.keys.sort.map { |k| format % [record_id, k, h[k].join(' | ')] }
    lines.push "\n"
    lines.join("\n")
  end
end
|
data/lib/traject/indexer.rb
CHANGED
@@ -50,6 +50,13 @@ require 'traject/macros/basic'
|
|
50
50
|
# with a String name of class meeting the Writer contract.
|
51
51
|
#
|
52
52
|
class Traject::Indexer
|
53
|
+
|
54
|
+
# Arity error on a passed block
|
55
|
+
class ArityError < ArgumentError; end
|
56
|
+
class NamingError < ArgumentError; end
|
57
|
+
|
58
|
+
|
59
|
+
|
53
60
|
include Traject::QualifiedConstGet
|
54
61
|
|
55
62
|
attr_writer :reader_class, :writer_class
|
@@ -143,20 +150,13 @@ class Traject::Indexer
|
|
143
150
|
end
|
144
151
|
|
145
152
|
|
153
|
+
|
154
|
+
|
155
|
+
|
146
156
|
# Used to define an indexing mapping.
|
147
157
|
def to_field(field_name, aLambda = nil, &block)
|
148
158
|
|
149
|
-
|
150
|
-
raise ArgumentError.new("to_field requires a non-blank first argument, field name")
|
151
|
-
end
|
152
|
-
[aLambda, block].each do |proc|
|
153
|
-
# allow negative arity, meaning variable/optional, trust em on that.
|
154
|
-
# but for positive arrity, we need 2 or 3 args
|
155
|
-
if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
|
156
|
-
raise ArgumentError.new("block/proc given to to_field needs 2 or 3 arguments: #{proc}")
|
157
|
-
end
|
158
|
-
end
|
159
|
-
|
159
|
+
verify_to_field_arguments(field_name, aLambda, block)
|
160
160
|
|
161
161
|
@index_steps << {
|
162
162
|
:field_name => field_name.to_s,
|
@@ -168,15 +168,7 @@ class Traject::Indexer
|
|
168
168
|
end
|
169
169
|
|
170
170
|
def each_record(aLambda = nil, &block)
|
171
|
-
|
172
|
-
[aLambda, block].each do |proc|
|
173
|
-
# allow negative arity, meaning variable/optional, trust em on that.
|
174
|
-
# but for positive arrity, we need 1 or 2 args
|
175
|
-
if proc && (proc.arity == 0 || proc.arity > 2)
|
176
|
-
raise ArgumentError.new("block/proc given to to_field needs 1 or 2 arguments: #{proc}")
|
177
|
-
end
|
178
|
-
end
|
179
|
-
|
171
|
+
verify_each_record_arguments(aLambda, block)
|
180
172
|
@index_steps << {
|
181
173
|
:lambda => aLambda,
|
182
174
|
:block => block,
|
@@ -394,6 +386,78 @@ class Traject::Indexer
|
|
394
386
|
end
|
395
387
|
|
396
388
|
|
389
|
+
|
390
|
+
|
391
|
+
# Verify that the field name is good, and throw a useful error if not
|
392
|
+
# Guard for the first argument of to_field: only a non-empty String passes.
#
# field_name - the value given as the field name.
#
# Returns nil when the name is acceptable.
# Raises NamingError otherwise; the message includes the hint produced by
# last_named_step.
def verify_field_name(field_name)
  return if field_name.is_a?(String) && !field_name.empty?

  raise NamingError.new("to_field requires the field name (String) as the first argument (#{last_named_step.message})")
end
|
397
|
+
|
398
|
+
|
399
|
+
# Verify the various, increasingly-complex things that can be sent to to_field
|
400
|
+
# to make sure it's all kosher.
|
401
|
+
#
|
402
|
+
# "Modification" takes place for zero-argument blocks that return a lambda
|
403
|
+
|
404
|
+
# Validate everything handed to to_field: the field name first (delegated to
# verify_field_name), then the arity of the lambda and of the block.
#
# A callable passes when its arity is 2 or 3, or negative (optional or
# variable arguments, which are taken on trust). Arity 0, 1 or more than 3
# is rejected.
#
# field_name - the field name given to to_field.
# aLambda    - the lambda argument, or nil.
# block      - the block argument, or nil.
#
# Raises ArityError for a callable with an unacceptable arity.
def verify_to_field_arguments(field_name, aLambda, block)
  verify_field_name(field_name)

  [aLambda, block].each do |callable|
    next unless callable

    arg_count = callable.arity
    next if arg_count < 0 || arg_count == 2 || arg_count == 3

    raise ArityError.new("error parsing field '#{field_name}': block/proc given to to_field needs 2 or 3 (or variable) arguments: #{callable} (#{last_named_step.message})")
  end
end
|
417
|
+
|
418
|
+
# Verify the procs sent to each_record to make sure it's all kosher.
|
419
|
+
|
420
|
+
# Validate what was handed to each_record.
#
# At least one of the lambda and the block must be given. Each one that is
# given must be a Proc whose arity is 1 or 2, or negative (optional or
# variable arguments, which are taken on trust).
#
# aLambda - the lambda argument, or nil.
# block   - the block argument, or nil.
#
# Raises ArgumentError when neither is given, NamingError when a given
# argument is not a Proc, and ArityError when its arity is 0 or above 2.
def verify_each_record_arguments(aLambda, block)
  if !aLambda && !block
    raise ArgumentError.new("Missing Argument: each_record must take a block/lambda as an argument (#{last_named_step.message})")
  end

  [aLambda, block].each do |candidate|
    next unless candidate

    unless candidate.is_a?(Proc)
      raise NamingError.new("argument to each_record must be a block/lambda, not a #{candidate.class} (#{last_named_step.message})")
    end

    arg_count = candidate.arity
    next unless arg_count == 0 || arg_count > 2

    raise ArityError.new("block/proc given to each_record needs 1 or 2 arguments: #{candidate} (#{last_named_step.message})")
  end
end
|
438
|
+
|
439
|
+
# Wrap the steps recorded so far (@index_steps) in a fresh LastNamedStep;
# the verify_* checks use its #message in their error text.
def last_named_step
  LastNamedStep.new(@index_steps)
end
|
442
|
+
|
443
|
+
|
444
|
+
# A convenient way to find, and generate error messages for, the last named step (for helping locate parse errors)
|
445
|
+
# Finds the most recent index step that carries a :field_name and prepares a
# human-readable hint about it, for use in parse error messages.
class LastNamedStep
  attr_accessor :step, :message

  # index_steps - list of step hashes. It is searched from the end for the
  #               first entry with a truthy :field_name, so entries without
  #               one are skipped.
  def initialize(index_steps)
    @step = index_steps.reverse_each.detect { |candidate| candidate[:field_name] }
    @message =
      if @step
        "last successfully parsed field was '#{@step[:field_name]}'"
      else
        "there were no previous named fields successfully parsed"
      end
  end
end
|
458
|
+
|
459
|
+
|
460
|
+
|
397
461
|
# Represents the context of a specific record being indexed, passed
|
398
462
|
# to indexing logic blocks
|
399
463
|
#
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'hashie'
|
2
2
|
|
3
3
|
# A Hash of settings for a Traject::Indexer, which also ends up passed along
|
4
|
-
# to other objects Traject::Indexer interacts with.
|
4
|
+
# to other objects Traject::Indexer interacts with.
|
5
5
|
#
|
6
6
|
# Enhanced with a few features from Hashie, to make it for
|
7
7
|
# instance string/symbol indifferent
|
@@ -71,5 +71,13 @@ class Traject::Indexer
|
|
71
71
|
"processing_thread_pool" => 3
|
72
72
|
}
|
73
73
|
end
|
74
|
+
|
75
|
+
# Inspect string for the settings, with the value of every key ending in
# "password" replaced by "[hidden]" so such values stay out of the output.
#
# Returns the #inspect String of the redacted copy.
def inspect
  redacted = each_with_object({}) do |(key, value), safe|
    safe[key] = (key =~ /password\Z/) ? "[hidden]" : value
  end
  redacted.inspect
end
|
74
82
|
end
|
75
83
|
end
|
data/lib/traject/json_writer.rb
CHANGED
@@ -1,53 +1,30 @@
|
|
1
1
|
require 'json'
|
2
|
+
require 'traject/line_writer'
|
2
3
|
|
3
4
|
# A writer for Traject::Indexer, that just writes out
|
4
5
|
# all the output as Json. It's newline delimited json, but
|
5
6
|
# right now no checks to make sure there is no internal newlines
|
6
|
-
# as whitespace in the json. TODO, add that.
|
7
|
+
# as whitespace in the json. TODO, add that.
|
7
8
|
#
|
8
|
-
#
|
9
|
-
#
|
9
|
+
# Should be thread-safe (ie, multiple worker threads can be calling #put
|
10
|
+
# concurrently), by wrapping write to actual output file in a mutex synchronize.
|
11
|
+
# This does not seem to affect performance much, as far as I could tell
|
12
|
+
# benchmarking.
|
10
13
|
#
|
11
14
|
# You can force pretty-printing with setting 'json_writer.pretty_print' of boolean
|
12
|
-
# true or string 'true'. Useful mostly for human checking of output.
|
15
|
+
# true or string 'true'. Useful mostly for human checking of output.
|
13
16
|
#
|
14
17
|
# Output will be sent to settings["output_file"] string path, or else
|
15
|
-
# settings["output_stream"] (ruby IO object), or else stdout.
|
16
|
-
class Traject::JsonWriter
|
17
|
-
attr_reader :settings
|
18
|
+
# settings["output_stream"] (ruby IO object), or else stdout.
|
19
|
+
class Traject::JsonWriter < Traject::LineWriter
|
18
20
|
|
19
|
-
def
|
20
|
-
@settings = argSettings
|
21
|
-
end
|
22
|
-
|
23
|
-
def put(context)
|
21
|
+
def serialize(context)
|
24
22
|
hash = context.output_hash
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
else
|
30
|
-
JSON.generate(hash)
|
31
|
-
end
|
32
|
-
output_file.puts(serialized)
|
33
|
-
end
|
34
|
-
|
35
|
-
def output_file
|
36
|
-
unless defined? @output_file
|
37
|
-
@output_file =
|
38
|
-
if settings["output_file"]
|
39
|
-
File.open(settings["output_file"], 'w:UTF-8')
|
40
|
-
elsif settings["output_stream"]
|
41
|
-
settings["output_stream"]
|
42
|
-
else
|
43
|
-
$stdout
|
44
|
-
end
|
23
|
+
if settings["json_writer.pretty_print"]
|
24
|
+
JSON.pretty_generate(hash)
|
25
|
+
else
|
26
|
+
JSON.generate(hash)
|
45
27
|
end
|
46
|
-
|
47
|
-
end
|
48
|
-
|
49
|
-
def close
|
50
|
-
@output_file.close unless (@output_file.nil? || @output_file.tty?)
|
51
|
-
end
|
28
|
+
end
|
52
29
|
|
53
30
|
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'thread'
|
2
|
+
|
3
|
+
# A writer for Traject::Indexer, that just writes out
|
4
|
+
# all the output as serialized text with #puts.
|
5
|
+
#
|
6
|
+
# Should be thread-safe (ie, multiple worker threads can be calling #put
|
7
|
+
# concurrently), by wrapping write to actual output file in a mutex synchronize.
|
8
|
+
# This does not seem to affect performance much, as far as I could tell
|
9
|
+
# benchmarking.
|
10
|
+
#
|
11
|
+
# Output will be sent to settings["output_file"] string path, or else
|
12
|
+
# settings["output_stream"] (ruby IO object), or else stdout.
|
13
|
+
#
|
14
|
+
# This class can be sub-classed to write out different serialized
|
15
|
+
# representations -- subclasses will just override the #serialize
|
16
|
+
# method. For instance, see JsonWriter.
|
17
|
+
class Traject::LineWriter
  attr_reader :settings, :write_mutex

  # argSettings - settings hash; the "output_file" and "output_stream" keys
  #               are read to choose the destination.
  def initialize(argSettings)
    @settings    = argSettings
    @write_mutex = Mutex.new

    # Resolve the destination now rather than on first #put, so the lazy
    # initialization does not happen while several threads are writing.
    output_file
  end

  # Turn a context into what gets written. This base version hands back the
  # output hash as-is; subclasses override it to produce their own format.
  def serialize(context)
    context.output_hash
  end

  # Serialize the context, then write it with #puts while holding the write
  # mutex. Serialization itself happens outside the lock.
  def put(context)
    line = serialize(context)
    write_mutex.synchronize { output_file.puts(line) }
  end

  # The destination, resolved once and memoized: a file opened for writing
  # as UTF-8 at settings["output_file"], else settings["output_stream"],
  # else $stdout.
  def output_file
    return @output_file if defined? @output_file

    @output_file =
      if settings["output_file"]
        File.open(settings["output_file"], 'w:UTF-8')
      elsif settings["output_stream"]
        settings["output_stream"]
      else
        $stdout
      end
  end

  # Close the destination, unless there is none or it is a terminal.
  def close
    return if @output_file.nil? || @output_file.tty?

    @output_file.close
  end
end
|
@@ -35,16 +35,21 @@ module Traject::Macros
|
|
35
35
|
trim_punctuation = options.delete(:trim_punctuation)
|
36
36
|
default_value = options.delete(:default)
|
37
37
|
|
38
|
-
# We create the TranslationMap
|
39
|
-
#
|
40
|
-
#
|
41
|
-
#
|
38
|
+
# We create the TranslationMap and the MarcExtractor here
|
39
|
+
# on load, so the lambda can just refer to already created
|
40
|
+
# ones, and not have to create a new one per-execution.
|
41
|
+
#
|
42
|
+
# Benchmarking shows for MarcExtractor at least, there is
|
43
|
+
# significant performance advantage.
|
44
|
+
|
42
45
|
if translation_map_arg = options.delete(:translation_map)
|
43
46
|
translation_map = Traject::TranslationMap.new(translation_map_arg)
|
44
47
|
end
|
45
48
|
|
49
|
+
extractor = Traject::MarcExtractor.new(spec, options)
|
50
|
+
|
46
51
|
lambda do |record, accumulator, context|
|
47
|
-
accumulator.concat
|
52
|
+
accumulator.concat extractor.extract(record)
|
48
53
|
|
49
54
|
if only_first
|
50
55
|
Marc21.first! accumulator
|
@@ -11,19 +11,30 @@ module Traject::Macros
|
|
11
11
|
# shortcut
|
12
12
|
MarcExtractor = Traject::MarcExtractor
|
13
13
|
|
14
|
-
# Extract OCLC numbers from, by default 035a's
|
14
|
+
# Extract OCLC numbers from, by default 035a's by known prefixes, then stripped
|
15
15
|
# just the num, and de-dup.
|
16
16
|
def oclcnum(extract_fields = "035a")
|
17
|
+
extractor = MarcExtractor.new(extract_fields, :seperator => nil)
|
18
|
+
|
17
19
|
lambda do |record, accumulator|
|
18
|
-
list =
|
19
|
-
Marc21Semantics.
|
20
|
-
end
|
20
|
+
list = extractor.extract(record).collect! do |o|
|
21
|
+
Marc21Semantics.oclcnum_extract(o)
|
22
|
+
end.compact
|
21
23
|
|
22
24
|
accumulator.concat list.uniq if list
|
23
25
|
end
|
24
26
|
end
|
25
|
-
|
26
|
-
|
27
|
+
# If a num begins with a known OCLC prefix, return it without the prefix.
|
28
|
+
# otherwise nil.
|
29
|
+
def self.oclcnum_extract(num)
  # Strip a single known OCLC prefix from the START of the value only.
  #
  # The alternation is wrapped in a group so that \A applies to every
  # prefix. Written as /\A(ocm)|(ocn)|(on)|(\(OCoLC\))/ the anchor binds to
  # "ocm" alone, and "ocn", "on" or "(OCoLC)" appearing anywhere in the
  # string would be deleted, making a non-OCLC identifier such as
  # "ssj0001on23" look like an OCLC number.
  #
  # num - String candidate value.
  #
  # Returns the value without its prefix, or nil when it had no known prefix.
  stripped = num.sub(/\A(?:ocm|ocn|on|\(OCoLC\))/, '')
  stripped == num ? nil : stripped
end
|
28
39
|
|
29
40
|
|
@@ -47,12 +58,13 @@ module Traject::Macros
|
|
47
58
|
accumulator << Marc21Semantics.get_sortable_author(record)
|
48
59
|
end
|
49
60
|
end
|
61
|
+
|
50
62
|
def self.get_sortable_author(record)
|
51
|
-
onexx = MarcExtractor.
|
63
|
+
onexx = MarcExtractor.cached("100:110:111", :first => true).extract(record).first
|
52
64
|
onexx = onexx.strip if onexx
|
53
65
|
|
54
66
|
titles = []
|
55
|
-
MarcExtractor.
|
67
|
+
MarcExtractor.cached("240:245", :first => true).each_matching_line(record) do |field, spec|
|
56
68
|
non_filing = field.indicator2.to_i
|
57
69
|
|
58
70
|
str = field.subfields.collect {|sf| sf.value}.join(" ")
|
@@ -72,8 +84,9 @@ module Traject::Macros
|
|
72
84
|
accumulator << Marc21Semantics.get_sortable_title(record)
|
73
85
|
end
|
74
86
|
end
|
87
|
+
|
75
88
|
def self.get_sortable_title(record)
|
76
|
-
MarcExtractor.
|
89
|
+
MarcExtractor.cached("245ab").collect_matching_lines(record) do |field, spec, extractor|
|
77
90
|
str = extractor.collect_subfields(field, spec).first
|
78
91
|
|
79
92
|
if str.nil?
|
@@ -105,8 +118,10 @@ module Traject::Macros
|
|
105
118
|
def marc_languages(spec = "008[35-37]:041a:041d")
|
106
119
|
translation_map = Traject::TranslationMap.new("marc_languages")
|
107
120
|
|
121
|
+
extractor = MarcExtractor.new(spec, :seperator => nil)
|
122
|
+
|
108
123
|
lambda do |record, accumulator|
|
109
|
-
codes =
|
124
|
+
codes = extractor.collect_matching_lines(record) do |field, spec, extractor|
|
110
125
|
if extractor.control_field?(field)
|
111
126
|
(spec[:bytes] ? field.value.byteslice(spec[:bytes]) : field.value)
|
112
127
|
else
|
@@ -134,10 +149,12 @@ module Traject::Macros
|
|
134
149
|
# already covered by another field we're including, so we don't want to double count it, possibly
|
135
150
|
# with slight variation.
|
136
151
|
def marc_series_facet(spec = "440a:490a:800abcdt:810abcdt:811acdeft:830adfgklmnoprst")
|
152
|
+
extractor = MarcExtractor.new(spec)
|
153
|
+
|
137
154
|
lambda do |record, accumulator|
|
138
|
-
|
155
|
+
accumulator.concat( extractor.collect_matching_lines(record) do |field, spec, extractor|
|
139
156
|
extractor.collect_subfields(field, spec) unless (field.tag == "490" && field.indicator1 == "1")
|
140
|
-
end
|
157
|
+
end.compact)
|
141
158
|
end
|
142
159
|
end
|
143
160
|
|
@@ -149,8 +166,10 @@ module Traject::Macros
|
|
149
166
|
def marc_instrumentation_humanized(spec = "048ab", options = {})
|
150
167
|
translation_map = Traject::TranslationMap.new(options[:translation_map] || "marc_instruments")
|
151
168
|
|
169
|
+
extractor = MarcExtractor.new(spec, :seperator => nil)
|
170
|
+
|
152
171
|
lambda do |record, accumulator|
|
153
|
-
values =
|
172
|
+
values = extractor.extract(record)
|
154
173
|
human = values.collect do |value|
|
155
174
|
translation_map[ value.slice(0, 2) ]
|
156
175
|
end.uniq
|
@@ -169,9 +188,12 @@ module Traject::Macros
|
|
169
188
|
# codes.
|
170
189
|
def marc_instrument_codes_normalized(spec = "048")
|
171
190
|
soloist_suffix = ".s"
|
191
|
+
|
192
|
+
extractor = MarcExtractor.new("048", :seperator => nil)
|
193
|
+
|
172
194
|
return lambda do |record, accumulator|
|
173
195
|
accumulator.concat(
|
174
|
-
|
196
|
+
extractor.collect_matching_lines(record) do |field, spec, extractor|
|
175
197
|
values = []
|
176
198
|
|
177
199
|
field.subfields.each do |sf|
|
@@ -219,7 +241,7 @@ module Traject::Macros
|
|
219
241
|
# See #marc_publication_date. Yeah, this is a holy mess.
|
220
242
|
# Maybe it should actually be extracted to its own class!
|
221
243
|
def self.publication_date(record, estimate_tolerance = 15, min_year = 500, max_year = (Time.new.year + 6))
|
222
|
-
field008 = MarcExtractor.
|
244
|
+
field008 = MarcExtractor.cached("008").extract(record).first
|
223
245
|
found_date = nil
|
224
246
|
|
225
247
|
if field008 && field008.length >= 11
|
@@ -264,7 +286,7 @@ module Traject::Macros
|
|
264
286
|
end
|
265
287
|
# Okay, nothing from 008, try 260
|
266
288
|
if found_date.nil?
|
267
|
-
v260c = MarcExtractor.
|
289
|
+
v260c = MarcExtractor.cached("260c", :seperator => nil).extract(record).first
|
268
290
|
# just try to take the first four digits out of there, we're not going to try
|
269
291
|
# anything crazy.
|
270
292
|
if v260c =~ /(\d{4})/
|
@@ -298,8 +320,10 @@ module Traject::Macros
|
|
298
320
|
default_value = options.has_key?(:default) ? options[:default] : "Unknown"
|
299
321
|
translation_map = Traject::TranslationMap.new("lcc_top_level")
|
300
322
|
|
323
|
+
extractor = MarcExtractor.new(spec, :seperator => nil)
|
324
|
+
|
301
325
|
lambda do |record, accumulator|
|
302
|
-
candidates =
|
326
|
+
candidates = extractor.extract(record)
|
303
327
|
|
304
328
|
candidates.reject! do |candidate|
|
305
329
|
!(candidate =~ lcc_regex)
|
@@ -328,10 +352,14 @@ module Traject::Macros
|
|
328
352
|
a_fields_spec = options[:geo_a_fields] || "651a:691a"
|
329
353
|
z_fields_spec = options[:geo_z_fields] || "600:610:611:630:648:650:654:655:656:690:651:691"
|
330
354
|
|
355
|
+
extractor_043a = MarcExtractor.new("043a", :seperator => nil)
|
356
|
+
extractor_a_fields = MarcExtractor.new(a_fields_spec, :seperator => nil)
|
357
|
+
extractor_z_fields = MarcExtractor.new(z_fields_spec)
|
358
|
+
|
331
359
|
lambda do |record, accumulator|
|
332
360
|
|
333
361
|
accumulator.concat(
|
334
|
-
|
362
|
+
extractor_043a.extract(record).collect do |code|
|
335
363
|
# remove any trailing hyphens, then map
|
336
364
|
marc_geo_map[code.gsub(/\-+\Z/, '')]
|
337
365
|
end.compact
|
@@ -339,15 +367,15 @@ module Traject::Macros
|
|
339
367
|
|
340
368
|
#LCSH 651a and 691a go in more or less normally.
|
341
369
|
accumulator.concat(
|
342
|
-
|
370
|
+
extractor_a_fields.extract(record).collect do |s|
|
343
371
|
# remove trailing periods, which they sometimes have if they were
|
344
372
|
# at end of LCSH.
|
345
373
|
s.sub(/\. */, '')
|
346
374
|
end
|
347
375
|
)
|
348
376
|
|
349
|
-
# fields we take z's from have a bit more normalization
|
350
|
-
|
377
|
+
# fields we take z's from have a bit more normalization
|
378
|
+
extractor_z_fields.each_matching_line(record) do |field, spec, extractor|
|
351
379
|
z_fields = field.subfields.find_all {|sf| sf.code == "z"}.collect {|sf| sf.value }
|
352
380
|
# depending on position in total field, may be a period on the end
|
353
381
|
# we want to remove.
|
@@ -376,17 +404,21 @@ module Traject::Macros
|
|
376
404
|
ordinary_fields_spec = "600y:610y:611y:630y:648ay:650y:654y:656y:690y"
|
377
405
|
special_fields_spec = "651:691"
|
378
406
|
seperator = ": "
|
407
|
+
|
408
|
+
extractor_ordinary_fields = MarcExtractor.new(ordinary_fields_spec)
|
409
|
+
extractor_special_fields = MarcExtractor.new(special_fields_spec)
|
410
|
+
|
379
411
|
lambda do |record, accumulator|
|
380
412
|
# straightforward ones
|
381
413
|
|
382
414
|
|
383
|
-
accumulator.concat(
|
415
|
+
accumulator.concat( extractor_ordinary_fields.extract(record).collect do |v|
|
384
416
|
# May have a period we have to remove, if it was at end of tag
|
385
417
|
v.sub(/\. *\Z/, '')
|
386
418
|
end)
|
387
419
|
|
388
|
-
# weird ones
|
389
|
-
|
420
|
+
# weird ones
|
421
|
+
extractor_special_fields.each_matching_line(record) do |field, spec, extractor|
|
390
422
|
field.subfields.each do |sf|
|
391
423
|
next unless sf.code == 'y'
|
392
424
|
if sf.value =~ /\A\s*.+,\s+(ca.\s+)?\d\d\d\d?(-\d\d\d\d?)?( B\.C\.)?[.,; ]*\Z/
|
@@ -396,7 +428,7 @@ module Traject::Macros
|
|
396
428
|
accumulator << sf.value.sub(/\. *\Z/, '')
|
397
429
|
end
|
398
430
|
end
|
399
|
-
end
|
431
|
+
end
|
400
432
|
end
|
401
433
|
end
|
402
434
|
|