traject 0.16.0 → 0.17.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +1 -0
- data/README.md +183 -191
- data/bench/bench.rb +1 -1
- data/doc/batch_execution.md +14 -0
- data/doc/extending.md +14 -12
- data/doc/indexing_rules.md +265 -0
- data/lib/traject/command_line.rb +12 -41
- data/lib/traject/debug_writer.rb +32 -13
- data/lib/traject/indexer.rb +101 -24
- data/lib/traject/indexer/settings.rb +18 -17
- data/lib/traject/json_writer.rb +32 -11
- data/lib/traject/line_writer.rb +6 -6
- data/lib/traject/macros/basic.rb +1 -1
- data/lib/traject/macros/marc21.rb +17 -13
- data/lib/traject/macros/marc21_semantics.rb +27 -25
- data/lib/traject/macros/marc_format_classifier.rb +39 -25
- data/lib/traject/marc4j_reader.rb +36 -22
- data/lib/traject/marc_extractor.rb +79 -75
- data/lib/traject/marc_reader.rb +33 -25
- data/lib/traject/mock_reader.rb +9 -10
- data/lib/traject/ndj_reader.rb +7 -7
- data/lib/traject/null_writer.rb +1 -1
- data/lib/traject/qualified_const_get.rb +12 -2
- data/lib/traject/solrj_writer.rb +61 -52
- data/lib/traject/thread_pool.rb +45 -45
- data/lib/traject/translation_map.rb +59 -27
- data/lib/traject/util.rb +3 -3
- data/lib/traject/version.rb +1 -1
- data/lib/traject/yaml_writer.rb +1 -1
- data/test/debug_writer_test.rb +7 -7
- data/test/indexer/each_record_test.rb +4 -4
- data/test/indexer/macros_marc21_semantics_test.rb +12 -12
- data/test/indexer/macros_marc21_test.rb +10 -10
- data/test/indexer/macros_test.rb +1 -1
- data/test/indexer/map_record_test.rb +6 -6
- data/test/indexer/read_write_test.rb +43 -4
- data/test/indexer/settings_test.rb +2 -2
- data/test/indexer/to_field_test.rb +8 -8
- data/test/marc4j_reader_test.rb +4 -4
- data/test/marc_extractor_test.rb +33 -25
- data/test/marc_format_classifier_test.rb +3 -3
- data/test/marc_reader_test.rb +2 -2
- data/test/test_helper.rb +3 -3
- data/test/test_support/demo_config.rb +52 -48
- data/test/translation_map_test.rb +22 -4
- data/test/translation_maps/bad_ruby.rb +2 -2
- data/test/translation_maps/both_map.rb +1 -1
- data/test/translation_maps/default_literal.rb +1 -1
- data/test/translation_maps/default_passthrough.rb +1 -1
- data/test/translation_maps/ruby_map.rb +1 -1
- metadata +7 -31
- data/doc/macros.md +0 -103
data/lib/traject/debug_writer.rb
CHANGED
@@ -1,21 +1,40 @@
|
|
1
1
|
require 'traject/line_writer'
|
2
2
|
|
3
|
-
#
|
4
|
-
#
|
5
|
-
#
|
3
|
+
# The Traject::DebugWriter produces a simple, human-readable output format that's
|
4
|
+
# also amenable to simple computer processing (e.g., with a simple grep).
|
5
|
+
# It's the output format used when you pass the --debug-mode switch to traject on the command line.
|
6
6
|
#
|
7
|
-
#
|
7
|
+
# Output format is three columns: id, output field, values (multiple
|
8
|
+
# values separated by '|'), and looks something like:
|
8
9
|
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
|
14
|
-
|
10
|
+
# 000001580 edition [1st ed.]
|
11
|
+
# 000001580 format Book | Online | Print
|
12
|
+
# 000001580 geo Great Britain
|
13
|
+
# 000001580 id 000001580
|
14
|
+
# 000001580 isbn 0631126902
|
15
|
+
#
|
16
|
+
# ## Settings
|
17
|
+
#
|
18
|
+
# * 'output_file' -- the name of the file to output to (command line -o shortcut).
|
19
|
+
# * 'output_stream' -- alternately, the IO stream
|
20
|
+
# * 'debug_writer.idfield' -- the solr field from which to pull the record ID (default: 'id')
|
21
|
+
# * 'debug_writer.format' -- How to format the id/solr field/values (default: '%-12s %-25s %s')
|
22
|
+
#
|
23
|
+
# By default, with neither output_file nor output_stream provided, writes to stdout, which
|
24
|
+
# can be useful for debugging diagnosis.
|
25
|
+
#
|
26
|
+
# ## Example configuration file
|
27
|
+
#
|
28
|
+
# require 'traject/debug_writer'
|
29
|
+
#
|
30
|
+
# settings do
|
31
|
+
# provide "writer_class_name", "Traject::DebugWriter"
|
32
|
+
# provide "output_file", "out.txt"
|
33
|
+
# end
|
15
34
|
class Traject::DebugWriter < Traject::LineWriter
|
16
35
|
DEFAULT_FORMAT = '%-12s %-25s %s'
|
17
36
|
DEFAULT_IDFIELD = 'id'
|
18
|
-
|
37
|
+
|
19
38
|
def serialize(context)
|
20
39
|
idfield = settings["debug_writer.idfield"] || DEFAULT_IDFIELD
|
21
40
|
format = settings['debug_writer.format'] || DEFAULT_FORMAT
|
@@ -23,6 +42,6 @@ class Traject::DebugWriter < Traject::LineWriter
|
|
23
42
|
lines = h.keys.sort.map {|k| format % [h[idfield].first, k, h[k].join(' | ')] }
|
24
43
|
lines.push "\n"
|
25
44
|
lines.join("\n")
|
26
|
-
end
|
45
|
+
end
|
27
46
|
|
28
|
-
end
|
47
|
+
end
|
data/lib/traject/indexer.rb
CHANGED
@@ -11,8 +11,38 @@ require 'traject/solrj_writer'
|
|
11
11
|
|
12
12
|
require 'traject/macros/marc21'
|
13
13
|
require 'traject/macros/basic'
|
14
|
+
|
15
|
+
# This class does indexing for traject: Getting input records from a Reader
|
16
|
+
# class, mapping the input records to an output hash, and then sending the output
|
17
|
+
# hash off somewhere (usually Solr) with a Writer class.
|
18
|
+
#
|
19
|
+
# Traject config files are `instance_eval`d in an Indexer object, so `self` in
|
20
|
+
# a config file is an Indexer, and any Indexer methods can be called.
|
21
|
+
#
|
22
|
+
# However, certain Indexer methods exist almost entirely for the purpose of
|
23
|
+
# being called in config files; these methods are part of the expected
|
24
|
+
# Domain-Specific Language ("DSL") for config files, and will ordinarily
|
25
|
+
# form the bulk or entirety of config files:
|
26
|
+
#
|
27
|
+
# * #settings
|
28
|
+
# * #to_field
|
29
|
+
# * #each_record
|
30
|
+
# * #after_processing
|
31
|
+
# * #logger (rarely used in config files, but in some cases to set up custom logging config)
|
32
|
+
#
|
33
|
+
# If accessing a Traject::Indexer programmatically (instead of via command line with
|
34
|
+
# config files), additional methods of note include:
|
35
|
+
#
|
36
|
+
# # to process a stream of input records from configured Reader,
|
37
|
+
# # to configured Writer:
|
38
|
+
# indexer.process(io_stream)
|
39
|
+
#
|
40
|
+
# # To map a single input record manually to an output_hash,
|
41
|
+
# # ignoring Readers and Writers
|
42
|
+
# hash = indexer.map_record(record)
|
14
43
|
#
|
15
|
-
#
|
44
|
+
#
|
45
|
+
# ## Readers and Writers
|
16
46
|
#
|
17
47
|
# The Indexer has a modularized architecture for readers and writers, for where
|
18
48
|
# source records come from (reader), and where output is sent to (writer).
|
@@ -73,28 +103,38 @@ class Traject::Indexer
|
|
73
103
|
def initialize(arg_settings = {})
|
74
104
|
@settings = Settings.new(arg_settings)
|
75
105
|
@index_steps = []
|
106
|
+
@after_processing_steps = []
|
76
107
|
end
|
77
108
|
|
78
|
-
#
|
79
|
-
#
|
80
|
-
#
|
109
|
+
# Part of the config file DSL, for writing settings values.
|
110
|
+
#
|
111
|
+
# The Indexer's settings consist of a hash-like Traject::Settings
|
112
|
+
# object. The settings hash is *not* nested hashes, just one level
|
113
|
+
# of configuration settings. Keys are always strings, and by convention
|
114
|
+
# use "." for namespacing, eg `log.file`
|
81
115
|
#
|
82
|
-
# The settings method with no arguments returns that
|
116
|
+
# The settings method with no arguments returns that Settings object.
|
83
117
|
#
|
84
118
|
# With a hash and/or block argument, can be used to set
|
85
119
|
# new key/values. Each call merges onto the existing settings
|
86
|
-
# hash.
|
120
|
+
# hash. The block is `instance_eval`d in the context
|
121
|
+
# of the Traject::Settings object.
|
87
122
|
#
|
88
123
|
# indexer.settings("a" => "a", "b" => "b")
|
89
124
|
#
|
90
125
|
# indexer.settings do
|
91
|
-
#
|
126
|
+
# provide "b", "new b"
|
92
127
|
# end
|
93
128
|
#
|
94
129
|
# indexer.settings #=> {"a" => "a", "b" => "new b"}
|
95
130
|
#
|
96
|
-
#
|
97
|
-
#
|
131
|
+
# Note the #provide method is defined on Traject::Settings to
|
132
|
+
# write to a setting only if previously not set. You can also
|
133
|
+
# use #store to force over-writing even if an existing setting.
|
134
|
+
#
|
135
|
+
# Even with arguments, Indexer#settings returns the Settings object,
|
136
|
+
# hash too, so method calls can be chained.
|
137
|
+
#
|
98
138
|
def settings(new_settings = nil, &block)
|
99
139
|
@settings.merge!(new_settings) if new_settings
|
100
140
|
|
@@ -103,6 +143,24 @@ class Traject::Indexer
|
|
103
143
|
return @settings
|
104
144
|
end
|
105
145
|
|
146
|
+
# Part of DSL, used to define an indexing mapping. Register logic
|
147
|
+
# to be called for each record, and generate values for a particular
|
148
|
+
# output field.
|
149
|
+
def to_field(field_name, aLambda = nil, &block)
|
150
|
+
@index_steps << ToFieldStep.new(field_name, aLambda, block, Traject::Util.extract_caller_location(caller.first) )
|
151
|
+
end
|
152
|
+
|
153
|
+
# Part of DSL, register logic to be called for each record
|
154
|
+
def each_record(aLambda = nil, &block)
|
155
|
+
@index_steps << EachRecordStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first) )
|
156
|
+
end
|
157
|
+
|
158
|
+
# Part of DSL, register logic to be called once at the end
|
159
|
+
# of processing a stream of records.
|
160
|
+
def after_processing(aLambda = nil, &block)
|
161
|
+
@after_processing_steps << AfterProcessingStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first))
|
162
|
+
end
|
163
|
+
|
106
164
|
def logger
|
107
165
|
@logger ||= create_logger
|
108
166
|
end
|
@@ -149,20 +207,6 @@ class Traject::Indexer
|
|
149
207
|
return logger
|
150
208
|
end
|
151
209
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
# Used to define an indexing mapping.
|
157
|
-
def to_field(field_name, aLambda = nil, &block)
|
158
|
-
@index_steps << ToFieldStep.new(field_name, aLambda, block, Traject::Util.extract_caller_location(caller.first) )
|
159
|
-
end
|
160
|
-
|
161
|
-
def each_record(aLambda = nil, &block)
|
162
|
-
@index_steps << EachRecordStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first) )
|
163
|
-
end
|
164
|
-
|
165
|
-
|
166
210
|
# Processes a single record according to indexing rules set up in
|
167
211
|
# this indexer. Returns the output hash (a hash whose keys are
|
168
212
|
# string fields, and values are arrays of one or more values in that field)
|
@@ -293,7 +337,7 @@ class Traject::Indexer
|
|
293
337
|
# of having it be bound to the original variable in a non-threadsafe way.
|
294
338
|
# This is confusing, I might not be understanding things properly, but that's where i am.
|
295
339
|
#thread_pool.maybe_in_thread_pool &make_lambda(count, record, writer)
|
296
|
-
thread_pool.maybe_in_thread_pool do
|
340
|
+
thread_pool.maybe_in_thread_pool(record, settings, position) do |record, settings, position|
|
297
341
|
context = Context.new(:source_record => record, :settings => settings, :position => position)
|
298
342
|
context.logger = logger
|
299
343
|
map_to_context!(context)
|
@@ -317,6 +361,15 @@ class Traject::Indexer
|
|
317
361
|
|
318
362
|
writer.close if writer.respond_to?(:close)
|
319
363
|
|
364
|
+
@after_processing_steps.each do |step|
|
365
|
+
begin
|
366
|
+
step.execute
|
367
|
+
rescue Exception => e
|
368
|
+
logger.fatal("Unexpected exception #{e} when executing #{step}")
|
369
|
+
raise e
|
370
|
+
end
|
371
|
+
end
|
372
|
+
|
320
373
|
elapsed = Time.now - start_time
|
321
374
|
avg_rps = (count / elapsed)
|
322
375
|
logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
|
@@ -513,6 +566,30 @@ class Traject::Indexer
|
|
513
566
|
|
514
567
|
end
|
515
568
|
|
569
|
+
# A class representing a block of logic called after
|
570
|
+
# processing, registered with #after_processing
|
571
|
+
class AfterProcessingStep
|
572
|
+
attr_accessor :lambda, :block, :source_location
|
573
|
+
def initialize(lambda, block, source_location)
|
574
|
+
self.lambda = lambda
|
575
|
+
self.block = block
|
576
|
+
self.source_location = source_location
|
577
|
+
end
|
578
|
+
|
579
|
+
# after_processing steps get no args yielded to
|
580
|
+
# their blocks, they just are what they are.
|
581
|
+
def execute
|
582
|
+
[lambda, block].each do |aProc|
|
583
|
+
next unless aProc
|
584
|
+
aProc.call
|
585
|
+
end
|
586
|
+
end
|
587
|
+
|
588
|
+
def inspect
|
589
|
+
"(after_processing at #{self.source_location}"
|
590
|
+
end
|
591
|
+
end
|
592
|
+
|
516
593
|
|
517
594
|
|
518
595
|
|
@@ -1,22 +1,23 @@
|
|
1
1
|
require 'hashie'
|
2
2
|
|
3
|
-
# A Hash of settings for a Traject::Indexer, which also ends up passed along
|
4
|
-
# to other objects Traject::Indexer interacts with.
|
5
|
-
#
|
6
|
-
# Enhanced with a few features from Hashie, to make it for
|
7
|
-
# instance string/symbol indifferent
|
8
|
-
#
|
9
|
-
# #provide(key, value) is added, to do like settings[key] ||= value,
|
10
|
-
# set only if not already set (but unlike ||=, nil or false can count as already set)
|
11
|
-
#
|
12
|
-
# Also has an interesting 'defaults' system, meant to play along
|
13
|
-
# with configuration file 'provide' statements. There is a built-in hash of
|
14
|
-
# defaults, which will be lazily filled in if accessed and not yet
|
15
|
-
# set. (nil can count as set, though!). If they haven't been lazily
|
16
|
-
# set yet, then #provide will still fill them in. But you can also call
|
17
|
-
# fill_in_defaults! to fill all defaults in, if you know configuration
|
18
|
-
# files have all been loaded, and want to fill them in for inspection.
|
19
3
|
class Traject::Indexer
|
4
|
+
|
5
|
+
# A Hash of settings for a Traject::Indexer, which also ends up passed along
|
6
|
+
# to other objects Traject::Indexer interacts with.
|
7
|
+
#
|
8
|
+
# Enhanced with a few features from Hashie, to make it for
|
9
|
+
# instance string/symbol indifferent
|
10
|
+
#
|
11
|
+
# method #provide(key, value) is added, to do like settings[key] ||= value,
|
12
|
+
# set only if not already set (but unlike ||=, nil or false can count as already set)
|
13
|
+
#
|
14
|
+
# Also has an interesting 'defaults' system, meant to play along
|
15
|
+
# with configuration file 'provide' statements. There is a built-in hash of
|
16
|
+
# defaults, which will be lazily filled in if accessed and not yet
|
17
|
+
# set. (nil can count as set, though!). If they haven't been lazily
|
18
|
+
# set yet, then #provide will still fill them in. But you can also call
|
19
|
+
# fill_in_defaults! to fill all defaults in, if you know configuration
|
20
|
+
# files have all been loaded, and want to fill them in for inspection.
|
20
21
|
class Settings < Hash
|
21
22
|
include Hashie::Extensions::MergeInitializer # can init with hash
|
22
23
|
include Hashie::Extensions::IndifferentAccess
|
@@ -80,4 +81,4 @@ class Traject::Indexer
|
|
80
81
|
end.inspect
|
81
82
|
end
|
82
83
|
end
|
83
|
-
end
|
84
|
+
end
|
data/lib/traject/json_writer.rb
CHANGED
@@ -1,21 +1,42 @@
|
|
1
1
|
require 'json'
|
2
2
|
require 'traject/line_writer'
|
3
3
|
|
4
|
-
#
|
5
|
-
#
|
6
|
-
#
|
7
|
-
#
|
4
|
+
# The JsonWriter outputs one JSON hash per record, separated by newlines.
|
5
|
+
#
|
6
|
+
# It's newline delimited json, which should be suitable for being
|
7
|
+
# read by simple NDJ readers. (TODO: We have no checks right now to
|
8
|
+
# make sure the standard json serializers we're using don't put any
|
9
|
+
# internal newlines as whitespace in the json. Which would break NDJ
|
10
|
+
# reading. Should we?)
|
8
11
|
#
|
9
12
|
# Should be thread-safe (ie, multiple worker threads can be calling #put
|
10
|
-
# concurrently),
|
13
|
+
# concurrently), because output to file is wrapped in a mutex synchronize.
|
11
14
|
# This does not seem to affect performance much, as far as I could tell
|
12
15
|
# benchmarking.
|
13
16
|
#
|
14
|
-
#
|
15
|
-
#
|
17
|
+
# ## Settings
|
18
|
+
#
|
19
|
+
# * output_file A filename to send output; default will use stdout.
|
20
|
+
#
|
21
|
+
# * json_writer.pretty_print: [default: false]: Pretty-print (e.g., include newlines, indentation, etc.)
|
22
|
+
# each JSON record instead of just mashing it all together on one line. The default, no pretty-printing option
|
23
|
+
# produces one record per line, easy to process with another program.
|
24
|
+
#
|
25
|
+
# ## Example output
|
26
|
+
#
|
27
|
+
# Without pretty printing, you end up with something like this (just two records shown):
|
28
|
+
#
|
29
|
+
# {"id":["000001118"],"oclc":["ocm00085737"],"sdrnum":["sdr-nrlf.b170195454"],"isbn":["0137319924"],"lccn":["73120791"],"mainauthor":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"author2":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"authorSort":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author_top":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Edited by David A. Hamburg.","Hamburg, David A., 1925- ed."],"title":["Psychiatry as a behavioral science."],"title_a":["Psychiatry as a behavioral science."],"title_ab":["Psychiatry as a behavioral science."],"title_c":["Edited by David A. Hamburg."],"titleSort":["Psychiatry as a behavioral science"],"title_top":["Psychiatry as a behavioral science."],"title_rest":["A Spectrum book"],"series2":["A Spectrum book"],"callnumber":["RC327 .B41"],"broad_subject":["Medicine"],"pubdate":[1970],"format":["Book","Online","Print"],"publisher":["Prentice-Hall"],"language":["English"],"language008":["eng"],"editor":["David A. Hamburg."]}
|
30
|
+
# {"id":["000000794"],"oclc":["ocm00067181"],"lccn":["78011026"],"mainauthor":["Clark, Albert Curtis, 1859-1937."],"author":["Clark, Albert Curtis, 1859-1937."],"authorSort":["Clark, Albert Curtis, 1859-1937."],"author_top":["Clark, Albert Curtis, 1859-1937."],"title":["The descent of manuscripts.","descent of manuscripts."],"title_a":["The descent of manuscripts.","descent of manuscripts."],"title_ab":["The descent of manuscripts.","descent of manuscripts."],"titleSort":["descent of manuscripts"],"title_top":["The descent of manuscripts."],"callnumber":["PA47 .C45 1970"],"broad_subject":["Language & Literature"],"pubdate":[1918],"format":["Book","Online","Print"],"publisher":["Clarendon Press"],"language":["English"],"language008":["eng"]}
|
31
|
+
#
|
32
|
+
# ## Example configuration file
|
33
|
+
#
|
34
|
+
# require 'traject/json_writer'
|
16
35
|
#
|
17
|
-
#
|
18
|
-
#
|
36
|
+
# settings do
|
37
|
+
# provide "writer_class_name", "Traject::JsonWriter"
|
38
|
+
# provide "output_file", "out.json"
|
39
|
+
# end
|
19
40
|
class Traject::JsonWriter < Traject::LineWriter
|
20
41
|
|
21
42
|
def serialize(context)
|
@@ -25,6 +46,6 @@ class Traject::JsonWriter < Traject::LineWriter
|
|
25
46
|
else
|
26
47
|
JSON.generate(hash)
|
27
48
|
end
|
28
|
-
end
|
49
|
+
end
|
29
50
|
|
30
|
-
end
|
51
|
+
end
|
data/lib/traject/line_writer.rb
CHANGED
@@ -1,19 +1,19 @@
|
|
1
1
|
require 'thread'
|
2
2
|
|
3
3
|
# A writer for Traject::Indexer, that just writes out
|
4
|
-
# all the output as serialized text with #puts.
|
4
|
+
# all the output as serialized text with #puts.
|
5
5
|
#
|
6
6
|
# Should be thread-safe (ie, multiple worker threads can be calling #put
|
7
7
|
# concurrently), by wrapping write to actual output file in a mutex synchronize.
|
8
8
|
# This does not seem to affect performance much, as far as I could tell
|
9
9
|
# benchmarking.
|
10
10
|
#
|
11
|
-
# Output will be sent to settings["output_file"] string path, or else
|
12
|
-
# settings["output_stream"] (ruby IO object), or else stdout.
|
11
|
+
# Output will be sent to `settings["output_file"]` string path, or else
|
12
|
+
# `settings["output_stream"]` (ruby IO object), or else stdout.
|
13
13
|
#
|
14
14
|
# This class can be sub-classed to write out different serialized
|
15
15
|
# representations -- subclasses will just override the #serialize
|
16
|
-
# method. For instance, see JsonWriter.
|
16
|
+
# method. For instance, see JsonWriter.
|
17
17
|
class Traject::LineWriter
|
18
18
|
attr_reader :settings
|
19
19
|
attr_reader :write_mutex
|
@@ -29,7 +29,7 @@ class Traject::LineWriter
|
|
29
29
|
|
30
30
|
def serialize(context)
|
31
31
|
context.output_hash
|
32
|
-
end
|
32
|
+
end
|
33
33
|
|
34
34
|
def put(context)
|
35
35
|
serialized = serialize(context)
|
@@ -56,4 +56,4 @@ class Traject::LineWriter
|
|
56
56
|
@output_file.close unless (@output_file.nil? || @output_file.tty?)
|
57
57
|
end
|
58
58
|
|
59
|
-
end
|
59
|
+
end
|
data/lib/traject/macros/basic.rb
CHANGED
@@ -20,29 +20,33 @@ module Traject::Macros
|
|
20
20
|
# and others. By default, will de-duplicate results, but see :allow_duplicates
|
21
21
|
#
|
22
22
|
# * :first => true: take only first value
|
23
|
+
#
|
23
24
|
# * :translation_map => String: translate with named translation map looked up in load
|
24
25
|
# path, uses Traject::TranslationMap.new(translation_map_arg)
|
26
|
+
#
|
25
27
|
# * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
|
26
28
|
# have shown themselves useful with Marc, using Marc21.trim_punctuation
|
29
|
+
#
|
27
30
|
# * :default => String: if otherwise empty, add default value
|
31
|
+
#
|
28
32
|
# * :allow_duplicates => boolean, default false, if set to true then will avoid
|
29
33
|
# de-duplicating the result array (array.uniq!)
|
30
34
|
#
|
31
35
|
#
|
32
36
|
# Examples:
|
33
37
|
#
|
34
|
-
#
|
35
|
-
#
|
36
|
-
#
|
38
|
+
# to_field("title", extract_marc("245abcd", :trim_punctuation => true))
|
39
|
+
# to_field("id", extract_marc("001", :first => true))
|
40
|
+
# to_field("geo", extract_marc("040a", :separator => nil, :translation_map => "marc040"))
|
37
41
|
def extract_marc(spec, options = {})
|
38
|
-
|
42
|
+
|
39
43
|
# Raise an error if there are any invalid options, indicating a
|
40
44
|
# misspelled or illegal option, using a string instead of a symbol, etc.
|
41
|
-
|
45
|
+
|
42
46
|
unless (options.keys - EXTRACT_MARC_VALID_OPTIONS).empty?
|
43
47
|
raise RuntimeError.new("Illegal/Unknown argument '#{(options.keys - EXTRACT_MARC_VALID_OPTIONS).join(', ')}' in extract_marc at #{Traject::Util.extract_caller_location(caller.first)}")
|
44
48
|
end
|
45
|
-
|
49
|
+
|
46
50
|
only_first = options.delete(:first)
|
47
51
|
trim_punctuation = options.delete(:trim_punctuation)
|
48
52
|
default_value = options.delete(:default)
|
@@ -53,12 +57,12 @@ module Traject::Macros
|
|
53
57
|
# ones, and not have to create a new one per-execution.
|
54
58
|
#
|
55
59
|
# Benchmarking shows for MarcExtractor at least, there is
|
56
|
-
# significant performance advantage.
|
60
|
+
# significant performance advantage.
|
57
61
|
|
58
62
|
if translation_map_arg = options.delete(:translation_map)
|
59
63
|
translation_map = Traject::TranslationMap.new(translation_map_arg)
|
60
64
|
end
|
61
|
-
|
65
|
+
|
62
66
|
|
63
67
|
extractor = Traject::MarcExtractor.new(spec, options)
|
64
68
|
|
@@ -76,7 +80,7 @@ module Traject::Macros
|
|
76
80
|
if trim_punctuation
|
77
81
|
accumulator.collect! {|s| Marc21.trim_punctuation(s)}
|
78
82
|
end
|
79
|
-
|
83
|
+
|
80
84
|
unless allow_duplicates
|
81
85
|
accumulator.uniq!
|
82
86
|
end
|
@@ -84,14 +88,14 @@ module Traject::Macros
|
|
84
88
|
if default_value && accumulator.empty?
|
85
89
|
accumulator << default_value
|
86
90
|
end
|
87
|
-
|
91
|
+
|
88
92
|
end
|
89
93
|
end
|
90
94
|
# A list of symbols that are valid keys in the options hash
|
91
|
-
EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,
|
92
|
-
:allow_duplicates, :separator, :translation_map,
|
95
|
+
EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,
|
96
|
+
:allow_duplicates, :separator, :translation_map,
|
93
97
|
:alternate_script]
|
94
|
-
|
98
|
+
|
95
99
|
# Serializes complete marc record to a serialization format.
|
96
100
|
# required param :format,
|
97
101
|
# serialize_marc(:format => :binary)
|