traject 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +1 -0
- data/README.md +183 -191
- data/bench/bench.rb +1 -1
- data/doc/batch_execution.md +14 -0
- data/doc/extending.md +14 -12
- data/doc/indexing_rules.md +265 -0
- data/lib/traject/command_line.rb +12 -41
- data/lib/traject/debug_writer.rb +32 -13
- data/lib/traject/indexer.rb +101 -24
- data/lib/traject/indexer/settings.rb +18 -17
- data/lib/traject/json_writer.rb +32 -11
- data/lib/traject/line_writer.rb +6 -6
- data/lib/traject/macros/basic.rb +1 -1
- data/lib/traject/macros/marc21.rb +17 -13
- data/lib/traject/macros/marc21_semantics.rb +27 -25
- data/lib/traject/macros/marc_format_classifier.rb +39 -25
- data/lib/traject/marc4j_reader.rb +36 -22
- data/lib/traject/marc_extractor.rb +79 -75
- data/lib/traject/marc_reader.rb +33 -25
- data/lib/traject/mock_reader.rb +9 -10
- data/lib/traject/ndj_reader.rb +7 -7
- data/lib/traject/null_writer.rb +1 -1
- data/lib/traject/qualified_const_get.rb +12 -2
- data/lib/traject/solrj_writer.rb +61 -52
- data/lib/traject/thread_pool.rb +45 -45
- data/lib/traject/translation_map.rb +59 -27
- data/lib/traject/util.rb +3 -3
- data/lib/traject/version.rb +1 -1
- data/lib/traject/yaml_writer.rb +1 -1
- data/test/debug_writer_test.rb +7 -7
- data/test/indexer/each_record_test.rb +4 -4
- data/test/indexer/macros_marc21_semantics_test.rb +12 -12
- data/test/indexer/macros_marc21_test.rb +10 -10
- data/test/indexer/macros_test.rb +1 -1
- data/test/indexer/map_record_test.rb +6 -6
- data/test/indexer/read_write_test.rb +43 -4
- data/test/indexer/settings_test.rb +2 -2
- data/test/indexer/to_field_test.rb +8 -8
- data/test/marc4j_reader_test.rb +4 -4
- data/test/marc_extractor_test.rb +33 -25
- data/test/marc_format_classifier_test.rb +3 -3
- data/test/marc_reader_test.rb +2 -2
- data/test/test_helper.rb +3 -3
- data/test/test_support/demo_config.rb +52 -48
- data/test/translation_map_test.rb +22 -4
- data/test/translation_maps/bad_ruby.rb +2 -2
- data/test/translation_maps/both_map.rb +1 -1
- data/test/translation_maps/default_literal.rb +1 -1
- data/test/translation_maps/default_passthrough.rb +1 -1
- data/test/translation_maps/ruby_map.rb +1 -1
- metadata +7 -31
- data/doc/macros.md +0 -103
data/lib/traject/debug_writer.rb
CHANGED
@@ -1,21 +1,40 @@
|
|
1
1
|
require 'traject/line_writer'
|
2
2
|
|
3
|
-
#
|
4
|
-
#
|
5
|
-
#
|
3
|
+
# The Traject::DebugWriter produces a simple, human-readable output format that's
|
4
|
+
# also amenable to simple computer processing (e.g., with a simple grep).
|
5
|
+
# It's the output format used when you pass the --debug-mode switch to traject on the command line.
|
6
6
|
#
|
7
|
-
#
|
7
|
+
# Output format is three columns: id, output field, values (multiple
|
8
|
+
# values separated by '|'), and looks something like:
|
8
9
|
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
|
14
|
-
|
10
|
+
# 000001580 edition [1st ed.]
|
11
|
+
# 000001580 format Book | Online | Print
|
12
|
+
# 000001580 geo Great Britain
|
13
|
+
# 000001580 id 000001580
|
14
|
+
# 000001580 isbn 0631126902
|
15
|
+
#
|
16
|
+
# ## Settings
|
17
|
+
#
|
18
|
+
# * 'output_file' -- the name of the file to output to (command line -o shortcut).
|
19
|
+
# * 'output_stream' -- alternately, the IO stream
|
20
|
+
# * 'debug_writer.idfield' -- the solr field from which to pull the record ID (default: 'id')
|
21
|
+
# * 'debug_writer.format' -- How to format the id/solr field/values (default: '%-12s %-25s %s')
|
22
|
+
#
|
23
|
+
# By default, with neither output_file nor output_stream provided, writes to stdout, which
|
24
|
+
# can be useful for debugging diagnosis.
|
25
|
+
#
|
26
|
+
# ## Example configuration file
|
27
|
+
#
|
28
|
+
# require 'traject/debug_writer'
|
29
|
+
#
|
30
|
+
# settings do
|
31
|
+
# provide "writer_class_name", "Traject::DebugWriter"
|
32
|
+
# provide "output_file", "out.txt"
|
33
|
+
# end
|
15
34
|
class Traject::DebugWriter < Traject::LineWriter
|
16
35
|
DEFAULT_FORMAT = '%-12s %-25s %s'
|
17
36
|
DEFAULT_IDFIELD = 'id'
|
18
|
-
|
37
|
+
|
19
38
|
def serialize(context)
|
20
39
|
idfield = settings["debug_writer.idfield"] || DEFAULT_IDFIELD
|
21
40
|
format = settings['debug_writer.format'] || DEFAULT_FORMAT
|
@@ -23,6 +42,6 @@ class Traject::DebugWriter < Traject::LineWriter
|
|
23
42
|
lines = h.keys.sort.map {|k| format % [h[idfield].first, k, h[k].join(' | ')] }
|
24
43
|
lines.push "\n"
|
25
44
|
lines.join("\n")
|
26
|
-
end
|
45
|
+
end
|
27
46
|
|
28
|
-
end
|
47
|
+
end
|
data/lib/traject/indexer.rb
CHANGED
@@ -11,8 +11,38 @@ require 'traject/solrj_writer'
|
|
11
11
|
|
12
12
|
require 'traject/macros/marc21'
|
13
13
|
require 'traject/macros/basic'
|
14
|
+
|
15
|
+
# This class does indexing for traject: Getting input records from a Reader
|
16
|
+
# class, mapping the input records to an output hash, and then sending the output
|
17
|
+
# hash off somewhere (usually Solr) with a Writer class.
|
18
|
+
#
|
19
|
+
# Traject config files are `instance_eval`d in an Indexer object, so `self` in
|
20
|
+
# a config file is an Indexer, and any Indexer methods can be called.
|
21
|
+
#
|
22
|
+
# However, certain Indexer methods exist almost entirely for the purpose of
|
23
|
+
# being called in config files; these methods are part of the expected
|
24
|
+
# Domain-Specific Language ("DSL") for config files, and will ordinarily
|
25
|
+
# form the bulk or entirety of config files:
|
26
|
+
#
|
27
|
+
# * #settings
|
28
|
+
# * #to_field
|
29
|
+
# * #each_record
|
30
|
+
# * #after_processing
|
31
|
+
# * #logger (rarely used in config files, but in some cases to set up custom logging config)
|
32
|
+
#
|
33
|
+
# If accessing a Traject::Indexer programmatically (instead of via command line with
|
34
|
+
# config files), additional methods of note include:
|
35
|
+
#
|
36
|
+
# # to process a stream of input records from configured Reader,
|
37
|
+
# # to configured Writer:
|
38
|
+
# indexer.process(io_stream)
|
39
|
+
#
|
40
|
+
# # To map a single input record manually to an output_hash,
|
41
|
+
# # ignoring Readers and Writers
|
42
|
+
# hash = indexer.map_record(record)
|
14
43
|
#
|
15
|
-
#
|
44
|
+
#
|
45
|
+
# ## Readers and Writers
|
16
46
|
#
|
17
47
|
# The Indexer has a modularized architecture for readers and writers, for where
|
18
48
|
# source records come from (reader), and where output is sent to (writer).
|
@@ -73,28 +103,38 @@ class Traject::Indexer
|
|
73
103
|
def initialize(arg_settings = {})
|
74
104
|
@settings = Settings.new(arg_settings)
|
75
105
|
@index_steps = []
|
106
|
+
@after_processing_steps = []
|
76
107
|
end
|
77
108
|
|
78
|
-
#
|
79
|
-
#
|
80
|
-
#
|
109
|
+
# Part of the config file DSL, for writing settings values.
|
110
|
+
#
|
111
|
+
# The Indexer's settings consist of a hash-like Traject::Indexer::Settings
|
112
|
+
# object. The settings hash is *not* nested hashes, just one level
|
113
|
+
# of configuration settings. Keys are always strings, and by convention
|
114
|
+
# use "." for namespacing, eg `log.file`
|
81
115
|
#
|
82
|
-
# The settings method with no arguments returns that
|
116
|
+
# The settings method with no arguments returns that Settings object.
|
83
117
|
#
|
84
118
|
# With a hash and/or block argument, can be used to set
|
85
119
|
# new key/values. Each call merges onto the existing settings
|
86
|
-
# hash.
|
120
|
+
# hash. The block is `instance_eval`d in the context
|
121
|
+
# of the Traject::Indexer::Settings object.
|
87
122
|
#
|
88
123
|
# indexer.settings("a" => "a", "b" => "b")
|
89
124
|
#
|
90
125
|
# indexer.settings do
|
91
|
-
#
|
126
|
+
# provide "b", "new b"
|
92
127
|
# end
|
93
128
|
#
|
94
129
|
# indexer.settings #=> {"a" => "a", "b" => "new b"}
|
95
130
|
#
|
96
|
-
#
|
97
|
-
#
|
131
|
+
# Note the #provide method is defined on Traject::Indexer::Settings to
|
132
|
+
# write to a setting only if previously not set. You can also
|
133
|
+
# use #store to force over-writing even if an existing setting.
|
134
|
+
#
|
135
|
+
# Even with arguments, Indexer#settings returns the Settings object,
|
136
|
+
# hash too, so method calls can be chained.
|
137
|
+
#
|
98
138
|
def settings(new_settings = nil, &block)
|
99
139
|
@settings.merge!(new_settings) if new_settings
|
100
140
|
|
@@ -103,6 +143,24 @@ class Traject::Indexer
|
|
103
143
|
return @settings
|
104
144
|
end
|
105
145
|
|
146
|
+
# Part of DSL, used to define an indexing mapping. Register logic
|
147
|
+
# to be called for each record, and generate values for a particular
|
148
|
+
# output field.
|
149
|
+
def to_field(field_name, aLambda = nil, &block)
|
150
|
+
@index_steps << ToFieldStep.new(field_name, aLambda, block, Traject::Util.extract_caller_location(caller.first) )
|
151
|
+
end
|
152
|
+
|
153
|
+
# Part of DSL, register logic to be called for each record
|
154
|
+
def each_record(aLambda = nil, &block)
|
155
|
+
@index_steps << EachRecordStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first) )
|
156
|
+
end
|
157
|
+
|
158
|
+
# Part of DSL, register logic to be called once at the end
|
159
|
+
# of processing a stream of records.
|
160
|
+
def after_processing(aLambda = nil, &block)
|
161
|
+
@after_processing_steps << AfterProcessingStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first))
|
162
|
+
end
|
163
|
+
|
106
164
|
def logger
|
107
165
|
@logger ||= create_logger
|
108
166
|
end
|
@@ -149,20 +207,6 @@ class Traject::Indexer
|
|
149
207
|
return logger
|
150
208
|
end
|
151
209
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
# Used to define an indexing mapping.
|
157
|
-
def to_field(field_name, aLambda = nil, &block)
|
158
|
-
@index_steps << ToFieldStep.new(field_name, aLambda, block, Traject::Util.extract_caller_location(caller.first) )
|
159
|
-
end
|
160
|
-
|
161
|
-
def each_record(aLambda = nil, &block)
|
162
|
-
@index_steps << EachRecordStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first) )
|
163
|
-
end
|
164
|
-
|
165
|
-
|
166
210
|
# Processes a single record according to indexing rules set up in
|
167
211
|
# this indexer. Returns the output hash (a hash whose keys are
|
168
212
|
# string fields, and values are arrays of one or more values in that field)
|
@@ -293,7 +337,7 @@ class Traject::Indexer
|
|
293
337
|
# of having it be bound to the original variable in a non-threadsafe way.
|
294
338
|
# This is confusing, I might not be understanding things properly, but that's where i am.
|
295
339
|
#thread_pool.maybe_in_thread_pool &make_lambda(count, record, writer)
|
296
|
-
thread_pool.maybe_in_thread_pool do
|
340
|
+
thread_pool.maybe_in_thread_pool(record, settings, position) do |record, settings, position|
|
297
341
|
context = Context.new(:source_record => record, :settings => settings, :position => position)
|
298
342
|
context.logger = logger
|
299
343
|
map_to_context!(context)
|
@@ -317,6 +361,15 @@ class Traject::Indexer
|
|
317
361
|
|
318
362
|
writer.close if writer.respond_to?(:close)
|
319
363
|
|
364
|
+
@after_processing_steps.each do |step|
|
365
|
+
begin
|
366
|
+
step.execute
|
367
|
+
rescue Exception => e
|
368
|
+
logger.fatal("Unexpected exception #{e} when executing #{step}")
|
369
|
+
raise e
|
370
|
+
end
|
371
|
+
end
|
372
|
+
|
320
373
|
elapsed = Time.now - start_time
|
321
374
|
avg_rps = (count / elapsed)
|
322
375
|
logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
|
@@ -513,6 +566,30 @@ class Traject::Indexer
|
|
513
566
|
|
514
567
|
end
|
515
568
|
|
569
|
+
# A class representing a block of logic called after
|
570
|
+
# processing, registered with #after_processing
|
571
|
+
class AfterProcessingStep
|
572
|
+
attr_accessor :lambda, :block, :source_location
|
573
|
+
def initialize(lambda, block, source_location)
|
574
|
+
self.lambda = lambda
|
575
|
+
self.block = block
|
576
|
+
self.source_location = source_location
|
577
|
+
end
|
578
|
+
|
579
|
+
# after_processing steps get no args yielded to
|
580
|
+
# their blocks, they just are what they are.
|
581
|
+
def execute
|
582
|
+
[lambda, block].each do |aProc|
|
583
|
+
next unless aProc
|
584
|
+
aProc.call
|
585
|
+
end
|
586
|
+
end
|
587
|
+
|
588
|
+
def inspect
|
589
|
+
"(after_processing at #{self.source_location}"
|
590
|
+
end
|
591
|
+
end
|
592
|
+
|
516
593
|
|
517
594
|
|
518
595
|
|
@@ -1,22 +1,23 @@
|
|
1
1
|
require 'hashie'
|
2
2
|
|
3
|
-
# A Hash of settings for a Traject::Indexer, which also ends up passed along
|
4
|
-
# to other objects Traject::Indexer interacts with.
|
5
|
-
#
|
6
|
-
# Enhanced with a few features from Hashie, to make it for
|
7
|
-
# instance string/symbol indifferent
|
8
|
-
#
|
9
|
-
# #provide(key, value) is added, to do like settings[key] ||= value,
|
10
|
-
# set only if not already set (but unlike ||=, nil or false can count as already set)
|
11
|
-
#
|
12
|
-
# Also has an interesting 'defaults' system, meant to play along
|
13
|
-
# with configuration file 'provide' statements. There is a built-in hash of
|
14
|
-
# defaults, which will be lazily filled in if accessed and not yet
|
15
|
-
# set. (nil can count as set, though!). If they haven't been lazily
|
16
|
-
# set yet, then #provide will still fill them in. But you can also call
|
17
|
-
# fill_in_defaults! to fill all defaults in, if you know configuration
|
18
|
-
# files have all been loaded, and want to fill them in for inspection.
|
19
3
|
class Traject::Indexer
|
4
|
+
|
5
|
+
# A Hash of settings for a Traject::Indexer, which also ends up passed along
|
6
|
+
# to other objects Traject::Indexer interacts with.
|
7
|
+
#
|
8
|
+
# Enhanced with a few features from Hashie, to make it for
|
9
|
+
# instance string/symbol indifferent
|
10
|
+
#
|
11
|
+
# method #provide(key, value) is added, to do like settings[key] ||= value,
|
12
|
+
# set only if not already set (but unlike ||=, nil or false can count as already set)
|
13
|
+
#
|
14
|
+
# Also has an interesting 'defaults' system, meant to play along
|
15
|
+
# with configuration file 'provide' statements. There is a built-in hash of
|
16
|
+
# defaults, which will be lazily filled in if accessed and not yet
|
17
|
+
# set. (nil can count as set, though!). If they haven't been lazily
|
18
|
+
# set yet, then #provide will still fill them in. But you can also call
|
19
|
+
# fill_in_defaults! to fill all defaults in, if you know configuration
|
20
|
+
# files have all been loaded, and want to fill them in for inspection.
|
20
21
|
class Settings < Hash
|
21
22
|
include Hashie::Extensions::MergeInitializer # can init with hash
|
22
23
|
include Hashie::Extensions::IndifferentAccess
|
@@ -80,4 +81,4 @@ class Traject::Indexer
|
|
80
81
|
end.inspect
|
81
82
|
end
|
82
83
|
end
|
83
|
-
end
|
84
|
+
end
|
data/lib/traject/json_writer.rb
CHANGED
@@ -1,21 +1,42 @@
|
|
1
1
|
require 'json'
|
2
2
|
require 'traject/line_writer'
|
3
3
|
|
4
|
-
#
|
5
|
-
#
|
6
|
-
#
|
7
|
-
#
|
4
|
+
# The JsonWriter outputs one JSON hash per record, separated by newlines.
|
5
|
+
#
|
6
|
+
# It's newline delimited json, which should be suitable for being
|
7
|
+
# read by simple NDJ readers. (TODO: We have no checks right now to
|
8
|
+
# make sure the standard json serializers we're using don't put any
|
9
|
+
# internal newlines as whitespace in the json. Which would break NDJ
|
10
|
+
# reading. Should we?)
|
8
11
|
#
|
9
12
|
# Should be thread-safe (ie, multiple worker threads can be calling #put
|
10
|
-
# concurrently),
|
13
|
+
# concurrently), because output to file is wrapped in a mutex synchronize.
|
11
14
|
# This does not seem to effect performance much, as far as I could tell
|
12
15
|
# benchmarking.
|
13
16
|
#
|
14
|
-
#
|
15
|
-
#
|
17
|
+
# ## Settings
|
18
|
+
#
|
19
|
+
# * output_file A filename to send output; default will use stdout.
|
20
|
+
#
|
21
|
+
# * json_writer.pretty_print: [default: false]: Pretty-print (e.g., include newlines, indentation, etc.)
|
22
|
+
# each JSON record instead of just mashing it all together on one line. The default, no pretty-printing option
|
23
|
+
# produces one record per line, easy to process with another program.
|
24
|
+
#
|
25
|
+
# ## Example output
|
26
|
+
#
|
27
|
+
# Without pretty printing, you end up with something like this (just two records shown):
|
28
|
+
#
|
29
|
+
# {"id":["000001118"],"oclc":["ocm00085737"],"sdrnum":["sdr-nrlf.b170195454"],"isbn":["0137319924"],"lccn":["73120791"],"mainauthor":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"author2":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"authorSort":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author_top":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Edited by David A. Hamburg.","Hamburg, David A., 1925- ed."],"title":["Psychiatry as a behavioral science."],"title_a":["Psychiatry as a behavioral science."],"title_ab":["Psychiatry as a behavioral science."],"title_c":["Edited by David A. Hamburg."],"titleSort":["Psychiatry as a behavioral science"],"title_top":["Psychiatry as a behavioral science."],"title_rest":["A Spectrum book"],"series2":["A Spectrum book"],"callnumber":["RC327 .B41"],"broad_subject":["Medicine"],"pubdate":[1970],"format":["Book","Online","Print"],"publisher":["Prentice-Hall"],"language":["English"],"language008":["eng"],"editor":["David A. Hamburg."]}
|
30
|
+
# {"id":["000000794"],"oclc":["ocm00067181"],"lccn":["78011026"],"mainauthor":["Clark, Albert Curtis, 1859-1937."],"author":["Clark, Albert Curtis, 1859-1937."],"authorSort":["Clark, Albert Curtis, 1859-1937."],"author_top":["Clark, Albert Curtis, 1859-1937."],"title":["The descent of manuscripts.","descent of manuscripts."],"title_a":["The descent of manuscripts.","descent of manuscripts."],"title_ab":["The descent of manuscripts.","descent of manuscripts."],"titleSort":["descent of manuscripts"],"title_top":["The descent of manuscripts."],"callnumber":["PA47 .C45 1970"],"broad_subject":["Language & Literature"],"pubdate":[1918],"format":["Book","Online","Print"],"publisher":["Clarendon Press"],"language":["English"],"language008":["eng"]}
|
31
|
+
#
|
32
|
+
# ## Example configuration file
|
33
|
+
#
|
34
|
+
# require 'traject/json_writer'
|
16
35
|
#
|
17
|
-
#
|
18
|
-
#
|
36
|
+
# settings do
|
37
|
+
# provide "writer_class_name", "Traject::JsonWriter"
|
38
|
+
# provide "output_file", "out.json"
|
39
|
+
# end
|
19
40
|
class Traject::JsonWriter < Traject::LineWriter
|
20
41
|
|
21
42
|
def serialize(context)
|
@@ -25,6 +46,6 @@ class Traject::JsonWriter < Traject::LineWriter
|
|
25
46
|
else
|
26
47
|
JSON.generate(hash)
|
27
48
|
end
|
28
|
-
end
|
49
|
+
end
|
29
50
|
|
30
|
-
end
|
51
|
+
end
|
data/lib/traject/line_writer.rb
CHANGED
@@ -1,19 +1,19 @@
|
|
1
1
|
require 'thread'
|
2
2
|
|
3
3
|
# A writer for Traject::Indexer, that just writes out
|
4
|
-
# all the output as serialized text with #puts.
|
4
|
+
# all the output as serialized text with #puts.
|
5
5
|
#
|
6
6
|
# Should be thread-safe (ie, multiple worker threads can be calling #put
|
7
7
|
# concurrently), by wrapping write to actual output file in a mutex synchronize.
|
8
8
|
# This does not seem to affect performance much, as far as I could tell
|
9
9
|
# benchmarking.
|
10
10
|
#
|
11
|
-
# Output will be sent to settings["output_file"] string path, or else
|
12
|
-
# settings["output_stream"] (ruby IO object), or else stdout.
|
11
|
+
# Output will be sent to `settings["output_file"]` string path, or else
|
12
|
+
# `settings["output_stream"]` (ruby IO object), or else stdout.
|
13
13
|
#
|
14
14
|
# This class can be sub-classed to write out different serialized
|
15
15
|
# representations -- subclasses will just override the #serialize
|
16
|
-
# method. For instance, see JsonWriter.
|
16
|
+
# method. For instance, see JsonWriter.
|
17
17
|
class Traject::LineWriter
|
18
18
|
attr_reader :settings
|
19
19
|
attr_reader :write_mutex
|
@@ -29,7 +29,7 @@ class Traject::LineWriter
|
|
29
29
|
|
30
30
|
def serialize(context)
|
31
31
|
context.output_hash
|
32
|
-
end
|
32
|
+
end
|
33
33
|
|
34
34
|
def put(context)
|
35
35
|
serialized = serialize(context)
|
@@ -56,4 +56,4 @@ class Traject::LineWriter
|
|
56
56
|
@output_file.close unless (@output_file.nil? || @output_file.tty?)
|
57
57
|
end
|
58
58
|
|
59
|
-
end
|
59
|
+
end
|
data/lib/traject/macros/basic.rb
CHANGED
@@ -20,29 +20,33 @@ module Traject::Macros
|
|
20
20
|
# and others. By default, will de-duplicate results, but see :allow_duplicates
|
21
21
|
#
|
22
22
|
# * :first => true: take only first value
|
23
|
+
#
|
23
24
|
# * :translation_map => String: translate with named translation map looked up in load
|
24
25
|
# path, uses Traject::TranslationMap.new(translation_map_arg)
|
26
|
+
#
|
25
27
|
# * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
|
26
28
|
# have shown themselves useful with Marc, using Marc21.trim_punctuation
|
29
|
+
#
|
27
30
|
# * :default => String: if otherwise empty, add default value
|
31
|
+
#
|
28
32
|
# * :allow_duplicates => boolean, default false, if set to true then will avoid
|
29
33
|
# de-duplicating the result array (array.uniq!)
|
30
34
|
#
|
31
35
|
#
|
32
36
|
# Examples:
|
33
37
|
#
|
34
|
-
#
|
35
|
-
#
|
36
|
-
#
|
38
|
+
# to_field "title", extract_marc("245abcd", :trim_punctuation => true)
|
39
|
+
# to_field "id", extract_marc("001", :first => true)
|
40
|
+
# to_field "geo", extract_marc("040a", :separator => nil, :translation_map => "marc040")
|
37
41
|
def extract_marc(spec, options = {})
|
38
|
-
|
42
|
+
|
39
43
|
# Raise an error if there are any invalid options, indicating a
|
40
44
|
# misspelled or illegal option, using a string instead of a symbol, etc.
|
41
|
-
|
45
|
+
|
42
46
|
unless (options.keys - EXTRACT_MARC_VALID_OPTIONS).empty?
|
43
47
|
raise RuntimeError.new("Illegal/Unknown argument '#{(options.keys - EXTRACT_MARC_VALID_OPTIONS).join(', ')}' in extract_marc at #{Traject::Util.extract_caller_location(caller.first)}")
|
44
48
|
end
|
45
|
-
|
49
|
+
|
46
50
|
only_first = options.delete(:first)
|
47
51
|
trim_punctuation = options.delete(:trim_punctuation)
|
48
52
|
default_value = options.delete(:default)
|
@@ -53,12 +57,12 @@ module Traject::Macros
|
|
53
57
|
# ones, and not have to create a new one per-execution.
|
54
58
|
#
|
55
59
|
# Benchmarking shows for MarcExtractor at least, there is
|
56
|
-
# significant performance advantage.
|
60
|
+
# significant performance advantage.
|
57
61
|
|
58
62
|
if translation_map_arg = options.delete(:translation_map)
|
59
63
|
translation_map = Traject::TranslationMap.new(translation_map_arg)
|
60
64
|
end
|
61
|
-
|
65
|
+
|
62
66
|
|
63
67
|
extractor = Traject::MarcExtractor.new(spec, options)
|
64
68
|
|
@@ -76,7 +80,7 @@ module Traject::Macros
|
|
76
80
|
if trim_punctuation
|
77
81
|
accumulator.collect! {|s| Marc21.trim_punctuation(s)}
|
78
82
|
end
|
79
|
-
|
83
|
+
|
80
84
|
unless allow_duplicates
|
81
85
|
accumulator.uniq!
|
82
86
|
end
|
@@ -84,14 +88,14 @@ module Traject::Macros
|
|
84
88
|
if default_value && accumulator.empty?
|
85
89
|
accumulator << default_value
|
86
90
|
end
|
87
|
-
|
91
|
+
|
88
92
|
end
|
89
93
|
end
|
90
94
|
# A list of symbols that are valid keys in the options hash
|
91
|
-
EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,
|
92
|
-
:allow_duplicates, :separator, :translation_map,
|
95
|
+
EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,
|
96
|
+
:allow_duplicates, :separator, :translation_map,
|
93
97
|
:alternate_script]
|
94
|
-
|
98
|
+
|
95
99
|
# Serializes complete marc record to a serialization format.
|
96
100
|
# required param :format,
|
97
101
|
# serialize_marc(:format => :binary)
|