traject 2.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +27 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +461 -0
- data/Rakefile +21 -0
- data/bench/bench.rb +30 -0
- data/bin/traject +16 -0
- data/doc/batch_execution.md +243 -0
- data/doc/extending.md +190 -0
- data/doc/indexing_rules.md +265 -0
- data/doc/other_commands.md +47 -0
- data/doc/settings.md +101 -0
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject.rb +11 -0
- data/lib/traject/command_line.rb +301 -0
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/debug_writer.rb +47 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +613 -0
- data/lib/traject/indexer/settings.rb +110 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/line_writer.rb +63 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +223 -0
- data/lib/traject/macros/marc21_semantics.rb +584 -0
- data/lib/traject/macros/marc_format_classifier.rb +197 -0
- data/lib/traject/marc_extractor.rb +410 -0
- data/lib/traject/marc_reader.rb +89 -0
- data/lib/traject/mock_reader.rb +97 -0
- data/lib/traject/ndj_reader.rb +40 -0
- data/lib/traject/null_writer.rb +22 -0
- data/lib/traject/qualified_const_get.rb +40 -0
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +161 -0
- data/lib/traject/translation_map.rb +267 -0
- data/lib/traject/util.rb +52 -0
- data/lib/traject/version.rb +3 -0
- data/lib/traject/yaml_writer.rb +9 -0
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/each_record_test.rb +59 -0
- data/test/indexer/macros_marc21_semantics_test.rb +391 -0
- data/test/indexer/macros_marc21_test.rb +190 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +209 -0
- data/test/indexer/read_write_test.rb +101 -0
- data/test/indexer/settings_test.rb +152 -0
- data/test/indexer/to_field_test.rb +77 -0
- data/test/marc_extractor_test.rb +412 -0
- data/test/marc_format_classifier_test.rb +98 -0
- data/test/marc_reader_test.rb +110 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +90 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/bad_utf_byte.utf8.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +155 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/escaped_character_reference.marc8.marc +1 -0
- data/test/test_support/george_eliot.marc +1 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/nature.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/test_data.utf8.json +30 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/test_support/test_data.utf8.mrc.gz +0 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +225 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +47 -0
- metadata +382 -0
@@ -0,0 +1,613 @@
|
|
1
|
+
require 'yell'
|
2
|
+
|
3
|
+
require 'traject'
|
4
|
+
require 'traject/qualified_const_get'
|
5
|
+
require 'traject/thread_pool'
|
6
|
+
|
7
|
+
require 'traject/indexer/settings'
|
8
|
+
require 'traject/marc_reader'
|
9
|
+
require 'traject/json_writer'
|
10
|
+
require 'traject/solr_json_writer'
|
11
|
+
|
12
|
+
require 'traject/macros/marc21'
|
13
|
+
require 'traject/macros/basic'
|
14
|
+
|
15
|
+
if defined? JRUBY_VERSION
|
16
|
+
require 'traject/marc4j_reader'
|
17
|
+
end
|
18
|
+
|
19
|
+
# This class does indexing for traject: Getting input records from a Reader
|
20
|
+
# class, mapping the input records to an output hash, and then sending the output
|
21
|
+
# hash off somewhere (usually Solr) with a Writer class.
|
22
|
+
#
|
23
|
+
# Traject config files are `instance_eval`d in an Indexer object, so `self` in
|
24
|
+
# a config file is an Indexer, and any Indexer methods can be called.
|
25
|
+
#
|
26
|
+
# However, certain Indexer methods exist almost entirely for the purpose of
|
27
|
+
# being called in config files; these methods are part of the expected
|
28
|
+
# Domain-Specific Language ("DSL") for config files, and will ordinarily
|
29
|
+
# form the bulk or entirety of config files:
|
30
|
+
#
|
31
|
+
# * #settings
|
32
|
+
# * #to_field
|
33
|
+
# * #each_record
|
34
|
+
# * #after_processing
|
35
|
+
# * #logger (rarely used in config files, but in some cases to set up custom logging config)
|
36
|
+
#
|
37
|
+
# If accessing a Traject::Indexer programmatically (instead of via command line with
|
38
|
+
# config files), additional methods of note include:
|
39
|
+
#
|
40
|
+
# # to process a stream of input records from configured Reader,
|
41
|
+
# # to configured Writer:
|
42
|
+
# indexer.process(io_stream)
|
43
|
+
#
|
44
|
+
# # To map a single input record manually to an output_hash,
|
45
|
+
# # ignoring Readers and Writers
|
46
|
+
# hash = indexer.map_record(record)
|
47
|
+
#
|
48
|
+
#
|
49
|
+
# ## Readers and Writers
|
50
|
+
#
|
51
|
+
# The Indexer has a modularized architecture for readers and writers, for where
|
52
|
+
# source records come from (reader), and where output is sent to (writer).
|
53
|
+
#
|
54
|
+
# A Reader is any class that:
|
55
|
+
# 1) Has a two-argument initializer taking an IO stream and a Settings hash
|
56
|
+
# 2) Responds to the usual ruby #each, returning a source record from each #each.
|
57
|
+
# (Including Enumerable is prob a good idea too)
|
58
|
+
#
|
59
|
+
# The default reader is the Traject::MarcReader, whose behavior is
|
60
|
+
# further customized by several settings in the Settings hash. Jruby users
|
61
|
+
# with specialized needs may want to look at the gem traject-marc4j_reader.
|
62
|
+
#
|
63
|
+
# Alternate readers can be set directly with the #reader_class= method, or
|
64
|
+
# with the "reader_class_name" Setting, a String name of a class
|
65
|
+
# meeting the reader contract.
|
66
|
+
#
|
67
|
+
#
|
68
|
+
# A Writer is any class that:
|
69
|
+
# 1) Has a one-argument initializer taking a Settings hash. (The logger
|
70
|
+
# is provided to the Writer in settings["logger"])
|
71
|
+
# 2) Responds to a one argument #put method, where the argument is
|
72
|
+
# a Traject::Indexer::Context, containing an #output_hash
|
73
|
+
# hash of mapped keys/values. The writer should write them
|
74
|
+
# to the appropriate place.
|
75
|
+
# 3) Responds to a #close method, called when we're done.
|
76
|
+
# 4) Optionally implements a #skipped_record_count method, returning int count of records
|
77
|
+
# that were skipped due to errors (and presumably logged)
|
78
|
+
#
|
79
|
+
# Traject packages one solr writer: traject/solr_json_writer, which sends
|
80
|
+
# in json format and works under both ruby and jruby, but only with solr version
|
81
|
+
# >= 3.2. To index to an older solr installation, you'll need to use jruby and
|
82
|
+
# install the gem traject-solrj_writer, which uses the solrj .jar underneath.
|
83
|
+
#
|
84
|
+
# You can set alternate writers by setting a Class object directly
|
85
|
+
# with the #writer_class method, or by the 'writer_class_name' Setting,
|
86
|
+
# with a String name of class meeting the Writer contract. There are several
|
87
|
+
# that ship with traject itself:
|
88
|
+
#
|
89
|
+
# * traject/json_writer (Traject::JsonWriter) -- write newline-delimied json files.
|
90
|
+
# * traject/yaml_writer (Traject::YamlWriter) -- write pretty yaml file; very human-readable
|
91
|
+
# * traject/debug_writer (Traject::DebugWriter) -- write a tab-delimited file where
|
92
|
+
# each line consists of the id, field, and value(s).
|
93
|
+
# * traject/delimited_writer and traject/csv_writer -- write character-delimited files
|
94
|
+
# (default is tab-delimited) or comma-separated-value files.
|
95
|
+
#
|
96
|
+
class Traject::Indexer
|
97
|
+
|
98
|
+
# Arity error on a passed block
|
99
|
+
class ArityError < ArgumentError; end
|
100
|
+
class NamingError < ArgumentError; end
|
101
|
+
|
102
|
+
|
103
|
+
|
104
|
+
include Traject::QualifiedConstGet
|
105
|
+
|
106
|
+
attr_writer :reader_class, :writer_class
|
107
|
+
|
108
|
+
# For now we hard-code these basic macro's included
|
109
|
+
# TODO, make these added with extend per-indexer,
|
110
|
+
# added by default but easily turned off (or have other
|
111
|
+
# default macro modules provided)
|
112
|
+
include Traject::Macros::Marc21
|
113
|
+
include Traject::Macros::Basic
|
114
|
+
|
115
|
+
|
116
|
+
# optional hash or Traject::Indexer::Settings object of settings.
|
117
|
+
def initialize(arg_settings = {})
|
118
|
+
@settings = Settings.new(arg_settings)
|
119
|
+
@index_steps = []
|
120
|
+
@after_processing_steps = []
|
121
|
+
end
|
122
|
+
|
123
|
+
# Part of the config file DSL, for writing settings values.
|
124
|
+
#
|
125
|
+
# The Indexer's settings consist of a hash-like Traject::Settings
|
126
|
+
# object. The settings hash is *not* nested hashes, just one level
|
127
|
+
# of configuration settings. Keys are always strings, and by convention
|
128
|
+
# use "." for namespacing, eg `log.file`
|
129
|
+
#
|
130
|
+
# The settings method with no arguments returns that Settings object.
|
131
|
+
#
|
132
|
+
# With a hash and/or block argument, can be used to set
|
133
|
+
# new key/values. Each call merges onto the existing settings
|
134
|
+
# hash. The block is `instance_eval`d in the context
|
135
|
+
# of the Traject::Settings object.
|
136
|
+
#
|
137
|
+
# indexer.settings("a" => "a", "b" => "b")
|
138
|
+
#
|
139
|
+
# indexer.settings do
|
140
|
+
# provide "b", "new b"
|
141
|
+
# end
|
142
|
+
#
|
143
|
+
# indexer.settings #=> {"a" => "a", "b" => "new b"}
|
144
|
+
#
|
145
|
+
# Note the #provide method is defined on Traject::Settings to
|
146
|
+
# write to a setting only if previously not set. You can also
|
147
|
+
# use #store to force over-writing even if an existing setting.
|
148
|
+
#
|
149
|
+
# Even with arguments, Indexer#settings returns the Settings object,
|
150
|
+
# hash too, so can method calls can be chained.
|
151
|
+
#
|
152
|
+
def settings(new_settings = nil, &block)
|
153
|
+
@settings.merge!(new_settings) if new_settings
|
154
|
+
|
155
|
+
@settings.instance_eval &block if block
|
156
|
+
|
157
|
+
return @settings
|
158
|
+
end
|
159
|
+
|
160
|
+
# Part of DSL, used to define an indexing mapping. Register logic
|
161
|
+
# to be called for each record, and generate values for a particular
|
162
|
+
# output field.
|
163
|
+
def to_field(field_name, aLambda = nil, &block)
|
164
|
+
@index_steps << ToFieldStep.new(field_name, aLambda, block, Traject::Util.extract_caller_location(caller.first) )
|
165
|
+
end
|
166
|
+
|
167
|
+
# Part of DSL, register logic to be called for each record
|
168
|
+
def each_record(aLambda = nil, &block)
|
169
|
+
@index_steps << EachRecordStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first) )
|
170
|
+
end
|
171
|
+
|
172
|
+
# Part of DSL, register logic to be called once at the end
|
173
|
+
# of processing a stream of records.
|
174
|
+
def after_processing(aLambda = nil, &block)
|
175
|
+
@after_processing_steps << AfterProcessingStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first))
|
176
|
+
end
|
177
|
+
|
178
|
+
def logger
|
179
|
+
@logger ||= create_logger
|
180
|
+
end
|
181
|
+
attr_writer :logger
|
182
|
+
|
183
|
+
|
184
|
+
def logger_format
|
185
|
+
format = settings["log.format"] || "%d %5L %m"
|
186
|
+
format = case format
|
187
|
+
when "false" then false
|
188
|
+
when "" then nil
|
189
|
+
else format
|
190
|
+
end
|
191
|
+
end
|
192
|
+
|
193
|
+
# Create logger according to settings
|
194
|
+
def create_logger
|
195
|
+
|
196
|
+
logger_level = settings["log.level"] || "info"
|
197
|
+
|
198
|
+
# log everything to STDERR or specified logfile
|
199
|
+
logger = Yell::Logger.new(:null)
|
200
|
+
logger.format = logger_format
|
201
|
+
logger.level = logger_level
|
202
|
+
|
203
|
+
logger_destination = settings["log.file"] || "STDERR"
|
204
|
+
# We intentionally repeat the logger_level
|
205
|
+
# on the adapter, so it will stay there if overall level
|
206
|
+
# is changed.
|
207
|
+
case logger_destination
|
208
|
+
when "STDERR"
|
209
|
+
logger.adapter :stderr, level: logger_level, format: logger_format
|
210
|
+
when "STDOUT"
|
211
|
+
logger.adapter :stdout, level: logger_level, format: logger_format
|
212
|
+
else
|
213
|
+
logger.adapter :file, logger_destination, level: logger_level, format: logger_format
|
214
|
+
end
|
215
|
+
|
216
|
+
|
217
|
+
# ADDITIONALLY log error and higher to....
|
218
|
+
if settings["log.error_file"]
|
219
|
+
logger.adapter :file, settings["log.error_file"], :level => 'gte.error'
|
220
|
+
end
|
221
|
+
|
222
|
+
return logger
|
223
|
+
end
|
224
|
+
|
225
|
+
# Processes a single record according to indexing rules set up in
|
226
|
+
# this indexer. Returns the output hash (a hash whose keys are
|
227
|
+
# string fields, and values are arrays of one or more values in that field)
|
228
|
+
#
|
229
|
+
# This is a convenience shortcut for #map_to_context! -- use that one
|
230
|
+
# if you want to provide additional context
|
231
|
+
# like position, and/or get back the full context.
|
232
|
+
def map_record(record)
|
233
|
+
context = Context.new(:source_record => record, :settings => settings)
|
234
|
+
map_to_context!(context)
|
235
|
+
return context.output_hash
|
236
|
+
end
|
237
|
+
|
238
|
+
# Maps a single record INTO the second argument, a Traject::Indexer::Context.
|
239
|
+
#
|
240
|
+
# Context must be passed with a #source_record and #settings, and optionally
|
241
|
+
# a #position.
|
242
|
+
#
|
243
|
+
# Context will be mutated by this method, most significantly by adding
|
244
|
+
# an #output_hash, a hash from fieldname to array of values in that field.
|
245
|
+
#
|
246
|
+
# Pass in a context with a set #position if you want that to be available
|
247
|
+
# to mapping routines.
|
248
|
+
#
|
249
|
+
# Returns the context passed in as second arg, as a convenience for chaining etc.
|
250
|
+
|
251
|
+
def map_to_context!(context)
|
252
|
+
@index_steps.each do |index_step|
|
253
|
+
# Don't bother if we're skipping this record
|
254
|
+
break if context.skip?
|
255
|
+
|
256
|
+
context.index_step = index_step
|
257
|
+
accumulator = log_mapping_errors(context, index_step) do
|
258
|
+
index_step.execute(context) # will always return [] for an each_record step
|
259
|
+
end
|
260
|
+
|
261
|
+
if accumulator.size > 0
|
262
|
+
accumulator.compact!
|
263
|
+
(context.output_hash[index_step.field_name] ||= []).concat accumulator
|
264
|
+
end
|
265
|
+
|
266
|
+
context.index_step = nil
|
267
|
+
end
|
268
|
+
|
269
|
+
return context
|
270
|
+
end
|
271
|
+
|
272
|
+
# just a wrapper that captures and records any unexpected
|
273
|
+
# errors raised in mapping, along with contextual information
|
274
|
+
# on record and location in source file of mapping rule.
|
275
|
+
#
|
276
|
+
# Re-raises error at the moment.
|
277
|
+
#
|
278
|
+
# log_mapping_errors(context, index_step) do
|
279
|
+
# all_sorts_of_stuff # that will have errors logged
|
280
|
+
# end
|
281
|
+
def log_mapping_errors(context, index_step)
|
282
|
+
begin
|
283
|
+
yield
|
284
|
+
rescue Exception => e
|
285
|
+
msg = "Unexpected error on record id `#{id_string(context.source_record)}` at file position #{context.position}\n"
|
286
|
+
msg += " while executing #{index_step.inspect}\n"
|
287
|
+
msg += Traject::Util.exception_to_log_message(e)
|
288
|
+
|
289
|
+
logger.error msg
|
290
|
+
begin
|
291
|
+
logger.debug "Record: " + context.source_record.to_s
|
292
|
+
rescue Exception => marc_to_s_exception
|
293
|
+
logger.debug "(Could not log record, #{marc_to_s_exception})"
|
294
|
+
end
|
295
|
+
|
296
|
+
raise e
|
297
|
+
end
|
298
|
+
end
|
299
|
+
|
300
|
+
# get a printable id from record for error logging.
|
301
|
+
# Maybe override this for a future XML version.
|
302
|
+
def id_string(record)
|
303
|
+
record && record['001'] && record['001'].value.to_s
|
304
|
+
end
|
305
|
+
|
306
|
+
# Processes a stream of records, reading from the configured Reader,
|
307
|
+
# mapping according to configured mapping rules, and then writing
|
308
|
+
# to configured Writer.
|
309
|
+
#
|
310
|
+
# returns 'false' as a signal to command line to return non-zero exit code
|
311
|
+
# for some reason (reason found in logs, presumably). This particular mechanism
|
312
|
+
# is open to complexification, starting simple. We do need SOME way to return
|
313
|
+
# non-zero to command line.
|
314
|
+
#
|
315
|
+
def process(io_stream)
|
316
|
+
settings.fill_in_defaults!
|
317
|
+
|
318
|
+
count = 0
|
319
|
+
start_time = batch_start_time = Time.now
|
320
|
+
logger.debug "beginning Indexer#process with settings: #{settings.inspect}"
|
321
|
+
|
322
|
+
reader = self.reader!(io_stream)
|
323
|
+
writer = self.writer!
|
324
|
+
|
325
|
+
|
326
|
+
processing_threads = settings["processing_thread_pool"].to_i
|
327
|
+
thread_pool = Traject::ThreadPool.new(processing_threads)
|
328
|
+
|
329
|
+
logger.info " Indexer with #{processing_threads} processing threads, reader: #{reader.class.name} and writer: #{writer.class.name}"
|
330
|
+
|
331
|
+
log_batch_size = settings["log.batch_size"] && settings["log.batch_size"].to_i
|
332
|
+
|
333
|
+
reader.each do |record; position|
|
334
|
+
count += 1
|
335
|
+
|
336
|
+
# have to use a block local var, so the changing `count` one
|
337
|
+
# doesn't get caught in the closure. Weird, yeah.
|
338
|
+
position = count
|
339
|
+
|
340
|
+
thread_pool.raise_collected_exception!
|
341
|
+
|
342
|
+
if settings["debug_ascii_progress"].to_s == "true"
|
343
|
+
$stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
|
344
|
+
end
|
345
|
+
|
346
|
+
if log_batch_size && (count % log_batch_size == 0)
|
347
|
+
batch_rps = log_batch_size / (Time.now - batch_start_time)
|
348
|
+
overall_rps = count / (Time.now - start_time)
|
349
|
+
logger.send(settings["log.batch_size.severity"].downcase.to_sym, "Traject::Indexer#process, read #{count} records at id:#{id_string(record)}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
|
350
|
+
batch_start_time = Time.now
|
351
|
+
end
|
352
|
+
|
353
|
+
# we have to use this weird lambda to properly "capture" the count, instead
|
354
|
+
# of having it be bound to the original variable in a non-threadsafe way.
|
355
|
+
# This is confusing, I might not be understanding things properly, but that's where i am.
|
356
|
+
#thread_pool.maybe_in_thread_pool &make_lambda(count, record, writer)
|
357
|
+
thread_pool.maybe_in_thread_pool(record, settings, position) do |record, settings, position|
|
358
|
+
context = Context.new(:source_record => record, :settings => settings, :position => position)
|
359
|
+
context.logger = logger
|
360
|
+
map_to_context!(context)
|
361
|
+
if context.skip?
|
362
|
+
log_skip(context)
|
363
|
+
else
|
364
|
+
writer.put context
|
365
|
+
end
|
366
|
+
|
367
|
+
end
|
368
|
+
|
369
|
+
end
|
370
|
+
$stderr.write "\n" if settings["debug_ascii_progress"].to_s == "true"
|
371
|
+
|
372
|
+
logger.debug "Shutting down #processing mapper threadpool..."
|
373
|
+
thread_pool.shutdown_and_wait
|
374
|
+
logger.debug "#processing mapper threadpool shutdown complete."
|
375
|
+
|
376
|
+
thread_pool.raise_collected_exception!
|
377
|
+
|
378
|
+
|
379
|
+
writer.close if writer.respond_to?(:close)
|
380
|
+
|
381
|
+
@after_processing_steps.each do |step|
|
382
|
+
begin
|
383
|
+
step.execute
|
384
|
+
rescue Exception => e
|
385
|
+
logger.fatal("Unexpected exception #{e} when executing #{step}")
|
386
|
+
raise e
|
387
|
+
end
|
388
|
+
end
|
389
|
+
|
390
|
+
elapsed = Time.now - start_time
|
391
|
+
avg_rps = (count / elapsed)
|
392
|
+
logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
|
393
|
+
|
394
|
+
if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
|
395
|
+
logger.error "Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
|
396
|
+
return false
|
397
|
+
end
|
398
|
+
|
399
|
+
return true
|
400
|
+
end
|
401
|
+
|
402
|
+
# Log that the current record is being skipped, using
|
403
|
+
# data in context.position and context.skipmessage
|
404
|
+
def log_skip(context)
|
405
|
+
logger.debug "Skipped record #{context.position}: #{context.skipmessage}"
|
406
|
+
end
|
407
|
+
|
408
|
+
def reader_class
|
409
|
+
unless defined? @reader_class
|
410
|
+
@reader_class = qualified_const_get(settings["reader_class_name"])
|
411
|
+
end
|
412
|
+
return @reader_class
|
413
|
+
end
|
414
|
+
|
415
|
+
def writer_class
|
416
|
+
unless defined? @writer_class
|
417
|
+
@writer_class = qualified_const_get(settings["writer_class_name"])
|
418
|
+
end
|
419
|
+
return @writer_class
|
420
|
+
end
|
421
|
+
|
422
|
+
# Instantiate a Traject Reader, using class set
|
423
|
+
# in #reader_class, initialized with io_stream passed in
|
424
|
+
def reader!(io_stream)
|
425
|
+
return reader_class.new(io_stream, settings.merge("logger" => logger))
|
426
|
+
end
|
427
|
+
|
428
|
+
# Instantiate a Traject Writer, using class set in #writer_class
|
429
|
+
def writer!
|
430
|
+
return writer_class.new(settings.merge("logger" => logger))
|
431
|
+
end
|
432
|
+
|
433
|
+
# Represents the context of a specific record being indexed, passed
|
434
|
+
# to indexing logic blocks
|
435
|
+
#
|
436
|
+
class Context
|
437
|
+
def initialize(hash_init = {})
|
438
|
+
# TODO, argument checking for required args?
|
439
|
+
|
440
|
+
self.clipboard = {}
|
441
|
+
self.output_hash = {}
|
442
|
+
|
443
|
+
hash_init.each_pair do |key, value|
|
444
|
+
self.send("#{key}=", value)
|
445
|
+
end
|
446
|
+
|
447
|
+
@skip = false
|
448
|
+
end
|
449
|
+
|
450
|
+
attr_accessor :clipboard, :output_hash, :logger
|
451
|
+
attr_accessor :index_step, :source_record, :settings
|
452
|
+
# 1-based position in stream of processed records.
|
453
|
+
attr_accessor :position
|
454
|
+
|
455
|
+
# Should we be skipping this record?
|
456
|
+
attr_accessor :skipmessage
|
457
|
+
|
458
|
+
# Set the fact that this record should be skipped, with an
|
459
|
+
# optional message
|
460
|
+
def skip!(msg = '(no message given)')
|
461
|
+
@skipmessage = msg
|
462
|
+
@skip = true
|
463
|
+
end
|
464
|
+
|
465
|
+
# Should we skip this record?
|
466
|
+
def skip?
|
467
|
+
@skip
|
468
|
+
end
|
469
|
+
|
470
|
+
end
|
471
|
+
|
472
|
+
|
473
|
+
|
474
|
+
# An indexing step definition, including its source location
|
475
|
+
# for logging
|
476
|
+
#
|
477
|
+
# This one represents an "each_record" step, a subclass below
|
478
|
+
# for "to_field"
|
479
|
+
#
|
480
|
+
# source_location is just a string with filename and line number for
|
481
|
+
# showing to devs in debugging.
|
482
|
+
class EachRecordStep
|
483
|
+
attr_accessor :source_location, :lambda, :block
|
484
|
+
|
485
|
+
def initialize(lambda, block, source_location)
|
486
|
+
self.lambda = lambda
|
487
|
+
self.block = block
|
488
|
+
self.source_location = source_location
|
489
|
+
|
490
|
+
self.validate!
|
491
|
+
end
|
492
|
+
|
493
|
+
# raises if bad data
|
494
|
+
def validate!
|
495
|
+
unless self.lambda or self.block
|
496
|
+
raise ArgumentError.new("Missing Argument: each_record must take a block/lambda as an argument (#{self.inspect})")
|
497
|
+
end
|
498
|
+
|
499
|
+
[self.lambda, self.block].each do |proc|
|
500
|
+
# allow negative arity, meaning variable/optional, trust em on that.
|
501
|
+
# but for positive arrity, we need 1 or 2 args
|
502
|
+
if proc
|
503
|
+
unless proc.is_a?(Proc)
|
504
|
+
raise NamingError.new("argument to each_record must be a block/lambda, not a #{proc.class} #{self.inspect}")
|
505
|
+
end
|
506
|
+
if (proc.arity == 0 || proc.arity > 2)
|
507
|
+
raise ArityError.new("block/proc given to each_record needs 1 or 2 arguments: #{self.inspect}")
|
508
|
+
end
|
509
|
+
end
|
510
|
+
end
|
511
|
+
end
|
512
|
+
|
513
|
+
# For each_record, always return an empty array as the
|
514
|
+
# accumulator, since it doesn't have those kinds of side effects
|
515
|
+
def execute(context)
|
516
|
+
[@lambda, @block].each do |aProc|
|
517
|
+
next unless aProc
|
518
|
+
|
519
|
+
if aProc.arity == 1
|
520
|
+
aProc.call(context.source_record)
|
521
|
+
else
|
522
|
+
aProc.call(context.source_record, context)
|
523
|
+
end
|
524
|
+
|
525
|
+
end
|
526
|
+
return [] # empty -- no accumulator for each_record
|
527
|
+
end
|
528
|
+
|
529
|
+
# Over-ride inspect for outputting error messages etc.
|
530
|
+
def inspect
|
531
|
+
"(each_record at #{source_location})"
|
532
|
+
end
|
533
|
+
end
|
534
|
+
|
535
|
+
|
536
|
+
# An indexing step definition for a "to_field" step to specific
|
537
|
+
# field.
|
538
|
+
class ToFieldStep
|
539
|
+
attr_accessor :field_name, :lambda, :block, :source_location
|
540
|
+
def initialize(fieldname, lambda, block, source_location)
|
541
|
+
self.field_name = fieldname
|
542
|
+
self.lambda = lambda
|
543
|
+
self.block = block
|
544
|
+
self.source_location = source_location
|
545
|
+
|
546
|
+
validate!
|
547
|
+
end
|
548
|
+
|
549
|
+
def validate!
|
550
|
+
|
551
|
+
if self.field_name.nil? || !self.field_name.is_a?(String) || self.field_name.empty?
|
552
|
+
raise NamingError.new("to_field requires the field name (as a string) as the first argument at #{self.source_location})")
|
553
|
+
end
|
554
|
+
|
555
|
+
[self.lambda, self.block].each do |proc|
|
556
|
+
# allow negative arity, meaning variable/optional, trust em on that.
|
557
|
+
# but for positive arrity, we need 2 or 3 args
|
558
|
+
if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
|
559
|
+
raise ArityError.new("error parsing field '#{self.field_name}': block/proc given to to_field needs 2 or 3 (or variable) arguments: #{proc} (#{self.inspect})")
|
560
|
+
end
|
561
|
+
end
|
562
|
+
end
|
563
|
+
|
564
|
+
# Override inspect for developer debug messages
|
565
|
+
def inspect
|
566
|
+
"(to_field #{self.field_name} at #{self.source_location})"
|
567
|
+
end
|
568
|
+
|
569
|
+
def execute(context)
|
570
|
+
accumulator = []
|
571
|
+
[@lambda, @block].each do |aProc|
|
572
|
+
next unless aProc
|
573
|
+
|
574
|
+
if aProc.arity == 2
|
575
|
+
aProc.call(context.source_record, accumulator)
|
576
|
+
else
|
577
|
+
aProc.call(context.source_record, accumulator, context)
|
578
|
+
end
|
579
|
+
|
580
|
+
end
|
581
|
+
return accumulator
|
582
|
+
end
|
583
|
+
|
584
|
+
end
|
585
|
+
|
586
|
+
# A class representing a block of logic called after
|
587
|
+
# processing, registered with #after_processing
|
588
|
+
class AfterProcessingStep
|
589
|
+
attr_accessor :lambda, :block, :source_location
|
590
|
+
def initialize(lambda, block, source_location)
|
591
|
+
self.lambda = lambda
|
592
|
+
self.block = block
|
593
|
+
self.source_location = source_location
|
594
|
+
end
|
595
|
+
|
596
|
+
# after_processing steps get no args yielded to
|
597
|
+
# their blocks, they just are what they are.
|
598
|
+
def execute
|
599
|
+
[lambda, block].each do |aProc|
|
600
|
+
next unless aProc
|
601
|
+
aProc.call
|
602
|
+
end
|
603
|
+
end
|
604
|
+
|
605
|
+
def inspect
|
606
|
+
"(after_processing at #{self.source_location}"
|
607
|
+
end
|
608
|
+
end
|
609
|
+
|
610
|
+
|
611
|
+
|
612
|
+
|
613
|
+
end
|