traject 2.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +27 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +461 -0
- data/Rakefile +21 -0
- data/bench/bench.rb +30 -0
- data/bin/traject +16 -0
- data/doc/batch_execution.md +243 -0
- data/doc/extending.md +190 -0
- data/doc/indexing_rules.md +265 -0
- data/doc/other_commands.md +47 -0
- data/doc/settings.md +101 -0
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject.rb +11 -0
- data/lib/traject/command_line.rb +301 -0
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/debug_writer.rb +47 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +613 -0
- data/lib/traject/indexer/settings.rb +110 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/line_writer.rb +63 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +223 -0
- data/lib/traject/macros/marc21_semantics.rb +584 -0
- data/lib/traject/macros/marc_format_classifier.rb +197 -0
- data/lib/traject/marc_extractor.rb +410 -0
- data/lib/traject/marc_reader.rb +89 -0
- data/lib/traject/mock_reader.rb +97 -0
- data/lib/traject/ndj_reader.rb +40 -0
- data/lib/traject/null_writer.rb +22 -0
- data/lib/traject/qualified_const_get.rb +40 -0
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +161 -0
- data/lib/traject/translation_map.rb +267 -0
- data/lib/traject/util.rb +52 -0
- data/lib/traject/version.rb +3 -0
- data/lib/traject/yaml_writer.rb +9 -0
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/each_record_test.rb +59 -0
- data/test/indexer/macros_marc21_semantics_test.rb +391 -0
- data/test/indexer/macros_marc21_test.rb +190 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +209 -0
- data/test/indexer/read_write_test.rb +101 -0
- data/test/indexer/settings_test.rb +152 -0
- data/test/indexer/to_field_test.rb +77 -0
- data/test/marc_extractor_test.rb +412 -0
- data/test/marc_format_classifier_test.rb +98 -0
- data/test/marc_reader_test.rb +110 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +90 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/bad_utf_byte.utf8.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +155 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/escaped_character_reference.marc8.marc +1 -0
- data/test/test_support/george_eliot.marc +1 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/nature.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/test_data.utf8.json +30 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/test_support/test_data.utf8.mrc.gz +0 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +225 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +47 -0
- metadata +382 -0
|
@@ -0,0 +1,613 @@
|
|
|
1
|
+
require 'yell'
|
|
2
|
+
|
|
3
|
+
require 'traject'
|
|
4
|
+
require 'traject/qualified_const_get'
|
|
5
|
+
require 'traject/thread_pool'
|
|
6
|
+
|
|
7
|
+
require 'traject/indexer/settings'
|
|
8
|
+
require 'traject/marc_reader'
|
|
9
|
+
require 'traject/json_writer'
|
|
10
|
+
require 'traject/solr_json_writer'
|
|
11
|
+
|
|
12
|
+
require 'traject/macros/marc21'
|
|
13
|
+
require 'traject/macros/basic'
|
|
14
|
+
|
|
15
|
+
if defined? JRUBY_VERSION
|
|
16
|
+
require 'traject/marc4j_reader'
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
# This class does indexing for traject: Getting input records from a Reader
|
|
20
|
+
# class, mapping the input records to an output hash, and then sending the output
|
|
21
|
+
# hash off somewhere (usually Solr) with a Writer class.
|
|
22
|
+
#
|
|
23
|
+
# Traject config files are `instance_eval`d in an Indexer object, so `self` in
|
|
24
|
+
# a config file is an Indexer, and any Indexer methods can be called.
|
|
25
|
+
#
|
|
26
|
+
# However, certain Indexer methods exist almost entirely for the purpose of
|
|
27
|
+
# being called in config files; these methods are part of the expected
|
|
28
|
+
# Domain-Specific Language ("DSL") for config files, and will ordinarily
|
|
29
|
+
# form the bulk or entirety of config files:
|
|
30
|
+
#
|
|
31
|
+
# * #settings
|
|
32
|
+
# * #to_field
|
|
33
|
+
# * #each_record
|
|
34
|
+
# * #after_processing
|
|
35
|
+
# * #logger (rarely used in config files, but in some cases to set up custom logging config)
|
|
36
|
+
#
|
|
37
|
+
# If accessing a Traject::Indexer programmatically (instead of via command line with
|
|
38
|
+
# config files), additional methods of note include:
|
|
39
|
+
#
|
|
40
|
+
# # to process a stream of input records from configured Reader,
|
|
41
|
+
# # to configured Writer:
|
|
42
|
+
# indexer.process(io_stream)
|
|
43
|
+
#
|
|
44
|
+
# # To map a single input record manually to an output_hash,
|
|
45
|
+
# # ignoring Readers and Writers
|
|
46
|
+
# hash = indexer.map_record(record)
|
|
47
|
+
#
|
|
48
|
+
#
|
|
49
|
+
# ## Readers and Writers
|
|
50
|
+
#
|
|
51
|
+
# The Indexer has a modularized architecture for readers and writers, for where
|
|
52
|
+
# source records come from (reader), and where output is sent to (writer).
|
|
53
|
+
#
|
|
54
|
+
# A Reader is any class that:
|
|
55
|
+
# 1) Has a two-argument initializer taking an IO stream and a Settings hash
|
|
56
|
+
# 2) Responds to the usual ruby #each, returning a source record from each #each.
|
|
57
|
+
# (Including Enumerable is prob a good idea too)
|
|
58
|
+
#
|
|
59
|
+
# The default reader is the Traject::MarcReader, whose behavior is
|
|
60
|
+
# further customized by several settings in the Settings hash. Jruby users
|
|
61
|
+
# with specialized needs may want to look at the gem traject-marc4j_reader.
|
|
62
|
+
#
|
|
63
|
+
# Alternate readers can be set directly with the #reader_class= method, or
|
|
64
|
+
# with the "reader_class_name" Setting, a String name of a class
|
|
65
|
+
# meeting the reader contract.
|
|
66
|
+
#
|
|
67
|
+
#
|
|
68
|
+
# A Writer is any class that:
|
|
69
|
+
# 1) Has a one-argument initializer taking a Settings hash. (The logger
|
|
70
|
+
# is provided to the Writer in settings["logger"])
|
|
71
|
+
# 2) Responds to a one argument #put method, where the argument is
|
|
72
|
+
# a Traject::Indexer::Context, containing an #output_hash
|
|
73
|
+
# hash of mapped keys/values. The writer should write them
|
|
74
|
+
# to the appropriate place.
|
|
75
|
+
# 3) Responds to a #close method, called when we're done.
|
|
76
|
+
# 4) Optionally implements a #skipped_record_count method, returning int count of records
|
|
77
|
+
# that were skipped due to errors (and presumably logged)
|
|
78
|
+
#
|
|
79
|
+
# Traject packages one solr writer: traject/solr_json_writer, which sends
|
|
80
|
+
# in json format and works under both ruby and jruby, but only with solr version
|
|
81
|
+
# >= 3.2. To index to an older solr installation, you'll need to use jruby and
|
|
82
|
+
# install the gem traject-solrj_writer, which uses the solrj .jar underneath.
|
|
83
|
+
#
|
|
84
|
+
# You can set alternate writers by setting a Class object directly
|
|
85
|
+
# with the #writer_class method, or by the 'writer_class_name' Setting,
|
|
86
|
+
# with a String name of class meeting the Writer contract. There are several
|
|
87
|
+
# that ship with traject itself:
|
|
88
|
+
#
|
|
89
|
+
# * traject/json_writer (Traject::JsonWriter) -- write newline-delimied json files.
|
|
90
|
+
# * traject/yaml_writer (Traject::YamlWriter) -- write pretty yaml file; very human-readable
|
|
91
|
+
# * traject/debug_writer (Traject::DebugWriter) -- write a tab-delimited file where
|
|
92
|
+
# each line consists of the id, field, and value(s).
|
|
93
|
+
# * traject/delimited_writer and traject/csv_writer -- write character-delimited files
|
|
94
|
+
# (default is tab-delimited) or comma-separated-value files.
|
|
95
|
+
#
|
|
96
|
+
class Traject::Indexer
|
|
97
|
+
|
|
98
|
+
# Arity error on a passed block: raised when a lambda/block
# registered for an indexing step takes an unsupported number
# of arguments (see EachRecordStep#validate! / ToFieldStep#validate!).
class ArityError < ArgumentError; end
# Raised when a required name is missing or of the wrong type,
# e.g. a to_field field name that is not a non-empty String.
class NamingError < ArgumentError; end
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
include Traject::QualifiedConstGet
|
|
105
|
+
|
|
106
|
+
attr_writer :reader_class, :writer_class
|
|
107
|
+
|
|
108
|
+
# For now we hard-code these basic macros as included.
# TODO, make these added with extend per-indexer,
# added by default but easily turned off (or have other
# default macro modules provided)
include Traject::Macros::Marc21
include Traject::Macros::Basic
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# Takes an optional hash or Traject::Indexer::Settings object of settings.
# Indexing and after-processing steps start out empty and are
# registered later via #to_field / #each_record / #after_processing.
def initialize(arg_settings = {})
  @settings               = Settings.new(arg_settings)
  @index_steps            = []
  @after_processing_steps = []
end
|
|
122
|
+
|
|
123
|
+
# Part of the config file DSL, for writing settings values.
#
# The Indexer's settings consist of a hash-like Traject::Settings
# object. The settings hash is *not* nested hashes, just one level
# of configuration settings. Keys are always strings, and by convention
# use "." for namespacing, eg `log.file`
#
# The settings method with no arguments returns that Settings object.
#
# With a hash and/or block argument, can be used to set
# new key/values. Each call merges onto the existing settings
# hash. The block is `instance_eval`d in the context
# of the Traject::Settings object.
#
#     indexer.settings("a" => "a", "b" => "b")
#
#     indexer.settings do
#       provide "b", "new b"
#     end
#
#     indexer.settings #=> {"a" => "a", "b" => "new b"}
#
# Note the #provide method is defined on Traject::Settings to
# write to a setting only if previously not set. You can also
# use #store to force over-writing even if an existing setting.
#
# Even with arguments, Indexer#settings returns the Settings object,
# so method calls can be chained.
#
def settings(new_settings = nil, &block)
  @settings.merge!(new_settings) if new_settings

  # Parenthesized &block: the bare `instance_eval &block` form
  # triggers Ruby's "ambiguous argument" warning under -w.
  @settings.instance_eval(&block) if block

  return @settings
end
|
|
159
|
+
|
|
160
|
+
# Part of DSL, used to define an indexing mapping. Register logic
# to be called for each record, and generate values for a particular
# output field. Records the caller's file:line for debugging output.
def to_field(field_name, aLambda = nil, &block)
  location = Traject::Util.extract_caller_location(caller.first)
  @index_steps << ToFieldStep.new(field_name, aLambda, block, location)
end
|
|
166
|
+
|
|
167
|
+
# Part of DSL, register logic to be called for each record.
# Records the caller's file:line for debugging output.
def each_record(aLambda = nil, &block)
  location = Traject::Util.extract_caller_location(caller.first)
  @index_steps << EachRecordStep.new(aLambda, block, location)
end
|
|
171
|
+
|
|
172
|
+
# Part of DSL, register logic to be called once at the end
# of processing a stream of records.
def after_processing(aLambda = nil, &block)
  location = Traject::Util.extract_caller_location(caller.first)
  @after_processing_steps << AfterProcessingStep.new(aLambda, block, location)
end
|
|
177
|
+
|
|
178
|
+
# The logger for this indexer, lazily built from settings on
# first access (see #create_logger). May also be assigned
# directly via the attr_writer below.
def logger
  @logger ||= create_logger
end
attr_writer :logger
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# Resolve the log line format from the "log.format" setting.
# Defaults to "%d %5L %m". The literal string "false" maps to
# `false` (no formatting) and "" maps to nil; anything else is
# returned as-is.
def logger_format
  raw = settings["log.format"] || "%d %5L %m"
  case raw
  when "false" then false
  when ""      then nil
  else raw
  end
end
|
|
192
|
+
|
|
193
|
+
# Create logger according to settings.
#
# Builds a Yell::Logger with level from "log.level" (default "info")
# and format from #logger_format. Destination comes from "log.file":
# "STDERR" (default), "STDOUT", or a file path. If "log.error_file"
# is set, errors and above are ADDITIONALLY written there.
def create_logger

  logger_level  = settings["log.level"] || "info"

  # log everything to STDERR or specified logfile
  logger = Yell::Logger.new(:null)
  logger.format = logger_format
  logger.level  = logger_level

  logger_destination = settings["log.file"] || "STDERR"
  # We intentionally repeat the logger_level
  # on the adapter, so it will stay there if overall level
  # is changed.
  case logger_destination
  when "STDERR"
    logger.adapter :stderr, level: logger_level, format: logger_format
  when "STDOUT"
    logger.adapter :stdout, level: logger_level, format: logger_format
  else
    logger.adapter :file, logger_destination, level: logger_level, format: logger_format
  end

  # ADDITIONALLY log error and higher to....
  # ('gte.error' is Yell's "greater than or equal to error" level spec)
  if settings["log.error_file"]
    logger.adapter :file, settings["log.error_file"], :level => 'gte.error'
  end

  return logger
end
|
|
224
|
+
|
|
225
|
+
# Processes a single record according to indexing rules set up in
# this indexer. Returns the output hash (a hash whose keys are
# string fields, and values are arrays of one or more values in that field)
#
# This is a convenience shortcut for #map_to_context! -- use that one
# if you want to provide additional context like position, and/or get
# back the full context.
def map_record(record)
  context = Context.new(:source_record => record, :settings => settings)
  map_to_context!(context)
  context.output_hash
end
|
|
237
|
+
|
|
238
|
+
# Maps a single record INTO the second argument, a Traject::Indexer::Context.
#
# Context must be passed with a #source_record and #settings, and optionally
# a #position.
#
# Context will be mutated by this method, most significantly by adding
# an #output_hash, a hash from fieldname to array of values in that field.
#
# Pass in a context with a set #position if you want that to be available
# to mapping routines.
#
# Returns the context passed in as second arg, as a convenience for chaining etc.

def map_to_context!(context)
  @index_steps.each do |index_step|
    # Don't bother if we're skipping this record
    break if context.skip?

    # Expose the current step on the context so step logic (and
    # error logging) can see where it is; cleared again below.
    context.index_step = index_step
    accumulator = log_mapping_errors(context, index_step) do
      index_step.execute(context) # will always return [] for an each_record step
    end

    # Only touch output_hash when the step actually produced values;
    # nils are dropped, and values are appended to any existing
    # values for the same field from earlier steps.
    if accumulator.size > 0
      accumulator.compact!
      (context.output_hash[index_step.field_name] ||= []).concat accumulator
    end

    context.index_step = nil
  end

  return context
end
|
|
271
|
+
|
|
272
|
+
# just a wrapper that captures and records any unexpected
# errors raised in mapping, along with contextual information
# on record and location in source file of mapping rule.
#
# Re-raises error at the moment.
#
#     log_mapping_errors(context, index_step) do
#       all_sorts_of_stuff # that will have errors logged
#     end
def log_mapping_errors(context, index_step)
  begin
    yield
  # NOTE: rescuing Exception (not just StandardError) is deliberate
  # here -- we want even low-level failures logged with record
  # context before the unconditional re-raise below.
  rescue Exception => e
    msg = "Unexpected error on record id `#{id_string(context.source_record)}` at file position #{context.position}\n"
    msg += " while executing #{index_step.inspect}\n"
    msg += Traject::Util.exception_to_log_message(e)

    logger.error msg
    begin
      logger.debug "Record: " + context.source_record.to_s
    # #to_s on a malformed record can itself raise; don't let that
    # mask the original mapping error.
    rescue Exception => marc_to_s_exception
      logger.debug "(Could not log record, #{marc_to_s_exception})"
    end

    raise e
  end
end
|
|
299
|
+
|
|
300
|
+
# get a printable id from record for error logging.
# Maybe override this for a future XML version.
#
# Returns nil when the record is nil or has no '001' field;
# otherwise the string value of the record's '001' field.
def id_string(record)
  id_field = record && record['001']
  id_field && id_field.value.to_s
end
|
|
305
|
+
|
|
306
|
+
# Processes a stream of records, reading from the configured Reader,
# mapping according to configured mapping rules, and then writing
# to configured Writer.
#
# returns 'false' as a signal to command line to return non-zero exit code
# for some reason (reason found in logs, presumably). This particular mechanism
# is open to complexification, starting simple. We do need SOME way to return
# non-zero to command line.
#
# Mapping/writing of each record is dispatched to a thread pool sized
# by the "processing_thread_pool" setting; exceptions collected in the
# pool are re-raised on the main thread at checkpoints below.
def process(io_stream)
  settings.fill_in_defaults!

  count = 0
  start_time = batch_start_time = Time.now
  logger.debug "beginning Indexer#process with settings: #{settings.inspect}"

  reader = self.reader!(io_stream)
  writer = self.writer!

  processing_threads = settings["processing_thread_pool"].to_i
  thread_pool = Traject::ThreadPool.new(processing_threads)

  logger.info " Indexer with #{processing_threads} processing threads, reader: #{reader.class.name} and writer: #{writer.class.name}"

  # log.batch_size: if set, emit a progress log line every N records.
  log_batch_size = settings["log.batch_size"] && settings["log.batch_size"].to_i

  reader.each do |record; position|
    count += 1

    # have to use a block local var, so the changing `count` one
    # doesn't get caught in the closure. Weird, yeah.
    position = count

    # surface any exception a worker thread raised since last check
    thread_pool.raise_collected_exception!

    if settings["debug_ascii_progress"].to_s == "true"
      $stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
    end

    if log_batch_size && (count % log_batch_size == 0)
      batch_rps   = log_batch_size / (Time.now - batch_start_time)
      overall_rps = count / (Time.now - start_time)
      logger.send(settings["log.batch_size.severity"].downcase.to_sym, "Traject::Indexer#process, read #{count} records at id:#{id_string(record)}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
      batch_start_time = Time.now
    end

    # we have to use this weird lambda to properly "capture" the count, instead
    # of having it be bound to the original variable in a non-threadsafe way.
    # This is confusing, I might not be understanding things properly, but that's where i am.
    #thread_pool.maybe_in_thread_pool &make_lambda(count, record, writer)
    thread_pool.maybe_in_thread_pool(record, settings, position) do |record, settings, position|
      context = Context.new(:source_record => record, :settings => settings, :position => position)
      context.logger = logger
      map_to_context!(context)
      if context.skip?
        log_skip(context)
      else
        writer.put context
      end
    end

  end
  $stderr.write "\n" if settings["debug_ascii_progress"].to_s == "true"

  logger.debug "Shutting down #processing mapper threadpool..."
  thread_pool.shutdown_and_wait
  logger.debug "#processing mapper threadpool shutdown complete."

  # final check for exceptions raised by worker threads
  thread_pool.raise_collected_exception!

  writer.close if writer.respond_to?(:close)

  # run registered after_processing steps; any failure is fatal
  # and re-raised after logging.
  @after_processing_steps.each do |step|
    begin
      step.execute
    rescue Exception => e
      logger.fatal("Unexpected exception #{e} when executing #{step}")
      raise e
    end
  end

  elapsed = Time.now - start_time
  avg_rps = (count / elapsed)
  logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."

  # A writer that tracks skipped records forces a 'false' (non-zero
  # exit code) result if anything was skipped.
  if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
    logger.error "Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
    return false
  end

  return true
end
|
|
401
|
+
|
|
402
|
+
# Log that the current record is being skipped, using
# data in context.position and context.skipmessage.
# Called from #process when mapping marked a record skipped
# (see Context#skip!).
def log_skip(context)
  logger.debug "Skipped record #{context.position}: #{context.skipmessage}"
end
|
|
407
|
+
|
|
408
|
+
# Memoized Reader class, resolved from the "reader_class_name"
# setting on first access. `defined?` (rather than ||=) is used so
# a nil resolution is also cached.
def reader_class
  @reader_class = qualified_const_get(settings["reader_class_name"]) unless defined?(@reader_class)
  @reader_class
end
|
|
414
|
+
|
|
415
|
+
# Memoized Writer class, resolved from the "writer_class_name"
# setting on first access. `defined?` (rather than ||=) is used so
# a nil resolution is also cached.
def writer_class
  @writer_class = qualified_const_get(settings["writer_class_name"]) unless defined?(@writer_class)
  @writer_class
end
|
|
421
|
+
|
|
422
|
+
# Instantiate a Traject Reader, using class set in #reader_class,
# initialized with io_stream passed in. Settings (with "logger"
# merged in) are passed to the reader's constructor.
def reader!(io_stream)
  reader_settings = settings.merge("logger" => logger)
  reader_class.new(io_stream, reader_settings)
end
|
|
427
|
+
|
|
428
|
+
# Instantiate a Traject Writer, using class set in #writer_class.
# Settings (with "logger" merged in) are passed to the writer's
# constructor.
def writer!
  writer_settings = settings.merge("logger" => logger)
  writer_class.new(writer_settings)
end
|
|
432
|
+
|
|
433
|
+
# Represents the context of a specific record being indexed, passed
# to indexing logic blocks
#
class Context
  def initialize(hash_init = {})
    # TODO, argument checking for required args?
    @clipboard   = {}
    @output_hash = {}
    @skip        = false

    # Each init key is assigned through its writer, so callers can
    # seed any attr_accessor (eg :source_record, :settings, :position).
    hash_init.each_pair do |key, value|
      send("#{key}=", value)
    end
  end

  # clipboard: scratch space shared between steps for one record.
  # output_hash: fieldname => array-of-values built up by mapping.
  attr_accessor :clipboard, :output_hash, :logger
  attr_accessor :index_step, :source_record, :settings
  # 1-based position in stream of processed records.
  attr_accessor :position

  # Message recorded when this record was marked to be skipped.
  attr_accessor :skipmessage

  # Set the fact that this record should be skipped, with an
  # optional message
  def skip!(msg = '(no message given)')
    @skipmessage = msg
    @skip        = true
  end

  # Should we skip this record?
  def skip?
    @skip
  end

end
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
# An indexing step definition, including its source location
# for logging
#
# This one represents an "each_record" step; see ToFieldStep below
# for "to_field"
#
# source_location is just a string with filename and line number for
# showing to devs in debugging.
class EachRecordStep
  attr_accessor :source_location, :lambda, :block

  def initialize(lambda, block, source_location)
    @lambda          = lambda
    @block           = block
    @source_location = source_location

    validate!
  end

  # raises if bad data: we need at least one Proc, and any Proc
  # supplied must take 1 or 2 arguments (negative arity, meaning
  # variable/optional, is trusted).
  def validate!
    if @lambda.nil? && @block.nil?
      raise ArgumentError.new("Missing Argument: each_record must take a block/lambda as an argument (#{self.inspect})")
    end

    [@lambda, @block].compact.each do |callable|
      unless callable.is_a?(Proc)
        raise NamingError.new("argument to each_record must be a block/lambda, not a #{callable.class} #{self.inspect}")
      end
      if callable.arity == 0 || callable.arity > 2
        raise ArityError.new("block/proc given to each_record needs 1 or 2 arguments: #{self.inspect}")
      end
    end
  end

  # Calls lambda then block (each if present) with the source record
  # (and the context too, for arity != 1). Always returns an empty
  # array as the accumulator, since each_record has no output field.
  def execute(context)
    [@lambda, @block].each do |callable|
      next if callable.nil?

      if callable.arity == 1
        callable.call(context.source_record)
      else
        callable.call(context.source_record, context)
      end
    end
    []
  end

  # Over-ride inspect for outputting error messages etc.
  def inspect
    "(each_record at #{source_location})"
  end
end
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
# An indexing step definition for a "to_field" step to a specific
# field: runs the registered lambda and/or block against a record to
# accumulate values for that output field.
class ToFieldStep
  attr_accessor :field_name, :lambda, :block, :source_location

  def initialize(fieldname, lambda, block, source_location)
    @field_name      = fieldname
    @lambda          = lambda
    @block           = block
    @source_location = source_location

    validate!
  end

  # Raises NamingError when the field name is not a non-empty String;
  # raises ArityError when a supplied Proc's positive arity is not
  # 2 or 3 (negative arity, meaning variable/optional, is trusted).
  def validate!
    bad_name = @field_name.nil? || !@field_name.is_a?(String) || @field_name.empty?
    if bad_name
      raise NamingError.new("to_field requires the field name (as a string) as the first argument at #{self.source_location})")
    end

    [@lambda, @block].compact.each do |callable|
      next unless callable.arity == 0 || callable.arity == 1 || callable.arity > 3
      raise ArityError.new("error parsing field '#{@field_name}': block/proc given to to_field needs 2 or 3 (or variable) arguments: #{callable} (#{self.inspect})")
    end
  end

  # Override inspect for developer debug messages
  def inspect
    "(to_field #{@field_name} at #{@source_location})"
  end

  # Calls lambda then block (each if present) with the source record
  # and a fresh accumulator array (plus the context, for arity != 2);
  # returns the accumulated values.
  def execute(context)
    accumulator = []
    [@lambda, @block].each do |callable|
      next if callable.nil?

      if callable.arity == 2
        callable.call(context.source_record, accumulator)
      else
        callable.call(context.source_record, accumulator, context)
      end
    end
    accumulator
  end

end
|
|
585
|
+
|
|
586
|
+
# A class representing a block of logic called after
# processing, registered with #after_processing
class AfterProcessingStep
  attr_accessor :lambda, :block, :source_location

  def initialize(lambda, block, source_location)
    self.lambda          = lambda
    self.block           = block
    self.source_location = source_location
  end

  # after_processing steps get no args yielded to
  # their blocks, they just are what they are.
  def execute
    [lambda, block].each do |aProc|
      next unless aProc
      aProc.call
    end
  end

  # Developer-facing description used in error messages.
  # FIX: the original string was missing its closing paren,
  # unlike the matching EachRecordStep/ToFieldStep #inspect.
  def inspect
    "(after_processing at #{self.source_location})"
  end
end
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
|
|
613
|
+
end
|