traject 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +27 -0
  4. data/.yardopts +3 -0
  5. data/Gemfile +12 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +461 -0
  8. data/Rakefile +21 -0
  9. data/bench/bench.rb +30 -0
  10. data/bin/traject +16 -0
  11. data/doc/batch_execution.md +243 -0
  12. data/doc/extending.md +190 -0
  13. data/doc/indexing_rules.md +265 -0
  14. data/doc/other_commands.md +47 -0
  15. data/doc/settings.md +101 -0
  16. data/lib/tasks/load_maps.rake +48 -0
  17. data/lib/traject.rb +11 -0
  18. data/lib/traject/command_line.rb +301 -0
  19. data/lib/traject/csv_writer.rb +34 -0
  20. data/lib/traject/debug_writer.rb +47 -0
  21. data/lib/traject/delimited_writer.rb +110 -0
  22. data/lib/traject/indexer.rb +613 -0
  23. data/lib/traject/indexer/settings.rb +110 -0
  24. data/lib/traject/json_writer.rb +51 -0
  25. data/lib/traject/line_writer.rb +63 -0
  26. data/lib/traject/macros/basic.rb +9 -0
  27. data/lib/traject/macros/marc21.rb +223 -0
  28. data/lib/traject/macros/marc21_semantics.rb +584 -0
  29. data/lib/traject/macros/marc_format_classifier.rb +197 -0
  30. data/lib/traject/marc_extractor.rb +410 -0
  31. data/lib/traject/marc_reader.rb +89 -0
  32. data/lib/traject/mock_reader.rb +97 -0
  33. data/lib/traject/ndj_reader.rb +40 -0
  34. data/lib/traject/null_writer.rb +22 -0
  35. data/lib/traject/qualified_const_get.rb +40 -0
  36. data/lib/traject/solr_json_writer.rb +277 -0
  37. data/lib/traject/thread_pool.rb +161 -0
  38. data/lib/traject/translation_map.rb +267 -0
  39. data/lib/traject/util.rb +52 -0
  40. data/lib/traject/version.rb +3 -0
  41. data/lib/traject/yaml_writer.rb +9 -0
  42. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  43. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  44. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  45. data/lib/translation_maps/marc_geographic.yaml +589 -0
  46. data/lib/translation_maps/marc_instruments.yaml +102 -0
  47. data/lib/translation_maps/marc_languages.yaml +490 -0
  48. data/test/debug_writer_test.rb +38 -0
  49. data/test/delimited_writer_test.rb +104 -0
  50. data/test/indexer/each_record_test.rb +59 -0
  51. data/test/indexer/macros_marc21_semantics_test.rb +391 -0
  52. data/test/indexer/macros_marc21_test.rb +190 -0
  53. data/test/indexer/macros_test.rb +40 -0
  54. data/test/indexer/map_record_test.rb +209 -0
  55. data/test/indexer/read_write_test.rb +101 -0
  56. data/test/indexer/settings_test.rb +152 -0
  57. data/test/indexer/to_field_test.rb +77 -0
  58. data/test/marc_extractor_test.rb +412 -0
  59. data/test/marc_format_classifier_test.rb +98 -0
  60. data/test/marc_reader_test.rb +110 -0
  61. data/test/solr_json_writer_test.rb +248 -0
  62. data/test/test_helper.rb +90 -0
  63. data/test/test_support/245_no_ab.marc +1 -0
  64. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  65. data/test/test_support/bad_subfield_code.marc +1 -0
  66. data/test/test_support/bad_utf_byte.utf8.marc +1 -0
  67. data/test/test_support/date_resort_to_260.marc +1 -0
  68. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  69. data/test/test_support/date_with_u.marc +1 -0
  70. data/test/test_support/demo_config.rb +155 -0
  71. data/test/test_support/emptyish_record.marc +1 -0
  72. data/test/test_support/escaped_character_reference.marc8.marc +1 -0
  73. data/test/test_support/george_eliot.marc +1 -0
  74. data/test/test_support/hebrew880s.marc +1 -0
  75. data/test/test_support/louis_armstrong.marc +1 -0
  76. data/test/test_support/manufacturing_consent.marc +1 -0
  77. data/test/test_support/manuscript_online_thesis.marc +1 -0
  78. data/test/test_support/microform_online_conference.marc +1 -0
  79. data/test/test_support/multi_era.marc +1 -0
  80. data/test/test_support/multi_geo.marc +1 -0
  81. data/test/test_support/musical_cage.marc +1 -0
  82. data/test/test_support/nature.marc +1 -0
  83. data/test/test_support/one-marc8.mrc +1 -0
  84. data/test/test_support/online_only.marc +1 -0
  85. data/test/test_support/packed_041a_lang.marc +1 -0
  86. data/test/test_support/test_data.utf8.json +30 -0
  87. data/test/test_support/test_data.utf8.marc.xml +2609 -0
  88. data/test/test_support/test_data.utf8.mrc +1 -0
  89. data/test/test_support/test_data.utf8.mrc.gz +0 -0
  90. data/test/test_support/the_business_ren.marc +1 -0
  91. data/test/translation_map_test.rb +225 -0
  92. data/test/translation_maps/bad_ruby.rb +8 -0
  93. data/test/translation_maps/bad_yaml.yaml +1 -0
  94. data/test/translation_maps/both_map.rb +1 -0
  95. data/test/translation_maps/both_map.yaml +1 -0
  96. data/test/translation_maps/default_literal.rb +10 -0
  97. data/test/translation_maps/default_passthrough.rb +10 -0
  98. data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
  99. data/test/translation_maps/properties_map.properties +5 -0
  100. data/test/translation_maps/ruby_map.rb +10 -0
  101. data/test/translation_maps/translate_array_test.yaml +8 -0
  102. data/test/translation_maps/yaml_map.yaml +7 -0
  103. data/traject.gemspec +47 -0
  104. metadata +382 -0
@@ -0,0 +1,613 @@
1
+ require 'yell'
2
+
3
+ require 'traject'
4
+ require 'traject/qualified_const_get'
5
+ require 'traject/thread_pool'
6
+
7
+ require 'traject/indexer/settings'
8
+ require 'traject/marc_reader'
9
+ require 'traject/json_writer'
10
+ require 'traject/solr_json_writer'
11
+
12
+ require 'traject/macros/marc21'
13
+ require 'traject/macros/basic'
14
+
15
+ if defined? JRUBY_VERSION
16
+ require 'traject/marc4j_reader'
17
+ end
18
+
19
+ # This class does indexing for traject: Getting input records from a Reader
20
+ # class, mapping the input records to an output hash, and then sending the output
21
+ # hash off somewhere (usually Solr) with a Writer class.
22
+ #
23
+ # Traject config files are `instance_eval`d in an Indexer object, so `self` in
24
+ # a config file is an Indexer, and any Indexer methods can be called.
25
+ #
26
+ # However, certain Indexer methods exist almost entirely for the purpose of
27
+ # being called in config files; these methods are part of the expected
28
+ # Domain-Specific Language ("DSL") for config files, and will ordinarily
29
+ # form the bulk or entirety of config files:
30
+ #
31
+ # * #settings
32
+ # * #to_field
33
+ # * #each_record
34
+ # * #after_processing
35
+ # * #logger (rarely used in config files, but in some cases to set up custom logging config)
36
+ #
37
+ # If accessing a Traject::Indexer programmatically (instead of via command line with
38
+ # config files), additional methods of note include:
39
+ #
40
+ # # to process a stream of input records from configured Reader,
41
+ # # to configured Writer:
42
+ # indexer.process(io_stream)
43
+ #
44
+ # # To map a single input record manually to an output_hash,
45
+ # # ignoring Readers and Writers
46
+ # hash = indexer.map_record(record)
47
+ #
48
+ #
49
+ # ## Readers and Writers
50
+ #
51
+ # The Indexer has a modularized architecture for readers and writers, for where
52
+ # source records come from (reader), and where output is sent to (writer).
53
+ #
54
+ # A Reader is any class that:
55
+ # 1) Has a two-argument initializer taking an IO stream and a Settings hash
56
+ # 2) Responds to the usual ruby #each, returning a source record from each #each.
57
+ # (Including Enumerable is prob a good idea too)
58
+ #
59
+ # The default reader is the Traject::MarcReader, whose behavior is
60
+ # further customized by several settings in the Settings hash. Jruby users
61
+ # with specialized needs may want to look at the gem traject-marc4j_reader.
62
+ #
63
+ # Alternate readers can be set directly with the #reader_class= method, or
64
+ # with the "reader_class_name" Setting, a String name of a class
65
+ # meeting the reader contract.
66
+ #
67
+ #
68
+ # A Writer is any class that:
69
+ # 1) Has a one-argument initializer taking a Settings hash. (The logger
70
+ # is provided to the Writer in settings["logger"])
71
+ # 2) Responds to a one argument #put method, where the argument is
72
+ # a Traject::Indexer::Context, containing an #output_hash
73
+ # hash of mapped keys/values. The writer should write them
74
+ # to the appropriate place.
75
+ # 3) Responds to a #close method, called when we're done.
76
+ # 4) Optionally implements a #skipped_record_count method, returning int count of records
77
+ # that were skipped due to errors (and presumably logged)
78
+ #
79
+ # Traject packages one solr writer: traject/solr_json_writer, which sends
80
+ # in json format and works under both ruby and jruby, but only with solr version
81
+ # >= 3.2. To index to an older solr installation, you'll need to use jruby and
82
+ # install the gem traject-solrj_writer, which uses the solrj .jar underneath.
83
+ #
84
+ # You can set alternate writers by setting a Class object directly
85
+ # with the #writer_class method, or by the 'writer_class_name' Setting,
86
+ # with a String name of class meeting the Writer contract. There are several
87
+ # that ship with traject itself:
88
+ #
89
+ # * traject/json_writer (Traject::JsonWriter) -- write newline-delimited json files.
90
+ # * traject/yaml_writer (Traject::YamlWriter) -- write pretty yaml file; very human-readable
91
+ # * traject/debug_writer (Traject::DebugWriter) -- write a tab-delimited file where
92
+ # each line consists of the id, field, and value(s).
93
+ # * traject/delimited_writer and traject/csv_writer -- write character-delimited files
94
+ # (default is tab-delimited) or comma-separated-value files.
95
+ #
96
class Traject::Indexer

  # Raised when a block/lambda registered for an indexing step takes an
  # unsupported number of arguments (see EachRecordStep/ToFieldStep#validate!).
  class ArityError < ArgumentError; end
  # Raised when a step is registered with a bad name (e.g. a non-String
  # field name for #to_field) or a non-proc argument.
  class NamingError < ArgumentError; end

  include Traject::QualifiedConstGet

  # Reader/writer classes may be injected directly, overriding lookup
  # from the "reader_class_name"/"writer_class_name" settings.
  attr_writer :reader_class, :writer_class

  # For now we hard-code these basic macros included.
  # TODO, make these added with extend per-indexer,
  # added by default but easily turned off (or have other
  # default macro modules provided)
  include Traject::Macros::Marc21
  include Traject::Macros::Basic
116
# optional hash or Traject::Indexer::Settings object of settings.
def initialize(arg_settings = {})
  @index_steps            = []
  @after_processing_steps = []
  @settings               = Settings.new(arg_settings)
end
122
+
123
# Part of the config file DSL, for writing settings values.
#
# The Indexer's settings consist of a hash-like Traject::Settings
# object. The settings hash is *not* nested hashes, just one level
# of configuration settings. Keys are always strings, and by convention
# use "." for namespacing, eg `log.file`
#
# The settings method with no arguments returns that Settings object.
#
# With a hash and/or block argument, can be used to set
# new key/values. Each call merges onto the existing settings
# hash. The block is `instance_eval`d in the context
# of the Traject::Settings object.
#
#     indexer.settings("a" => "a", "b" => "b")
#
#     indexer.settings do
#       provide "b", "new b"
#     end
#
#     indexer.settings #=> {"a" => "a", "b" => "new b"}
#
# Note the #provide method is defined on Traject::Settings to
# write to a setting only if previously not set. You can also
# use #store to force over-writing even if an existing setting.
#
# Even with arguments, Indexer#settings returns the Settings object,
# so method calls can be chained.
def settings(new_settings = nil, &block)
  @settings.merge!(new_settings) if new_settings

  # Parenthesize the block-pass: a bare `instance_eval &block` triggers
  # Ruby's "`&' interpreted as argument prefix" warning under -w.
  @settings.instance_eval(&block) if block

  return @settings
end
159
+
160
# Part of DSL, used to define an indexing mapping. Register logic
# to be called for each record, and generate values for a particular
# output field.
def to_field(field_name, aLambda = nil, &block)
  source_location = Traject::Util.extract_caller_location(caller.first)
  @index_steps << ToFieldStep.new(field_name, aLambda, block, source_location)
end
166
+
167
# Part of DSL, register logic to be called for each record
def each_record(aLambda = nil, &block)
  source_location = Traject::Util.extract_caller_location(caller.first)
  @index_steps << EachRecordStep.new(aLambda, block, source_location)
end
171
+
172
# Part of DSL, register logic to be called once at the end
# of processing a stream of records.
def after_processing(aLambda = nil, &block)
  source_location = Traject::Util.extract_caller_location(caller.first)
  @after_processing_steps << AfterProcessingStep.new(aLambda, block, source_location)
end
177
+
178
# The logger in use, lazily built from settings on first access
# via #create_logger.
def logger
  @logger ||= create_logger
end
# Allow a fully-configured logger to be injected directly, bypassing
# settings-based construction.
attr_writer :logger
182
+
183
+
184
# Resolve the log format from the "log.format" setting.
#
# Default is "%d %5L %m". The string "false" maps to `false`
# (no formatting), the empty string maps to nil; any other value
# is passed through unchanged.
def logger_format
  raw_format = settings["log.format"] || "%d %5L %m"
  if raw_format == "false"
    false
  elsif raw_format == ""
    nil
  else
    raw_format
  end
end
192
+
193
# Create logger according to settings
#
# Builds a Yell::Logger configured from the "log.level", "log.format",
# "log.file" and "log.error_file" settings, and returns it. Does not
# memoize; see #logger for that.
def create_logger

  logger_level = settings["log.level"] || "info"

  # Start with a :null adapter; real destinations are attached below.
  logger = Yell::Logger.new(:null)
  logger.format = logger_format
  logger.level = logger_level

  # log everything to STDERR or specified logfile
  logger_destination = settings["log.file"] || "STDERR"
  # We intentionally repeat the logger_level
  # on the adapter, so it will stay there if overall level
  # is changed.
  case logger_destination
  when "STDERR"
    logger.adapter :stderr, level: logger_level, format: logger_format
  when "STDOUT"
    logger.adapter :stdout, level: logger_level, format: logger_format
  else
    logger.adapter :file, logger_destination, level: logger_level, format: logger_format
  end


  # ADDITIONALLY log error and higher to....
  if settings["log.error_file"]
    logger.adapter :file, settings["log.error_file"], :level => 'gte.error'
  end

  return logger
end
224
+
225
# Processes a single record according to indexing rules set up in
# this indexer. Returns the output hash (a hash whose keys are
# string fields, and values are arrays of one or more values in that field)
#
# This is a convenience shortcut for #map_to_context! -- use that one
# if you want to provide additional context like position, and/or get
# back the full context.
def map_record(record)
  context = Context.new(:source_record => record, :settings => settings)
  map_to_context!(context)
  context.output_hash
end
237
+
238
# Maps a single record INTO the passed Traject::Indexer::Context.
#
# Context must be passed with a #source_record and #settings, and optionally
# a #position.
#
# Context will be mutated by this method, most significantly by adding
# an #output_hash, a hash from fieldname to array of values in that field.
#
# Pass in a context with a set #position if you want that to be available
# to mapping routines.
#
# Returns the context passed in, as a convenience for chaining etc.

def map_to_context!(context)
  @index_steps.each do |index_step|
    # Don't bother with remaining steps if we're skipping this record
    break if context.skip?

    # Expose the currently-running step to logic blocks via the context.
    context.index_step = index_step
    accumulator = log_mapping_errors(context, index_step) do
      index_step.execute(context) # will always return [] for an each_record step
    end

    # Only touch output_hash when the step produced values; nils are
    # dropped, and values accumulate across steps targeting the same field.
    if accumulator.size > 0
      accumulator.compact!
      (context.output_hash[index_step.field_name] ||= []).concat accumulator
    end

    context.index_step = nil
  end

  return context
end
271
+
272
# Just a wrapper that captures and records any unexpected
# errors raised in mapping, along with contextual information
# on record and location in source file of mapping rule.
#
# Re-raises error at the moment.
#
#     log_mapping_errors(context, index_step) do
#       all_sorts_of_stuff # that will have errors logged
#     end
def log_mapping_errors(context, index_step)
  begin
    yield
  rescue Exception => e
    # NOTE: rescuing Exception (not just StandardError) is deliberate;
    # the error is always re-raised below after logging.
    msg = "Unexpected error on record id `#{id_string(context.source_record)}` at file position #{context.position}\n"
    msg += " while executing #{index_step.inspect}\n"
    msg += Traject::Util.exception_to_log_message(e)

    logger.error msg
    begin
      logger.debug "Record: " + context.source_record.to_s
    rescue Exception => marc_to_s_exception
      # The record's own #to_s can raise (e.g. on bad MARC data);
      # don't let the error-logging path itself blow up.
      logger.debug "(Could not log record, #{marc_to_s_exception})"
    end

    raise e
  end
end
299
+
300
# get a printable id from record for error logging.
# Maybe override this for a future XML version.
#
# Returns nil when the record is missing or has no '001' field.
def id_string(record)
  id_field = record && record['001']
  id_field && id_field.value.to_s
end
305
+
306
# Processes a stream of records, reading from the configured Reader,
# mapping according to configured mapping rules, and then writing
# to configured Writer.
#
# returns 'false' as a signal to command line to return non-zero exit code
# for some reason (reason found in logs, presumably). This particular mechanism
# is open to complexification, starting simple. We do need SOME way to return
# non-zero to command line.
#
def process(io_stream)
  settings.fill_in_defaults!

  count = 0
  start_time = batch_start_time = Time.now
  logger.debug "beginning Indexer#process with settings: #{settings.inspect}"

  reader = self.reader!(io_stream)
  writer = self.writer!


  # Mapping work is distributed over a thread pool sized by the
  # "processing_thread_pool" setting.
  processing_threads = settings["processing_thread_pool"].to_i
  thread_pool = Traject::ThreadPool.new(processing_threads)

  logger.info " Indexer with #{processing_threads} processing threads, reader: #{reader.class.name} and writer: #{writer.class.name}"

  # Optional periodic throughput logging, every log.batch_size records.
  log_batch_size = settings["log.batch_size"] && settings["log.batch_size"].to_i

  reader.each do |record; position|
    count += 1

    # have to use a block local var, so the changing `count` one
    # doesn't get caught in the closure. Weird, yeah.
    position = count

    # Surface any exception already collected from a worker thread.
    thread_pool.raise_collected_exception!

    if settings["debug_ascii_progress"].to_s == "true"
      $stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
    end

    if log_batch_size && (count % log_batch_size == 0)
      batch_rps = log_batch_size / (Time.now - batch_start_time)
      overall_rps = count / (Time.now - start_time)
      logger.send(settings["log.batch_size.severity"].downcase.to_sym, "Traject::Indexer#process, read #{count} records at id:#{id_string(record)}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
      batch_start_time = Time.now
    end

    # we have to use this weird lambda to properly "capture" the count, instead
    # of having it be bound to the original variable in a non-threadsafe way.
    # This is confusing, I might not be understanding things properly, but that's where i am.
    #thread_pool.maybe_in_thread_pool &make_lambda(count, record, writer)
    thread_pool.maybe_in_thread_pool(record, settings, position) do |record, settings, position|
      context = Context.new(:source_record => record, :settings => settings, :position => position)
      context.logger = logger
      map_to_context!(context)
      if context.skip?
        log_skip(context)
      else
        writer.put context
      end

    end

  end
  $stderr.write "\n" if settings["debug_ascii_progress"].to_s == "true"

  logger.debug "Shutting down #processing mapper threadpool..."
  thread_pool.shutdown_and_wait
  logger.debug "#processing mapper threadpool shutdown complete."

  # Surface any exception raised in a worker after shutdown, too.
  thread_pool.raise_collected_exception!


  writer.close if writer.respond_to?(:close)

  # Run all after_processing steps; any failure aborts (and is re-raised).
  @after_processing_steps.each do |step|
    begin
      step.execute
    rescue Exception => e
      logger.fatal("Unexpected exception #{e} when executing #{step}")
      raise e
    end
  end

  elapsed = Time.now - start_time
  avg_rps = (count / elapsed)
  logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."

  # A writer that skipped records (due to errors) means a non-zero
  # exit code for the command line.
  if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
    logger.error "Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
    return false
  end

  return true
end
401
+
402
# Log that the current record is being skipped, using
# data in context.position and context.skipmessage
def log_skip(context)
  message = "Skipped record #{context.position}: #{context.skipmessage}"
  logger.debug message
end
407
+
408
# Memoized Reader class: an explicitly injected class (via #reader_class=)
# wins; otherwise resolved once from the "reader_class_name" setting.
def reader_class
  @reader_class = qualified_const_get(settings["reader_class_name"]) unless defined?(@reader_class)
  @reader_class
end
414
+
415
# Memoized Writer class: an explicitly injected class (via #writer_class=)
# wins; otherwise resolved once from the "writer_class_name" setting.
def writer_class
  @writer_class = qualified_const_get(settings["writer_class_name"]) unless defined?(@writer_class)
  @writer_class
end
421
+
422
# Instantiate a Traject Reader, using class set
# in #reader_class, initialized with io_stream passed in.
# The logger is passed along in the settings under "logger".
def reader!(io_stream)
  reader_class.new(io_stream, settings.merge("logger" => logger))
end
427
+
428
# Instantiate a Traject Writer, using class set in #writer_class.
# The logger is passed along in the settings under "logger".
def writer!
  writer_class.new(settings.merge("logger" => logger))
end
432
+
433
# Represents the context of a specific record being indexed, passed
# to indexing logic blocks
#
class Context
  # hash_init: attribute-name => value pairs, each assigned through the
  # corresponding writer method (e.g. :source_record, :settings, :position).
  def initialize(hash_init = {})
    # TODO, argument checking for required args?
    self.clipboard   = {}
    self.output_hash = {}

    hash_init.each_pair { |attr_name, attr_value| send("#{attr_name}=", attr_value) }

    @skip = false
  end

  attr_accessor :clipboard, :output_hash, :logger
  attr_accessor :index_step, :source_record, :settings
  # 1-based position in stream of processed records.
  attr_accessor :position

  # Message recorded when this record was marked to be skipped.
  attr_accessor :skipmessage

  # Set the fact that this record should be skipped, with an
  # optional message
  def skip!(msg = '(no message given)')
    @skipmessage = msg
    @skip = true
  end

  # Should we skip this record?
  def skip?
    @skip
  end

end
471
+
472
+
473
+
474
# An indexing step definition, including its source location
# for logging.
#
# This one represents an "each_record" step; see ToFieldStep below
# for "to_field".
#
# source_location is just a string with filename and line number for
# showing to devs in debugging.
class EachRecordStep
  attr_accessor :source_location, :lambda, :block

  def initialize(lambda, block, source_location)
    self.lambda          = lambda
    self.block           = block
    self.source_location = source_location

    validate!
  end

  # raises if bad data
  def validate!
    unless self.lambda || self.block
      raise ArgumentError.new("Missing Argument: each_record must take a block/lambda as an argument (#{self.inspect})")
    end

    # Negative arity means variable/optional args -- trust those.
    # A positive arity must be 1 or 2.
    [self.lambda, self.block].compact.each do |callable|
      unless callable.is_a?(Proc)
        raise NamingError.new("argument to each_record must be a block/lambda, not a #{callable.class} #{self.inspect}")
      end
      if callable.arity == 0 || callable.arity > 2
        raise ArityError.new("block/proc given to each_record needs 1 or 2 arguments: #{self.inspect}")
      end
    end
  end

  # For each_record, always return an empty array as the
  # accumulator, since it doesn't have those kinds of side effects
  def execute(context)
    [@lambda, @block].compact.each do |callable|
      if callable.arity == 1
        callable.call(context.source_record)
      else
        callable.call(context.source_record, context)
      end
    end
    [] # empty -- no accumulator for each_record
  end

  # Over-ride inspect for outputting error messages etc.
  def inspect
    "(each_record at #{source_location})"
  end
end
534
+
535
+
536
# An indexing step definition for a "to_field" step to specific
# field.
class ToFieldStep
  attr_accessor :field_name, :lambda, :block, :source_location

  def initialize(fieldname, lambda, block, source_location)
    self.field_name      = fieldname
    self.lambda          = lambda
    self.block           = block
    self.source_location = source_location

    validate!
  end

  # raises if bad data
  def validate!

    if self.field_name.nil? || !self.field_name.is_a?(String) || self.field_name.empty?
      raise NamingError.new("to_field requires the field name (as a string) as the first argument at #{self.source_location})")
    end

    # Negative arity means variable/optional args -- trust those.
    # A positive arity must be 2 or 3.
    [self.lambda, self.block].compact.each do |callable|
      if callable.arity == 0 || callable.arity == 1 || callable.arity > 3
        raise ArityError.new("error parsing field '#{self.field_name}': block/proc given to to_field needs 2 or 3 (or variable) arguments: #{callable} (#{self.inspect})")
      end
    end
  end

  # Override inspect for developer debug messages
  def inspect
    "(to_field #{self.field_name} at #{self.source_location})"
  end

  def execute(context)
    accumulator = []
    [@lambda, @block].compact.each do |callable|
      if callable.arity == 2
        callable.call(context.source_record, accumulator)
      else
        callable.call(context.source_record, accumulator, context)
      end
    end
    accumulator
  end

end
585
+
586
# A class representing a block of logic called after
# processing, registered with #after_processing
class AfterProcessingStep
  attr_accessor :lambda, :block, :source_location

  # lambda and/or block may be nil; source_location is a "file:line"
  # string used only for debug display.
  def initialize(lambda, block, source_location)
    self.lambda = lambda
    self.block = block
    self.source_location = source_location
  end

  # after_processing steps get no args yielded to
  # their blocks, they just are what they are.
  def execute
    [lambda, block].each do |aProc|
      next unless aProc
      aProc.call
    end
  end

  # Developer debug representation. Fixed: the string was missing its
  # closing paren, inconsistent with EachRecordStep/ToFieldStep#inspect.
  def inspect
    "(after_processing at #{self.source_location})"
  end
end
609
+
610
+
611
+
612
+
613
+ end