traject 2.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (104) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +27 -0
  4. data/.yardopts +3 -0
  5. data/Gemfile +12 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +461 -0
  8. data/Rakefile +21 -0
  9. data/bench/bench.rb +30 -0
  10. data/bin/traject +16 -0
  11. data/doc/batch_execution.md +243 -0
  12. data/doc/extending.md +190 -0
  13. data/doc/indexing_rules.md +265 -0
  14. data/doc/other_commands.md +47 -0
  15. data/doc/settings.md +101 -0
  16. data/lib/tasks/load_maps.rake +48 -0
  17. data/lib/traject.rb +11 -0
  18. data/lib/traject/command_line.rb +301 -0
  19. data/lib/traject/csv_writer.rb +34 -0
  20. data/lib/traject/debug_writer.rb +47 -0
  21. data/lib/traject/delimited_writer.rb +110 -0
  22. data/lib/traject/indexer.rb +613 -0
  23. data/lib/traject/indexer/settings.rb +110 -0
  24. data/lib/traject/json_writer.rb +51 -0
  25. data/lib/traject/line_writer.rb +63 -0
  26. data/lib/traject/macros/basic.rb +9 -0
  27. data/lib/traject/macros/marc21.rb +223 -0
  28. data/lib/traject/macros/marc21_semantics.rb +584 -0
  29. data/lib/traject/macros/marc_format_classifier.rb +197 -0
  30. data/lib/traject/marc_extractor.rb +410 -0
  31. data/lib/traject/marc_reader.rb +89 -0
  32. data/lib/traject/mock_reader.rb +97 -0
  33. data/lib/traject/ndj_reader.rb +40 -0
  34. data/lib/traject/null_writer.rb +22 -0
  35. data/lib/traject/qualified_const_get.rb +40 -0
  36. data/lib/traject/solr_json_writer.rb +277 -0
  37. data/lib/traject/thread_pool.rb +161 -0
  38. data/lib/traject/translation_map.rb +267 -0
  39. data/lib/traject/util.rb +52 -0
  40. data/lib/traject/version.rb +3 -0
  41. data/lib/traject/yaml_writer.rb +9 -0
  42. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  43. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  44. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  45. data/lib/translation_maps/marc_geographic.yaml +589 -0
  46. data/lib/translation_maps/marc_instruments.yaml +102 -0
  47. data/lib/translation_maps/marc_languages.yaml +490 -0
  48. data/test/debug_writer_test.rb +38 -0
  49. data/test/delimited_writer_test.rb +104 -0
  50. data/test/indexer/each_record_test.rb +59 -0
  51. data/test/indexer/macros_marc21_semantics_test.rb +391 -0
  52. data/test/indexer/macros_marc21_test.rb +190 -0
  53. data/test/indexer/macros_test.rb +40 -0
  54. data/test/indexer/map_record_test.rb +209 -0
  55. data/test/indexer/read_write_test.rb +101 -0
  56. data/test/indexer/settings_test.rb +152 -0
  57. data/test/indexer/to_field_test.rb +77 -0
  58. data/test/marc_extractor_test.rb +412 -0
  59. data/test/marc_format_classifier_test.rb +98 -0
  60. data/test/marc_reader_test.rb +110 -0
  61. data/test/solr_json_writer_test.rb +248 -0
  62. data/test/test_helper.rb +90 -0
  63. data/test/test_support/245_no_ab.marc +1 -0
  64. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  65. data/test/test_support/bad_subfield_code.marc +1 -0
  66. data/test/test_support/bad_utf_byte.utf8.marc +1 -0
  67. data/test/test_support/date_resort_to_260.marc +1 -0
  68. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  69. data/test/test_support/date_with_u.marc +1 -0
  70. data/test/test_support/demo_config.rb +155 -0
  71. data/test/test_support/emptyish_record.marc +1 -0
  72. data/test/test_support/escaped_character_reference.marc8.marc +1 -0
  73. data/test/test_support/george_eliot.marc +1 -0
  74. data/test/test_support/hebrew880s.marc +1 -0
  75. data/test/test_support/louis_armstrong.marc +1 -0
  76. data/test/test_support/manufacturing_consent.marc +1 -0
  77. data/test/test_support/manuscript_online_thesis.marc +1 -0
  78. data/test/test_support/microform_online_conference.marc +1 -0
  79. data/test/test_support/multi_era.marc +1 -0
  80. data/test/test_support/multi_geo.marc +1 -0
  81. data/test/test_support/musical_cage.marc +1 -0
  82. data/test/test_support/nature.marc +1 -0
  83. data/test/test_support/one-marc8.mrc +1 -0
  84. data/test/test_support/online_only.marc +1 -0
  85. data/test/test_support/packed_041a_lang.marc +1 -0
  86. data/test/test_support/test_data.utf8.json +30 -0
  87. data/test/test_support/test_data.utf8.marc.xml +2609 -0
  88. data/test/test_support/test_data.utf8.mrc +1 -0
  89. data/test/test_support/test_data.utf8.mrc.gz +0 -0
  90. data/test/test_support/the_business_ren.marc +1 -0
  91. data/test/translation_map_test.rb +225 -0
  92. data/test/translation_maps/bad_ruby.rb +8 -0
  93. data/test/translation_maps/bad_yaml.yaml +1 -0
  94. data/test/translation_maps/both_map.rb +1 -0
  95. data/test/translation_maps/both_map.yaml +1 -0
  96. data/test/translation_maps/default_literal.rb +10 -0
  97. data/test/translation_maps/default_passthrough.rb +10 -0
  98. data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
  99. data/test/translation_maps/properties_map.properties +5 -0
  100. data/test/translation_maps/ruby_map.rb +10 -0
  101. data/test/translation_maps/translate_array_test.yaml +8 -0
  102. data/test/translation_maps/yaml_map.yaml +7 -0
  103. data/traject.gemspec +47 -0
  104. metadata +382 -0
@@ -0,0 +1,613 @@
1
+ require 'yell'
2
+
3
+ require 'traject'
4
+ require 'traject/qualified_const_get'
5
+ require 'traject/thread_pool'
6
+
7
+ require 'traject/indexer/settings'
8
+ require 'traject/marc_reader'
9
+ require 'traject/json_writer'
10
+ require 'traject/solr_json_writer'
11
+
12
+ require 'traject/macros/marc21'
13
+ require 'traject/macros/basic'
14
+
15
+ if defined? JRUBY_VERSION
16
+ require 'traject/marc4j_reader'
17
+ end
18
+
19
+ # This class does indexing for traject: Getting input records from a Reader
20
+ # class, mapping the input records to an output hash, and then sending the output
21
+ # hash off somewhere (usually Solr) with a Writer class.
22
+ #
23
+ # Traject config files are `instance_eval`d in an Indexer object, so `self` in
24
+ # a config file is an Indexer, and any Indexer methods can be called.
25
+ #
26
+ # However, certain Indexer methods exist almost entirely for the purpose of
27
+ # being called in config files; these methods are part of the expected
28
+ # Domain-Specific Language ("DSL") for config files, and will ordinarily
29
+ # form the bulk or entirety of config files:
30
+ #
31
+ # * #settings
32
+ # * #to_field
33
+ # * #each_record
34
+ # * #after_processing
35
+ # * #logger (rarely used in config files, but in some cases to set up custom logging config)
36
+ #
37
+ # If accessing a Traject::Indexer programmatically (instead of via command line with
38
+ # config files), additional methods of note include:
39
+ #
40
+ # # to process a stream of input records from configured Reader,
41
+ # # to configured Writer:
42
+ # indexer.process(io_stream)
43
+ #
44
+ # # To map a single input record manually to an output_hash,
45
+ # # ignoring Readers and Writers
46
+ # hash = indexer.map_record(record)
47
+ #
48
+ #
49
+ # ## Readers and Writers
50
+ #
51
+ # The Indexer has a modularized architecture for readers and writers, for where
52
+ # source records come from (reader), and where output is sent to (writer).
53
+ #
54
+ # A Reader is any class that:
55
+ # 1) Has a two-argument initializer taking an IO stream and a Settings hash
56
+ # 2) Responds to the usual ruby #each, returning a source record from each #each.
57
+ # (Including Enumerable is prob a good idea too)
58
+ #
59
+ # The default reader is the Traject::MarcReader, whose behavior is
60
+ # further customized by several settings in the Settings hash. Jruby users
61
+ # with specialized needs may want to look at the gem traject-marc4j_reader.
62
+ #
63
+ # Alternate readers can be set directly with the #reader_class= method, or
64
+ # with the "reader_class_name" Setting, a String name of a class
65
+ # meeting the reader contract.
66
+ #
67
+ #
68
+ # A Writer is any class that:
69
+ # 1) Has a one-argument initializer taking a Settings hash. (The logger
70
+ # is provided to the Writer in settings["logger"])
71
+ # 2) Responds to a one argument #put method, where the argument is
72
+ # a Traject::Indexer::Context, containing an #output_hash
73
+ # hash of mapped keys/values. The writer should write them
74
+ # to the appropriate place.
75
+ # 3) Responds to a #close method, called when we're done.
76
+ # 4) Optionally implements a #skipped_record_count method, returning int count of records
77
+ # that were skipped due to errors (and presumably logged)
78
+ #
79
+ # Traject packages one solr writer: traject/solr_json_writer, which sends
80
+ # in json format and works under both ruby and jruby, but only with solr version
81
+ # >= 3.2. To index to an older solr installation, you'll need to use jruby and
82
+ # install the gem traject-solrj_writer, which uses the solrj .jar underneath.
83
+ #
84
+ # You can set alternate writers by setting a Class object directly
85
+ # with the #writer_class method, or by the 'writer_class_name' Setting,
86
+ # with a String name of class meeting the Writer contract. There are several
87
+ # that ship with traject itself:
88
+ #
89
+ # * traject/json_writer (Traject::JsonWriter) -- write newline-delimited json files.
90
+ # * traject/yaml_writer (Traject::YamlWriter) -- write pretty yaml file; very human-readable
91
+ # * traject/debug_writer (Traject::DebugWriter) -- write a tab-delimited file where
92
+ # each line consists of the id, field, and value(s).
93
+ # * traject/delimited_writer and traject/csv_writer -- write character-delimited files
94
+ # (default is tab-delimited) or comma-separated-value files.
95
+ #
96
class Traject::Indexer

  # Raised when a block/lambda passed to an indexing step takes an
  # unsupported number of arguments (see EachRecordStep#validate! and
  # ToFieldStep#validate! below).
  class ArityError < ArgumentError; end

  # Raised when a step is registered with a bad name or argument -- e.g.
  # a to_field whose field name is not a non-empty String.
  class NamingError < ArgumentError; end

  # Provides #qualified_const_get, used to resolve reader/writer classes
  # from the String class names given in settings.
  include Traject::QualifiedConstGet

  # Allow direct assignment of reader/writer classes, as an alternative
  # to the "reader_class_name"/"writer_class_name" settings.
  attr_writer :reader_class, :writer_class

  # For now we hard-code these basic macro's included
  # TODO, make these added with extend per-indexer,
  # added by default but easily turned off (or have other
  # default macro modules provided)
  include Traject::Macros::Marc21
  include Traject::Macros::Basic
114
+
115
+
116
  # optional hash or Traject::Indexer::Settings object of settings.
  def initialize(arg_settings = {})
    # Settings.new wraps/copies whatever hash-like argument was given
    @settings = Settings.new(arg_settings)
    # to_field/each_record steps, executed in registration order per record
    @index_steps = []
    # steps run once, after the whole stream is processed (see #process)
    @after_processing_steps = []
  end
122
+
123
+ # Part of the config file DSL, for writing settings values.
124
+ #
125
+ # The Indexer's settings consist of a hash-like Traject::Settings
126
+ # object. The settings hash is *not* nested hashes, just one level
127
+ # of configuration settings. Keys are always strings, and by convention
128
+ # use "." for namespacing, eg `log.file`
129
+ #
130
+ # The settings method with no arguments returns that Settings object.
131
+ #
132
+ # With a hash and/or block argument, can be used to set
133
+ # new key/values. Each call merges onto the existing settings
134
+ # hash. The block is `instance_eval`d in the context
135
+ # of the Traject::Settings object.
136
+ #
137
+ # indexer.settings("a" => "a", "b" => "b")
138
+ #
139
+ # indexer.settings do
140
+ # provide "b", "new b"
141
+ # end
142
+ #
143
+ # indexer.settings #=> {"a" => "a", "b" => "new b"}
144
+ #
145
+ # Note the #provide method is defined on Traject::Settings to
146
+ # write to a setting only if previously not set. You can also
147
+ # use #store to force over-writing even if an existing setting.
148
+ #
149
+ # Even with arguments, Indexer#settings returns the Settings object,
150
+ # hash too, so can method calls can be chained.
151
+ #
152
+ def settings(new_settings = nil, &block)
153
+ @settings.merge!(new_settings) if new_settings
154
+
155
+ @settings.instance_eval &block if block
156
+
157
+ return @settings
158
+ end
159
+
160
+ # Part of DSL, used to define an indexing mapping. Register logic
161
+ # to be called for each record, and generate values for a particular
162
+ # output field.
163
+ def to_field(field_name, aLambda = nil, &block)
164
+ @index_steps << ToFieldStep.new(field_name, aLambda, block, Traject::Util.extract_caller_location(caller.first) )
165
+ end
166
+
167
+ # Part of DSL, register logic to be called for each record
168
+ def each_record(aLambda = nil, &block)
169
+ @index_steps << EachRecordStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first) )
170
+ end
171
+
172
+ # Part of DSL, register logic to be called once at the end
173
+ # of processing a stream of records.
174
+ def after_processing(aLambda = nil, &block)
175
+ @after_processing_steps << AfterProcessingStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first))
176
+ end
177
+
178
+ def logger
179
+ @logger ||= create_logger
180
+ end
181
+ attr_writer :logger
182
+
183
+
184
+ def logger_format
185
+ format = settings["log.format"] || "%d %5L %m"
186
+ format = case format
187
+ when "false" then false
188
+ when "" then nil
189
+ else format
190
+ end
191
+ end
192
+
193
+ # Create logger according to settings
194
+ def create_logger
195
+
196
+ logger_level = settings["log.level"] || "info"
197
+
198
+ # log everything to STDERR or specified logfile
199
+ logger = Yell::Logger.new(:null)
200
+ logger.format = logger_format
201
+ logger.level = logger_level
202
+
203
+ logger_destination = settings["log.file"] || "STDERR"
204
+ # We intentionally repeat the logger_level
205
+ # on the adapter, so it will stay there if overall level
206
+ # is changed.
207
+ case logger_destination
208
+ when "STDERR"
209
+ logger.adapter :stderr, level: logger_level, format: logger_format
210
+ when "STDOUT"
211
+ logger.adapter :stdout, level: logger_level, format: logger_format
212
+ else
213
+ logger.adapter :file, logger_destination, level: logger_level, format: logger_format
214
+ end
215
+
216
+
217
+ # ADDITIONALLY log error and higher to....
218
+ if settings["log.error_file"]
219
+ logger.adapter :file, settings["log.error_file"], :level => 'gte.error'
220
+ end
221
+
222
+ return logger
223
+ end
224
+
225
+ # Processes a single record according to indexing rules set up in
226
+ # this indexer. Returns the output hash (a hash whose keys are
227
+ # string fields, and values are arrays of one or more values in that field)
228
+ #
229
+ # This is a convenience shortcut for #map_to_context! -- use that one
230
+ # if you want to provide addtional context
231
+ # like position, and/or get back the full context.
232
+ def map_record(record)
233
+ context = Context.new(:source_record => record, :settings => settings)
234
+ map_to_context!(context)
235
+ return context.output_hash
236
+ end
237
+
238
+ # Maps a single record INTO the second argument, a Traject::Indexer::Context.
239
+ #
240
+ # Context must be passed with a #source_record and #settings, and optionally
241
+ # a #position.
242
+ #
243
+ # Context will be mutated by this method, most significantly by adding
244
+ # an #output_hash, a hash from fieldname to array of values in that field.
245
+ #
246
+ # Pass in a context with a set #position if you want that to be available
247
+ # to mapping routines.
248
+ #
249
+ # Returns the context passed in as second arg, as a convenience for chaining etc.
250
+
251
+ def map_to_context!(context)
252
+ @index_steps.each do |index_step|
253
+ # Don't bother if we're skipping this record
254
+ break if context.skip?
255
+
256
+ context.index_step = index_step
257
+ accumulator = log_mapping_errors(context, index_step) do
258
+ index_step.execute(context) # will always return [] for an each_record step
259
+ end
260
+
261
+ if accumulator.size > 0
262
+ accumulator.compact!
263
+ (context.output_hash[index_step.field_name] ||= []).concat accumulator
264
+ end
265
+
266
+ context.index_step = nil
267
+ end
268
+
269
+ return context
270
+ end
271
+
272
  # just a wrapper that captures and records any unexpected
  # errors raised in mapping, along with contextual information
  # on record and location in source file of mapping rule.
  #
  # Re-raises error at the moment.
  #
  #     log_mapping_errors(context, index_step) do
  #       all_sorts_of_stuff # that will have errors logged
  #     end
  def log_mapping_errors(context, index_step)
    begin
      yield
    rescue Exception => e
      # NOTE: rescuing Exception (not just StandardError) is deliberate here;
      # the error is logged with full context and then re-raised unchanged.
      msg = "Unexpected error on record id `#{id_string(context.source_record)}` at file position #{context.position}\n"
      msg += " while executing #{index_step.inspect}\n"
      msg += Traject::Util.exception_to_log_message(e)

      logger.error msg
      begin
        # The record's own #to_s can itself raise (e.g. on bad MARC data),
        # so guard the debug dump too.
        logger.debug "Record: " + context.source_record.to_s
      rescue Exception => marc_to_s_exception
        logger.debug "(Could not log record, #{marc_to_s_exception})"
      end

      raise e
    end
  end
299
+
300
+ # get a printable id from record for error logging.
301
+ # Maybe override this for a future XML version.
302
+ def id_string(record)
303
+ record && record['001'] && record['001'].value.to_s
304
+ end
305
+
306
  # Processes a stream of records, reading from the configured Reader,
  # mapping according to configured mapping rules, and then writing
  # to configured Writer.
  #
  # returns 'false' as a signal to command line to return non-zero exit code
  # for some reason (reason found in logs, presumably). This particular mechanism
  # is open to complexification, starting simple. We do need SOME way to return
  # non-zero to command line.
  #
  def process(io_stream)
    settings.fill_in_defaults!

    count = 0
    start_time = batch_start_time = Time.now
    logger.debug "beginning Indexer#process with settings: #{settings.inspect}"

    reader = self.reader!(io_stream)
    writer = self.writer!

    # Mapping work is parallelized across a fixed-size pool; size 0/absent
    # degrades to in-thread execution inside Traject::ThreadPool.
    processing_threads = settings["processing_thread_pool"].to_i
    thread_pool = Traject::ThreadPool.new(processing_threads)

    logger.info " Indexer with #{processing_threads} processing threads, reader: #{reader.class.name} and writer: #{writer.class.name}"

    # nil when "log.batch_size" is unset -- disables periodic progress logging
    log_batch_size = settings["log.batch_size"] && settings["log.batch_size"].to_i

    # `; position` declares a block-local variable, see comment below
    reader.each do |record; position|
      count += 1

      # have to use a block local var, so the changing `count` one
      # doesn't get caught in the closure. Weird, yeah.
      position = count

      # Surface any exception a previous pool task recorded
      thread_pool.raise_collected_exception!

      if settings["debug_ascii_progress"].to_s == "true"
        $stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
      end

      if log_batch_size && (count % log_batch_size == 0)
        batch_rps = log_batch_size / (Time.now - batch_start_time)
        overall_rps = count / (Time.now - start_time)
        logger.send(settings["log.batch_size.severity"].downcase.to_sym, "Traject::Indexer#process, read #{count} records at id:#{id_string(record)}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
        batch_start_time = Time.now
      end

      # we have to use this weird lambda to properly "capture" the count, instead
      # of having it be bound to the original variable in a non-threadsafe way.
      # This is confusing, I might not be understanding things properly, but that's where i am.
      #thread_pool.maybe_in_thread_pool &make_lambda(count, record, writer)
      thread_pool.maybe_in_thread_pool(record, settings, position) do |record, settings, position|
        context = Context.new(:source_record => record, :settings => settings, :position => position)
        context.logger = logger
        map_to_context!(context)
        if context.skip?
          log_skip(context)
        else
          writer.put context
        end

      end

    end
    $stderr.write "\n" if settings["debug_ascii_progress"].to_s == "true"

    logger.debug "Shutting down #processing mapper threadpool..."
    thread_pool.shutdown_and_wait
    logger.debug "#processing mapper threadpool shutdown complete."

    # Re-check after shutdown so failures from the final tasks propagate
    thread_pool.raise_collected_exception!


    writer.close if writer.respond_to?(:close)

    @after_processing_steps.each do |step|
      begin
        step.execute
      rescue Exception => e
        logger.fatal("Unexpected exception #{e} when executing #{step}")
        raise e
      end
    end

    elapsed = Time.now - start_time
    avg_rps = (count / elapsed)
    logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."

    if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
      logger.error "Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
      return false
    end

    return true
  end
401
+
402
+ # Log that the current record is being skipped, using
403
+ # data in context.position and context.skipmessage
404
+ def log_skip(context)
405
+ logger.debug "Skipped record #{context.position}: #{context.skipmessage}"
406
+ end
407
+
408
+ def reader_class
409
+ unless defined? @reader_class
410
+ @reader_class = qualified_const_get(settings["reader_class_name"])
411
+ end
412
+ return @reader_class
413
+ end
414
+
415
+ def writer_class
416
+ unless defined? @writer_class
417
+ @writer_class = qualified_const_get(settings["writer_class_name"])
418
+ end
419
+ return @writer_class
420
+ end
421
+
422
+ # Instantiate a Traject Reader, using class set
423
+ # in #reader_class, initialized with io_stream passed in
424
+ def reader!(io_stream)
425
+ return reader_class.new(io_stream, settings.merge("logger" => logger))
426
+ end
427
+
428
+ # Instantiate a Traject Writer, suing class set in #writer_class
429
+ def writer!
430
+ return writer_class.new(settings.merge("logger" => logger))
431
+ end
432
+
433
+ # Represents the context of a specific record being indexed, passed
434
+ # to indexing logic blocks
435
+ #
436
+ class Context
437
+ def initialize(hash_init = {})
438
+ # TODO, argument checking for required args?
439
+
440
+ self.clipboard = {}
441
+ self.output_hash = {}
442
+
443
+ hash_init.each_pair do |key, value|
444
+ self.send("#{key}=", value)
445
+ end
446
+
447
+ @skip = false
448
+ end
449
+
450
+ attr_accessor :clipboard, :output_hash, :logger
451
+ attr_accessor :index_step, :source_record, :settings
452
+ # 1-based position in stream of processed records.
453
+ attr_accessor :position
454
+
455
+ # Should we be skipping this record?
456
+ attr_accessor :skipmessage
457
+
458
+ # Set the fact that this record should be skipped, with an
459
+ # optional message
460
+ def skip!(msg = '(no message given)')
461
+ @skipmessage = msg
462
+ @skip = true
463
+ end
464
+
465
+ # Should we skip this record?
466
+ def skip?
467
+ @skip
468
+ end
469
+
470
+ end
471
+
472
+
473
+
474
+ # An indexing step definition, including it's source location
475
+ # for logging
476
+ #
477
+ # This one represents an "each_record" step, a subclass below
478
+ # for "to_field"
479
+ #
480
+ # source_location is just a string with filename and line number for
481
+ # showing to devs in debugging.
482
+ class EachRecordStep
483
+ attr_accessor :source_location, :lambda, :block
484
+
485
+ def initialize(lambda, block, source_location)
486
+ self.lambda = lambda
487
+ self.block = block
488
+ self.source_location = source_location
489
+
490
+ self.validate!
491
+ end
492
+
493
+ # raises if bad data
494
+ def validate!
495
+ unless self.lambda or self.block
496
+ raise ArgumentError.new("Missing Argument: each_record must take a block/lambda as an argument (#{self.inspect})")
497
+ end
498
+
499
+ [self.lambda, self.block].each do |proc|
500
+ # allow negative arity, meaning variable/optional, trust em on that.
501
+ # but for positive arrity, we need 1 or 2 args
502
+ if proc
503
+ unless proc.is_a?(Proc)
504
+ raise NamingError.new("argument to each_record must be a block/lambda, not a #{proc.class} #{self.inspect}")
505
+ end
506
+ if (proc.arity == 0 || proc.arity > 2)
507
+ raise ArityError.new("block/proc given to each_record needs 1 or 2 arguments: #{self.inspect}")
508
+ end
509
+ end
510
+ end
511
+ end
512
+
513
+ # For each_record, always return an empty array as the
514
+ # accumulator, since it doesn't have those kinds of side effects
515
+ def execute(context)
516
+ [@lambda, @block].each do |aProc|
517
+ next unless aProc
518
+
519
+ if aProc.arity == 1
520
+ aProc.call(context.source_record)
521
+ else
522
+ aProc.call(context.source_record, context)
523
+ end
524
+
525
+ end
526
+ return [] # empty -- no accumulator for each_record
527
+ end
528
+
529
+ # Over-ride inspect for outputting error messages etc.
530
+ def inspect
531
+ "(each_record at #{source_location})"
532
+ end
533
+ end
534
+
535
+
536
+ # An indexing step definition for a "to_field" step to specific
537
+ # field.
538
+ class ToFieldStep
539
+ attr_accessor :field_name, :lambda, :block, :source_location
540
+ def initialize(fieldname, lambda, block, source_location)
541
+ self.field_name = fieldname
542
+ self.lambda = lambda
543
+ self.block = block
544
+ self.source_location = source_location
545
+
546
+ validate!
547
+ end
548
+
549
+ def validate!
550
+
551
+ if self.field_name.nil? || !self.field_name.is_a?(String) || self.field_name.empty?
552
+ raise NamingError.new("to_field requires the field name (as a string) as the first argument at #{self.source_location})")
553
+ end
554
+
555
+ [self.lambda, self.block].each do |proc|
556
+ # allow negative arity, meaning variable/optional, trust em on that.
557
+ # but for positive arrity, we need 2 or 3 args
558
+ if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
559
+ raise ArityError.new("error parsing field '#{self.field_name}': block/proc given to to_field needs 2 or 3 (or variable) arguments: #{proc} (#{self.inspect})")
560
+ end
561
+ end
562
+ end
563
+
564
+ # Override inspect for developer debug messages
565
+ def inspect
566
+ "(to_field #{self.field_name} at #{self.source_location})"
567
+ end
568
+
569
+ def execute(context)
570
+ accumulator = []
571
+ [@lambda, @block].each do |aProc|
572
+ next unless aProc
573
+
574
+ if aProc.arity == 2
575
+ aProc.call(context.source_record, accumulator)
576
+ else
577
+ aProc.call(context.source_record, accumulator, context)
578
+ end
579
+
580
+ end
581
+ return accumulator
582
+ end
583
+
584
+ end
585
+
586
+ # A class representing a block of logic called after
587
+ # processing, registered with #after_processing
588
+ class AfterProcessingStep
589
+ attr_accessor :lambda, :block, :source_location
590
+ def initialize(lambda, block, source_location)
591
+ self.lambda = lambda
592
+ self.block = block
593
+ self.source_location = source_location
594
+ end
595
+
596
+ # after_processing steps get no args yielded to
597
+ # their blocks, they just are what they are.
598
+ def execute
599
+ [lambda, block].each do |aProc|
600
+ next unless aProc
601
+ aProc.call
602
+ end
603
+ end
604
+
605
+ def inspect
606
+ "(after_processing at #{self.source_location}"
607
+ end
608
+ end
609
+
610
+
611
+
612
+
613
+ end