traject 2.3.4 → 3.0.0.alpha.1

This diff shows the changes between publicly available package versions as released to one of the supported registries. It is provided for informational purposes only, reflecting the packages as they appear in their respective public registries.
Files changed (69)
  1. checksums.yaml +5 -5
  2. data/.travis.yml +16 -9
  3. data/CHANGES.md +74 -1
  4. data/Gemfile +2 -1
  5. data/README.md +104 -53
  6. data/Rakefile +8 -1
  7. data/doc/indexing_rules.md +79 -63
  8. data/doc/programmatic_use.md +218 -0
  9. data/doc/settings.md +28 -1
  10. data/doc/xml.md +134 -0
  11. data/lib/traject.rb +5 -0
  12. data/lib/traject/array_writer.rb +34 -0
  13. data/lib/traject/command_line.rb +18 -22
  14. data/lib/traject/debug_writer.rb +2 -5
  15. data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
  16. data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
  17. data/lib/traject/indexer.rb +321 -92
  18. data/lib/traject/indexer/context.rb +39 -13
  19. data/lib/traject/indexer/marc_indexer.rb +30 -0
  20. data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
  21. data/lib/traject/indexer/settings.rb +36 -53
  22. data/lib/traject/indexer/step.rb +27 -33
  23. data/lib/traject/macros/marc21.rb +37 -12
  24. data/lib/traject/macros/nokogiri_macros.rb +43 -0
  25. data/lib/traject/macros/transformation.rb +162 -0
  26. data/lib/traject/marc_extractor.rb +2 -0
  27. data/lib/traject/ndj_reader.rb +1 -1
  28. data/lib/traject/nokogiri_reader.rb +179 -0
  29. data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
  30. data/lib/traject/solr_json_writer.rb +19 -12
  31. data/lib/traject/thread_pool.rb +13 -0
  32. data/lib/traject/util.rb +14 -2
  33. data/lib/traject/version.rb +1 -1
  34. data/test/debug_writer_test.rb +3 -3
  35. data/test/delimited_writer_test.rb +3 -3
  36. data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
  37. data/test/indexer/context_test.rb +23 -13
  38. data/test/indexer/error_handler_test.rb +59 -0
  39. data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
  40. data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
  41. data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
  42. data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
  43. data/test/indexer/macros/to_field_test.rb +2 -2
  44. data/test/indexer/macros/transformation_test.rb +177 -0
  45. data/test/indexer/map_record_test.rb +2 -3
  46. data/test/indexer/nokogiri_indexer_test.rb +103 -0
  47. data/test/indexer/process_record_test.rb +55 -0
  48. data/test/indexer/process_with_test.rb +148 -0
  49. data/test/indexer/read_write_test.rb +52 -2
  50. data/test/indexer/settings_test.rb +34 -24
  51. data/test/indexer/to_field_test.rb +27 -2
  52. data/test/marc_extractor_test.rb +7 -7
  53. data/test/marc_reader_test.rb +4 -4
  54. data/test/nokogiri_reader_test.rb +158 -0
  55. data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
  56. data/test/solr_json_writer_test.rb +24 -28
  57. data/test/test_helper.rb +8 -2
  58. data/test/test_support/namespace-test.xml +7 -0
  59. data/test/test_support/nokogiri_demo_config.rb +17 -0
  60. data/test/test_support/oai-pmh-one-record-2.xml +24 -0
  61. data/test/test_support/oai-pmh-one-record-first.xml +24 -0
  62. data/test/test_support/sample-oai-no-namespace.xml +197 -0
  63. data/test/test_support/sample-oai-pmh.xml +197 -0
  64. data/test/thread_pool_test.rb +38 -0
  65. data/test/translation_map_test.rb +3 -3
  66. data/test/translation_maps/ruby_map.rb +2 -1
  67. data/test/translation_maps/yaml_map.yaml +2 -1
  68. data/traject.gemspec +4 -11
  69. metadata +92 -6
data/lib/traject/debug_writer.rb

@@ -40,12 +40,9 @@ class Traject::DebugWriter < Traject::LineWriter
     @idfield = settings["debug_writer.idfield"] || DEFAULT_IDFIELD
     @format = settings['debug_writer.format'] || DEFAULT_FORMAT
 
-    if @idfield == 'record_position' then
-      @use_position = true
-    end
+    @use_position = (@idfield == 'record_position')
 
     @already_threw_warning_about_missing_id = false
-
   end
 
   def record_number(context)
@@ -54,7 +51,7 @@ class Traject::DebugWriter < Traject::LineWriter
       context.output_hash[@idfield].first
     else
       unless @already_threw_warning_about_missing_id
-        context.logger.warn "At least one record (##{context.position}) doesn't define field '#{@idfield}'.
+        context.logger.warn "At least one record (#{context.record_inspect}) doesn't define field '#{@idfield}'.
         All records are assumed to have a unique id. You can set which field to look in via the setting 'debug_writer.idfield'"
         @already_threw_warning_about_missing_id = true
       end
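
Note on the DebugWriter change above: `debug_writer.idfield` chooses which output field supplies the id column, with the special value `record_position` now just setting a flag. A minimal sketch of exercising it (the output path here is hypothetical):

    require 'traject'

    indexer = Traject::Indexer.new(
      "writer_class_name"    => "Traject::DebugWriter",
      "output_file"          => "debug_output.txt",    # hypothetical path
      "debug_writer.idfield" => "record_position"      # use each record's position as its id
    )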
data/lib/traject/experimental_nokogiri_streaming_reader.rb

@@ -0,0 +1,276 @@
+module Traject
+  # An EXPERIMENTAL HALF-FINISHED implementation of a streaming/pull reader using Nokogiri.
+  # Not ready for use, not stable API, could go away.
+  #
+  # This was my first try at a NokogiriReader implementation, it didn't work out, at least without
+  # a lot more work. I think we'd need to re-do it to build the Nokogiri::XML::Nodes by hand as the
+  # source is traversed, instead of relying on #outer_xml -- outer_xml returning a string results in a double-parsing,
+  # with the expected 50% performance hit. Peccadilloes in Nokogiri JRuby namespace handling don't help.
+  #
+  # All in all, it's possible something could be gotten here with a lot more work, it's also possible
+  # Nokogiri's antipathy to namespaces could keep getting in the way.
+  class ExperimentalNokogiriStreamingReader
+    include Enumerable
+
+    attr_reader :settings, :input_stream, :clipboard, :path_tracker
+
+    def initialize(input_stream, settings)
+      @settings = Traject::Indexer::Settings.new settings
+      @input_stream = input_stream
+      @clipboard = Traject::Util.is_jruby? ? Concurrent::Map.new : Concurrent::Hash.new
+
+      if each_record_xpath
+        @path_tracker = PathTracker.new(each_record_xpath,
+                                        clipboard: self.clipboard,
+                                        namespaces: default_namespaces,
+                                        extra_xpath_hooks: extra_xpath_hooks)
+      end
+
+      default_namespaces # trigger validation
+      validate_limited_xpath(each_record_xpath, key_name: "each_record_xpath")
+
+    end
+
+    def each_record_xpath
+      @each_record_xpath ||= settings["nokogiri.each_record_xpath"]
+    end
+
+    def extra_xpath_hooks
+      @extra_xpath_hooks ||= begin
+        (settings["nokogiri_reader.extra_xpath_hooks"] || {}).tap do |hash|
+          hash.each_pair do |limited_xpath, callable|
+            validate_limited_xpath(limited_xpath, key_name: "nokogiri_reader.extra_xpath_hooks")
+          end
+        end
+      end
+    end
+
+    protected def validate_limited_xpath(each_record_xpath, key_name:)
+      return unless each_record_xpath
+
+      components = each_record_xpath.split('/')
+      components.each do |component|
+        prefix, element = component.split(':')
+        unless element
+          # there was no namespace
+          prefix, element = nil, prefix
+        end
+
+        # We don't support brackets or any xpath beyond the MOST simple.
+        # Catch a few we can catch.
+        if element =~ /::/ || element =~ /[\[\]]/
+          raise ArgumentError, "#{key_name}: Only very simple xpaths supported. '//some/path' or '/some/path'. Not: #{each_record_xpath.inspect}"
+        end
+
+        if prefix
+          ns_uri = default_namespaces[prefix]
+          if ns_uri.nil?
+            raise ArgumentError, "each_record_xpath: Can't find namespace prefix '#{prefix}' in '#{each_record_xpath}'. To use a namespace in each_record_xpath, it has to be registered with nokogiri.namespaces: #{default_namespaces.inspect}"
+          end
+        end
+      end
+
+      each_record_xpath
+    end
+
+
+    def default_namespaces
+      @default_namespaces ||= (settings["nokogiri.namespaces"] || {}).tap { |ns|
+        unless ns.kind_of?(Hash)
+          raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{ns.inspect}"
+        end
+      }
+    end
+
+    def each
+      unless each_record_xpath
+        # forget streaming, just read it and return it once, done.
+        yield Nokogiri::XML.parse(input_stream)
+        return
+      end
+
+      reader = Nokogiri::XML::Reader(input_stream)
+
+      reader.each do |reader_node|
+        if reader_node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
+          path_tracker.push(reader_node)
+
+          if path_tracker.match?
+            yield path_tracker.current_node_doc
+          end
+          path_tracker.run_extra_xpath_hooks
+
+          if reader_node.self_closing?
+            path_tracker.pop
+          end
+        end
+
+        if reader_node.node_type == Nokogiri::XML::Reader::TYPE_END_ELEMENT
+          path_tracker.pop
+        end
+      end
+    end
+
+    private
+
+    # initialized with the specification (a very small subset of xpath) for
+    # what records to yield-on-each. Tests to see if a Nokogiri::XML::Reader
+    # node matches spec.
+    #
+    #     '//record'
+    # or anchored to root:
+    #     '/body/head/meta' same thing as './body/head/meta' or 'head/meta'
+    #
+    # Elements can (and must, to match) have XML namespaces, if and only if
+    # they are registered with settings nokogiri.namespaces
+    #
+    # sadly JRuby Nokogiri has an incompatibility with true nokogiri, and
+    # doesn't preserve our namespaces on outer_xml,
+    # so in JRuby we have to track them ourselves, and then also do yet ANOTHER
+    # parse in nokogiri. This may make this in Java even LESS performant, I'm afraid.
+    class PathTracker
+      attr_reader :path_spec, :inverted_namespaces, :current_path, :namespaces_stack, :extra_xpath_hooks, :clipboard
+      def initialize(str_spec, clipboard:, namespaces: {}, extra_xpath_hooks: {})
+        @inverted_namespaces = namespaces.invert
+        @clipboard = clipboard
+        # We're guessing using a string will be more efficient than an array
+        @current_path = ""
+        @floating = false
+
+        @path_spec, @floating = parse_path(str_spec)
+
+        @namespaces_stack = []
+
+
+        @extra_xpath_hooks = extra_xpath_hooks.collect do |path, callable|
+          bare_path, floating = parse_path(path)
+          {
+            path: bare_path,
+            floating: floating,
+            callable: callable
+          }
+        end
+      end
+
+      # returns [bare_path, is_floating]
+      protected def parse_path(str_spec)
+        floating = false
+
+        if str_spec.start_with?('//')
+          str_spec = str_spec.slice(2..-1)
+          floating = true
+        else
+          str_spec = str_spec.slice(1..-1) if str_spec.start_with?(".")
+          str_spec = "/" + str_spec unless str_spec.start_with?("/")
+        end
+
+        return [str_spec, floating]
+      end
+
+      def is_jruby?
+        Traject::Util.is_jruby?
+      end
+
+      # adds a component to slash-separated current_path, with namespace prefix.
+      def push(reader_node)
+        namespace_prefix = reader_node.namespace_uri && inverted_namespaces[reader_node.namespace_uri]
+
+        # gah, reader_node.name has the namespace prefix in there
+        node_name = reader_node.name.gsub(/[^:]+:/, '')
+
+        node_str = if namespace_prefix
+          namespace_prefix + ":" + node_name
+        else
+          reader_node.name
+        end
+
+        current_path << ("/" + node_str)
+
+        if is_jruby?
+          namespaces_stack << reader_node.namespaces
+        end
+        @current_node = reader_node
+      end
+
+      def current_node_doc
+        return nil unless @current_node
+
+        # yeah, sadly we got to have nokogiri parse it again
+        fix_namespaces(Nokogiri::XML.parse(@current_node.outer_xml))
+      end
+
+      # removes the last slash-separated component from current_path
+      def pop
+        current_path.slice!( current_path.rindex('/')..-1 )
+        @current_node = nil
+
+        if is_jruby?
+          namespaces_stack.pop
+        end
+      end
+
+      def floating?
+        !!@floating
+      end
+
+      def match?
+        match_path?(path_spec, floating: floating?)
+      end
+
+      def match_path?(path_to_match, floating:)
+        if floating
+          current_path.end_with?(path_to_match)
+        else
+          current_path == path_to_match
+        end
+      end
+
+      def run_extra_xpath_hooks
+        return unless @current_node
+
+        extra_xpath_hooks.each do |hook_spec|
+          if match_path?(hook_spec[:path], floating: hook_spec[:floating])
+            hook_spec[:callable].call(current_node_doc, clipboard)
+          end
+        end
+      end
+
+      # no-op unless it's jruby, and then we use our namespace stack to
+      # correctly add namespaces to the Nokogiri::XML::Document, cause
+      # in Jruby outer_xml on the Reader doesn't do it for us. :(
+      def fix_namespaces(doc)
+        if is_jruby?
+          # Only needed in jruby, nokogiri's jruby implementation isn't weird
+          # around namespaces in exactly the same way as MRI. We need to keep
+          # track of the namespaces in outer contexts ourselves, and then see
+          # if they are needed ourselves. :(
+          namespaces = namespaces_stack.compact.reduce({}, :merge)
+          default_ns = namespaces.delete("xmlns")
+
+          namespaces.each_pair do |attrib, uri|
+            ns_prefix = attrib.sub(/\Axmlns:/, '')
+
+            # gotta make sure it's actually used in the doc to not add it
+            # unnecessarily. GAH.
+            if doc.xpath("//*[starts-with(name(), '#{ns_prefix}:')][1]").empty? &&
+               doc.xpath("//@*[starts-with(name(), '#{ns_prefix}:')][1]").empty?
+              next
+            end
+            doc.root.add_namespace_definition(ns_prefix, uri)
+          end
+
+          if default_ns
+            doc.root.default_namespace = default_ns
+            # OMG nokogiri, really?
+            default_ns = doc.root.namespace
+            doc.xpath("//*[namespace-uri()='']").each do |node|
+              node.namespace = default_ns
+            end
+          end
+
+        end
+        return doc
+      end
+    end
+  end
+end
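
Note: the reader above only supports a tiny xpath subset for `nokogiri.each_record_xpath` -- plain `/`-separated, optionally namespace-prefixed element names, with prefixes registered in `nokogiri.namespaces`. A minimal sketch of driving it directly (the file name and namespace mapping are illustrative):

    reader = Traject::ExperimentalNokogiriStreamingReader.new(
      File.open("records.xml"),    # hypothetical input
      "nokogiri.each_record_xpath" => "//oai:record",
      "nokogiri.namespaces"        => { "oai" => "http://www.openarchives.org/OAI/2.0/" }
    )

    reader.each do |record_doc|
      # each yielded record is a re-parsed Nokogiri::XML::Document
      puts record_doc.root.name
    end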
data/lib/traject/hashie/indifferent_access_fix.rb

@@ -0,0 +1,25 @@
+require 'hashie'
+
+module Traject
+  module Hashie
+    # Backporting fix from https://github.com/intridea/hashie/commit/a82c594710e1bc9460d3de4d2989cb700f4c3c7f
+    # into Hashie.
+    #
+    # This makes merge(ordinary_hash) on a Hash that has IndifferentAccess included work, without
+    # raising. Which we needed.
+    #
+    # As of this writing that fix is not available in a Hashie release; if it becomes available
+    # later, then this monkey-patch may no longer be required, we can just depend on the fixed version.
+    #
+    # See also https://github.com/intridea/hashie/issues/451
+    module IndifferentAccessFix
+      def merge(*args)
+        result = super
+        ::Hashie::Extensions::IndifferentAccess.inject!(result) if hash_lacking_indifference?(result)
+        result.convert!
+      end
+    end
+  end
+end
+Hashie::Extensions::IndifferentAccess.include(Traject::Hashie::IndifferentAccessFix)
+
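
Note: following Hashie's documented extension pattern, the behavior this backport is after can be sketched like this (the class name is illustrative, and this is an assumption-laden sketch of the scenario from the linked issue, not code from the gem):

    require 'hashie'
    require 'traject/hashie/indifferent_access_fix'

    class IndifferentHash < Hash
      include Hashie::Extensions::MergeInitializer
      include Hashie::Extensions::IndifferentAccess
    end

    h      = IndifferentHash.new("a" => 1)
    merged = h.merge("b" => 2)    # with the patch, merging a plain hash keeps indifferent access
    merged[:a] # => 1
    merged[:b] # => 2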
data/lib/traject/indexer.rb

@@ -11,14 +11,14 @@ require 'traject/marc_reader'
 require 'traject/json_writer'
 require 'traject/solr_json_writer'
 require 'traject/debug_writer'
-
+require 'traject/array_writer'
 
 require 'traject/macros/marc21'
 require 'traject/macros/basic'
+require 'traject/macros/transformation'
+
+require 'traject/indexer/marc_indexer'
 
-if defined? JRUBY_VERSION
-  require 'traject/marc4j_reader'
-end
 
 # This class does indexing for traject: Getting input records from a Reader
 # class, mapping the input records to an output hash, and then sending the output
@@ -157,33 +157,39 @@ end
 # inconvenient for you, we'd like to know your use case and improve things.
 #
 class Traject::Indexer
-
-  # Arity error on a passed block
-  class ArityError < ArgumentError;
-  end
-  class NamingError < ArgumentError;
-  end
-
+  CompletedStateError = Class.new(StandardError)
+  ArityError          = Class.new(ArgumentError)
+  NamingError         = Class.new(ArgumentError)
 
   include Traject::QualifiedConstGet
+  extend Traject::QualifiedConstGet
 
   attr_writer :reader_class, :writer_class, :writer
 
-  # For now we hard-code these basic macro's included
-  # TODO, make these added with extend per-indexer,
-  # added by default but easily turned off (or have other
-  # default macro modules provided)
-  include Traject::Macros::Marc21
   include Traject::Macros::Basic
+  include Traject::Macros::Transformation
 
 
   # optional hash or Traject::Indexer::Settings object of settings.
-  def initialize(arg_settings = {})
-    @settings = Settings.new(arg_settings)
+  # optionally takes a block which is instance_eval'd in the indexer,
+  # intended for configuration similar to what would be in a config file.
+  def initialize(arg_settings = {}, &block)
+    @writer_class = nil
+    @completed = false
+    @settings = Settings.new(arg_settings).with_defaults(self.class.default_settings)
     @index_steps = []
     @after_processing_steps = []
+
+    instance_eval(&block) if block
+  end
+
+  # Right now just does an `instance_eval`, but encouraged in case we change the underlying
+  # implementation later, and to make intent more clear.
+  def configure(&block)
+    instance_eval(&block)
   end
 
+
   # Pass a string file path, a Pathname, or a File object, for
   # a config file to load into indexer.
   #
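
Note: the new block argument and #configure are both instance_eval'd against the indexer, so anything legal in a config file works inline. A minimal sketch (the field name and lambda are illustrative):

    indexer = Traject::Indexer.new("writer_class_name" => "Traject::ArrayWriter") do
      to_field "example", lambda { |record, accumulator| accumulator << "value" }
    end

    indexer.configure do
      each_record do |record, context|
        # further configuration, same DSL as a config file
      end
    end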
@@ -234,16 +240,81 @@ class Traject::Indexer
   def settings(new_settings = nil, &block)
     @settings.merge!(new_settings) if new_settings
 
-    @settings.instance_eval &block if block_given?
+    @settings.instance_eval(&block) if block_given?
 
     return @settings
   end
 
+  # Hash is frozen to avoid inheritance-mutability confusion.
+  def self.default_settings
+    @default_settings ||= {
+      # Writer defaults
+      "writer_class_name" => "Traject::SolrJsonWriter",
+      "solr_writer.batch_size" => 100,
+      "solr_writer.thread_pool" => 1,
+
+      # Threading and logging
+      "processing_thread_pool" => Traject::Indexer::Settings.default_processing_thread_pool,
+      "log.batch_size.severity" => "info",
+
+      # how to post-process the accumulator
+      "allow_nil_values" => false,
+      "allow_duplicate_values" => true,
+
+      "allow_empty_fields" => false
+    }.freeze
+  end
+
+
+  # Not sure if allowing changing of default_settings is a good idea, but we do
+  # use it in test. For now we make it private to require extreme measures to do it,
+  # and advertise that this API could go away or change without a major version release;
+  # it is experimental and internal.
+  private_class_method def self.default_settings=(settings)
+    @default_settings = settings
+  end
+
+  # Sub-classes should override to return a _proc_ object that takes one arg,
+  # a source record, and returns an identifier for it that can be used in
+  # logged messages. This differs depending on input record format, which is why we
+  # leave it to sub-classes.
+  def source_record_id_proc
+    if defined?(@@legacy_marc_mode) && @@legacy_marc_mode
+      return @source_record_id_proc ||= lambda do |source_marc_record|
+        if ( source_marc_record &&
+             source_marc_record.kind_of?(MARC::Record) &&
+             source_marc_record['001'] )
+          source_marc_record['001'].value
+        end
+      end
+    end
+
+    @source_record_id_proc ||= lambda { |source| nil }
+  end
+
+  def self.legacy_marc_mode!
+    @@legacy_marc_mode = true
+    # include legacy Marc macros
+    include Traject::Macros::Marc21
+
+    # Reader defaults
+    legacy_settings = {
+      "reader_class_name" => "Traject::MarcReader",
+      "marc_source.type" => "binary",
+    }
+
+    # default_settings is frozen, so build a new merged hash rather than mutating it
+    self.default_settings = default_settings.merge(legacy_settings)
+
+    self
+  end
+
   # Part of DSL, used to define an indexing mapping. Register logic
   # to be called for each record, and generate values for a particular
-  # output field.
-  def to_field(field_name, aLambda = nil, &block)
-    @index_steps << ToFieldStep.new(field_name, aLambda, block, Traject::Util.extract_caller_location(caller.first))
+  # output field. The first field_name argument can be a single string, or
+  # an array of multiple strings -- in the latter case, the processed values
+  # will be added to each field mentioned.
+  def to_field(field_name, *procs, &block)
+    @index_steps << ToFieldStep.new(field_name, procs, block, Traject::Util.extract_caller_location(caller.first))
   end
 
   # Part of DSL, register logic to be called for each record
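
Note: with the new `to_field(field_name, *procs, &block)` signature, transformation lambdas chain left to right over the shared accumulator, and an array of field names fans the result out to each field. A minimal sketch with illustrative lambdas:

    extract = lambda { |record, accumulator| accumulator << "raw value" }
    upcase  = lambda { |record, accumulator| accumulator.map!(&:upcase) }

    to_field ["field_a", "field_b"], extract, upcase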
@@ -313,14 +384,33 @@ class Traject::Indexer
   # this indexer. Returns the output hash (a hash whose keys are
   # string fields, and values are arrays of one or more values in that field)
   #
+  # If the record is marked `skip` as part of processing, this will return
+  # nil.
+  #
   # This is a convenience shortcut for #map_to_context! -- use that one
   # if you want to provide additional context
   # like position, and/or get back the full context.
   def map_record(record)
-    context = Context.new(:source_record => record, :settings => settings)
+    context = Context.new(:source_record => record, :settings => settings, :source_record_id_proc => source_record_id_proc, :logger => logger)
+    map_to_context!(context)
+    return context.output_hash unless context.skip?
+  end
+
+  # Takes a single record, maps it, and sends it to the instance-configured
+  # writer. No threading, no logging, no error handling. Respects skipped
+  # records by not adding them. Returns the Traject::Indexer::Context.
+  #
+  # Aliased as #<<
+  def process_record(record)
+    check_uncompleted
+
+    context = Context.new(:source_record => record, :settings => settings, :source_record_id_proc => source_record_id_proc, :logger => logger)
     map_to_context!(context)
-    return context.output_hash
+    writer.put(context) unless context.skip?
+
+    return context
   end
+  alias_method :<<, :process_record
 
   # Maps a single record INTO the second argument, a Traject::Indexer::Context.
   #
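
Note: #process_record (and its #<< alias) is the single-record, no-threading path to the configured writer. A minimal sketch using the new Traject::ArrayWriter (the record shape and field are illustrative):

    indexer = Traject::Indexer.new("writer_class_name" => "Traject::ArrayWriter") do
      to_field "id", lambda { |record, accumulator| accumulator << record[:id] }
    end

    indexer << { id: "one" }                        # alias for process_record
    context = indexer.process_record({ id: "two" })
    context.output_hash # => {"id" => ["two"]}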
@@ -342,7 +432,7 @@ class Traject::Indexer
 
       # Set the index step for error reporting
       context.index_step = index_step
-      log_mapping_errors(context, index_step) do
+      handle_mapping_errors(context) do
         index_step.execute(context) # will always return [] for an each_record step
       end
 
@@ -353,31 +443,40 @@ class Traject::Indexer
     return context
   end
 
-  # just a wrapper that captures and records any unexpected
-  # errors raised in mapping, along with contextual information
-  # on record and location in source file of mapping rule.
+
+  protected def default_mapping_rescue
+    @default_mapping_rescue ||= lambda do |context, exception|
+      msg = "Unexpected error on record #{context.record_inspect}\n"
+      msg += "    while executing #{context.index_step.inspect}\n"
+
+      msg += begin
+        "\n    Record: #{context.source_record.to_s}\n"
+      rescue StandardError => to_s_exception
+        "\n    (Could not log record, #{to_s_exception})\n"
+      end
+
+      msg += Traject::Util.exception_to_log_message(exception)
+
+      context.logger.error(msg) if context.logger
+
+      raise exception
+    end
+  end
+
+  # just a wrapper that catches any errors, and handles them. By default, logs
+  # and re-raises. But you can set the custom setting `mapping_rescue`
+  # to customize.
   #
-  # Re-raises error at the moment.
   #
-  # log_mapping_errors(context, index_step) do
+  # handle_mapping_errors(context) do
   #   all_sorts_of_stuff # that will have errors logged
   # end
-  def log_mapping_errors(context, index_step)
+  protected def handle_mapping_errors(context)
     begin
       yield
-    rescue Exception => e
-      msg = "Unexpected error on record id `#{context.source_record_id}` at file position #{context.position}\n"
-      msg += "    while executing #{index_step.inspect}\n"
-      msg += Traject::Util.exception_to_log_message(e)
-
-      logger.error msg
-      begin
-        logger.debug "Record: " + context.source_record.to_s
-      rescue Exception => marc_to_s_exception
-        logger.debug "(Could not log record, #{marc_to_s_exception})"
-      end
-
-      raise e
+    rescue StandardError => e
+      error_handler = settings["mapping_rescue"] || default_mapping_rescue
+      error_handler.call(context, e)
     end
   end
 
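Note: because the handler is looked up from settings on every failure, a custom `mapping_rescue` lambda can decide per record whether to log, skip, or re-raise. A minimal sketch that skips failing records instead of aborting the run:

    indexer = Traject::Indexer.new(
      "mapping_rescue" => lambda do |context, exception|
        context.logger.warn("Skipping #{context.record_inspect}: #{exception}") if context.logger
        context.skip!
      end
    )
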
@@ -385,67 +484,80 @@ class Traject::Indexer
   # mapping according to configured mapping rules, and then writing
   # to configured Writer.
   #
+  # You can also give it an _array_ of streams.
+  #
   # returns 'false' as a signal to command line to return non-zero exit code
   # for some reason (reason found in logs, presumably). This particular mechanism
   # is open to complexification, starting simple. We do need SOME way to return
   # non-zero to command line.
   #
-  def process(io_stream)
+  # @param io_stream_or_array [#read, Array<#read>]
+  def process(io_stream_or_array)
+    check_uncompleted
+
     settings.fill_in_defaults!
 
     count = 0
     start_time = batch_start_time = Time.now
-    logger.debug "beginning Indexer#process with settings: #{settings.inspect}"
-
-    reader = self.reader!(io_stream)
+    logger.debug "beginning Traject::Indexer#process with settings: #{settings.inspect}"
 
     processing_threads = settings["processing_thread_pool"].to_i
     thread_pool = Traject::ThreadPool.new(processing_threads)
 
-    logger.info "   Indexer with #{processing_threads} processing threads, reader: #{reader.class.name} and writer: #{writer.class.name}"
+    logger.info "   Traject::Indexer with #{processing_threads} processing threads, reader: #{reader_class.name} and writer: #{writer.class.name}"
 
-    log_batch_size = settings["log.batch_size"] && settings["log.batch_size"].to_i
+    # io_stream can now be an array of io_streams.
+    (io_stream_or_array.kind_of?(Array) ? io_stream_or_array : [io_stream_or_array]).each do |io_stream|
+      reader = self.reader!(io_stream)
+      input_name = Traject::Util.io_name(io_stream)
+      position_in_input = 0
 
-    reader.each do |record; position|
-      count += 1
+      log_batch_size = settings["log.batch_size"] && settings["log.batch_size"].to_i
 
-      # have to use a block local var, so the changing `count` one
-      # doesn't get caught in the closure. Weird, yeah.
-      position = count
+      reader.each do |record; safe_count, safe_position_in_input|
+        count += 1
+        position_in_input += 1
 
-      thread_pool.raise_collected_exception!
+        # have to use block local vars, so the changing `count` one
+        # doesn't get caught in the closure. Don't totally get it, but
+        # I think it's so.
+        safe_count, safe_position_in_input = count, position_in_input
 
-      if settings["debug_ascii_progress"].to_s == "true"
-        $stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
-      end
+        thread_pool.raise_collected_exception!
 
-      context = Context.new(
-        :source_record => record,
-        :settings => settings,
-        :position => position,
-        :logger => logger
-      )
-
-      if log_batch_size && (count % log_batch_size == 0)
-        batch_rps = log_batch_size / (Time.now - batch_start_time)
-        overall_rps = count / (Time.now - start_time)
-        logger.send(settings["log.batch_size.severity"].downcase.to_sym, "Traject::Indexer#process, read #{count} records at id:#{context.source_record_id}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
-        batch_start_time = Time.now
-      end
+        if settings["debug_ascii_progress"].to_s == "true"
+          $stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
+        end
 
-      # We pass context in a block arg to properly 'capture' it, so
-      # we don't accidentally share the local var under closure between
-      # threads.
-      thread_pool.maybe_in_thread_pool(context) do |context|
-        map_to_context!(context)
-        if context.skip?
-          log_skip(context)
-        else
-          writer.put context
+        context = Context.new(
+          :source_record => record,
+          :source_record_id_proc => source_record_id_proc,
+          :settings => settings,
+          :position => safe_count,
+          :input_name => input_name,
+          :position_in_input => safe_position_in_input,
+          :logger => logger
+        )
+
+        if log_batch_size && (count % log_batch_size == 0)
+          batch_rps = log_batch_size / (Time.now - batch_start_time)
+          overall_rps = count / (Time.now - start_time)
+          logger.send(settings["log.batch_size.severity"].downcase.to_sym, "Traject::Indexer#process, read #{count} records at: #{context.source_inspect}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
+          batch_start_time = Time.now
        end
 
+        # We pass context in a block arg to properly 'capture' it, so
+        # we don't accidentally share the local var under closure between
+        # threads.
+        thread_pool.maybe_in_thread_pool(context) do |t_context|
+          map_to_context!(t_context)
+          if t_context.skip?
+            log_skip(t_context)
+          else
+            writer.put t_context
+          end
+        end
       end
-
     end
     $stderr.write "\n" if settings["debug_ascii_progress"].to_s == "true"
 
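Note: #process now accepts either a single IO stream or an array of streams, run in order with a reader per stream but a single shared writer and thread pool. For example (file names hypothetical):

    indexer.process([File.open("batch-1.xml"), File.open("batch-2.xml")])
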
@@ -455,39 +567,156 @@ class Traject::Indexer
 
     thread_pool.raise_collected_exception!
 
+    complete
+
+    elapsed = Time.now - start_time
+    avg_rps = (count / elapsed)
+    logger.info "finished Traject::Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
+
+    if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
+      logger.error "Traject::Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
+      return false
+    end
+
+    return true
+  end
+
+  def completed?
+    @completed
+  end
+
+  # Instance variable readers and writers are not generally re-usable.
+  # The writer may have been closed. The reader does its thing and doesn't
+  # rewind. If we're completed, as a sanity check don't let someone do
+  # something with the indexer that uses the reader or writer and isn't gonna work.
+  protected def check_uncompleted
+    if completed?
+      raise CompletedStateError.new("This Traject::Indexer has been completed, and its reader and writer are not in a usable state")
+    end
+  end
 
+  # Closes the writer (which may flush/save/finalize buffered records),
+  # and calls run_after_processing_steps
+  def complete
     writer.close if writer.respond_to?(:close)
+    run_after_processing_steps
 
+    # after an indexer has been completed, it is not really usable anymore,
+    # as the writer has been closed.
+    @completed = true
+  end
+
+  def run_after_processing_steps
     @after_processing_steps.each do |step|
       begin
         step.execute
-      rescue Exception => e
+      rescue StandardError => e
         logger.fatal("Unexpected exception #{e} when executing #{step}")
         raise e
       end
     end
+  end
 
-    elapsed = Time.now - start_time
-    avg_rps = (count / elapsed)
-    logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
+  # A light-weight process method meant for programmatic use, generally
+  # intended for only a "few" (not millions) of records.
+  #
+  # It does _not_ use instance-configured reader or writer, instead taking
+  # a source/reader and destination/writer as arguments to this call.
+  #
+  # The reader can be anything that has an #each returning source
+  # records. This includes an ordinary array of source records, or any
+  # traject Reader.
+  #
+  # The writer can be anything with a #put method taking a Traject::Indexer::Context.
+  # For convenience, see the Traject::ArrayWriter that just collects output in an array.
+  #
+  # Return value of process_with is the writer passed as second arg, for your convenience.
+  #
+  # This does much less than the full #process method, to be more flexible
+  # and make fewer assumptions:
+  #
+  # * Will never use any additional threads (unless writer does). Wrap in your own threading if desired.
+  # * Will not do any standard logging or progress bars, regardless of indexer settings.
+  #   Log yourself if desired.
+  # * Will _not_ call any `after_processing` steps. Call yourself with `indexer.run_after_processing_steps` as desired.
+  # * WILL by default call #close on the writer, IF the writer has a #close method.
+  #   pass `:close_writer => false` to not do so.
+  # * exceptions will just raise out, unless you pass in a `rescue_with:` option, whose value is a proc/lambda
+  #   that will receive two args, context and exception. If the rescue proc doesn't re-raise,
+  #   `process_with` will continue to process subsequent records.
+  #
+  # @example
+  #     array_writer_instance = indexer.process_with([record1, record2], Traject::ArrayWriter.new)
+  #
+  # @example With a block, in addition to or instead of a writer.
+  #
+  #     indexer.process_with([record]) do |context|
+  #       do_something_with(context.output_hash)
+  #     end
+  #
+  # @param source [#each]
+  # @param destination [#put]
+  # @param close_writer whether the destination should have #close called on it, if it responds to it.
+  # @param rescue_with [Proc] to call on errors, taking two args: a Traject::Indexer::Context and an exception.
+  #   If nil (default), exceptions will be raised out. If set, you can raise or handle otherwise if you like.
+  # @param on_skipped [Proc] will be called for any skipped records, with one arg, a Traject::Indexer::Context
+  def process_with(source, destination = nil, close_writer: true, rescue_with: nil, on_skipped: nil)
+    unless destination || block_given?
+      raise ArgumentError, "Need either a second arg writer/destination, or a block"
+    end
 
-    if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
-      logger.error "Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
-      return false
+    settings.fill_in_defaults!
+
+    position = 0
+    input_name = Traject::Util.io_name(source)
+    source.each do |record|
+      begin
+        position += 1
+
+        context = Context.new(
+          :source_record => record,
+          :source_record_id_proc => source_record_id_proc,
+          :settings => settings,
+          :position => position,
+          :position_in_input => (position if input_name),
+          :logger => logger
+        )
+
+        map_to_context!(context)
+
+        if context.skip?
+          on_skipped.call(context) if on_skipped
+        else
+          destination.put(context) if destination
+          yield(context) if block_given?
+        end
+      rescue StandardError => e
+        if rescue_with
+          rescue_with.call(context, e)
+        else
+          raise e
+        end
+      end
     end
 
-    return true
+    if close_writer && destination.respond_to?(:close)
+      destination.close
+    end
+
+    return destination
   end
 
   # Log that the current record is being skipped, using
   # data in context.position and context.skipmessage
   def log_skip(context)
-    logger.debug "Skipped record #{context.position}: #{context.skipmessage}"
+    logger.debug "Skipped record #{context.record_inspect}: #{context.skipmessage}"
   end
 
   def reader_class
     unless defined? @reader_class
-      @reader_class = qualified_const_get(settings["reader_class_name"])
+      reader_class_name = settings["reader_class_name"]
+
+      @reader_class = qualified_const_get(reader_class_name)
     end
     return @reader_class
   end
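
Note: combining the #process_with options documented above, a sketch that collects output with Traject::ArrayWriter while reporting skips and errors (the `records` source and both procs are illustrative):

    writer = indexer.process_with(
      records,                        # any #each of source records, e.g. an Array
      Traject::ArrayWriter.new,
      on_skipped:  lambda { |context| puts "skipped: #{context.record_inspect}" },
      rescue_with: lambda { |context, error| puts "error on #{context.record_inspect}: #{error}" }
    )
    writer.values # => array of output_hash results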