traject 2.3.4 → 3.0.0.alpha.1

Files changed (69)
  1. checksums.yaml +5 -5
  2. data/.travis.yml +16 -9
  3. data/CHANGES.md +74 -1
  4. data/Gemfile +2 -1
  5. data/README.md +104 -53
  6. data/Rakefile +8 -1
  7. data/doc/indexing_rules.md +79 -63
  8. data/doc/programmatic_use.md +218 -0
  9. data/doc/settings.md +28 -1
  10. data/doc/xml.md +134 -0
  11. data/lib/traject.rb +5 -0
  12. data/lib/traject/array_writer.rb +34 -0
  13. data/lib/traject/command_line.rb +18 -22
  14. data/lib/traject/debug_writer.rb +2 -5
  15. data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
  16. data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
  17. data/lib/traject/indexer.rb +321 -92
  18. data/lib/traject/indexer/context.rb +39 -13
  19. data/lib/traject/indexer/marc_indexer.rb +30 -0
  20. data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
  21. data/lib/traject/indexer/settings.rb +36 -53
  22. data/lib/traject/indexer/step.rb +27 -33
  23. data/lib/traject/macros/marc21.rb +37 -12
  24. data/lib/traject/macros/nokogiri_macros.rb +43 -0
  25. data/lib/traject/macros/transformation.rb +162 -0
  26. data/lib/traject/marc_extractor.rb +2 -0
  27. data/lib/traject/ndj_reader.rb +1 -1
  28. data/lib/traject/nokogiri_reader.rb +179 -0
  29. data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
  30. data/lib/traject/solr_json_writer.rb +19 -12
  31. data/lib/traject/thread_pool.rb +13 -0
  32. data/lib/traject/util.rb +14 -2
  33. data/lib/traject/version.rb +1 -1
  34. data/test/debug_writer_test.rb +3 -3
  35. data/test/delimited_writer_test.rb +3 -3
  36. data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
  37. data/test/indexer/context_test.rb +23 -13
  38. data/test/indexer/error_handler_test.rb +59 -0
  39. data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
  40. data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
  41. data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
  42. data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
  43. data/test/indexer/macros/to_field_test.rb +2 -2
  44. data/test/indexer/macros/transformation_test.rb +177 -0
  45. data/test/indexer/map_record_test.rb +2 -3
  46. data/test/indexer/nokogiri_indexer_test.rb +103 -0
  47. data/test/indexer/process_record_test.rb +55 -0
  48. data/test/indexer/process_with_test.rb +148 -0
  49. data/test/indexer/read_write_test.rb +52 -2
  50. data/test/indexer/settings_test.rb +34 -24
  51. data/test/indexer/to_field_test.rb +27 -2
  52. data/test/marc_extractor_test.rb +7 -7
  53. data/test/marc_reader_test.rb +4 -4
  54. data/test/nokogiri_reader_test.rb +158 -0
  55. data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
  56. data/test/solr_json_writer_test.rb +24 -28
  57. data/test/test_helper.rb +8 -2
  58. data/test/test_support/namespace-test.xml +7 -0
  59. data/test/test_support/nokogiri_demo_config.rb +17 -0
  60. data/test/test_support/oai-pmh-one-record-2.xml +24 -0
  61. data/test/test_support/oai-pmh-one-record-first.xml +24 -0
  62. data/test/test_support/sample-oai-no-namespace.xml +197 -0
  63. data/test/test_support/sample-oai-pmh.xml +197 -0
  64. data/test/thread_pool_test.rb +38 -0
  65. data/test/translation_map_test.rb +3 -3
  66. data/test/translation_maps/ruby_map.rb +2 -1
  67. data/test/translation_maps/yaml_map.yaml +2 -1
  68. data/traject.gemspec +4 -11
  69. metadata +92 -6
data/lib/traject/debug_writer.rb
@@ -40,12 +40,9 @@ class Traject::DebugWriter < Traject::LineWriter
     @idfield = settings["debug_writer.idfield"] || DEFAULT_IDFIELD
     @format = settings['debug_writer.format'] || DEFAULT_FORMAT
 
-    if @idfield == 'record_position' then
-      @use_position = true
-    end
+    @use_position = (@idfield == 'record_position')
 
     @already_threw_warning_about_missing_id = false
-
   end
 
   def record_number(context)
@@ -54,7 +51,7 @@ class Traject::DebugWriter < Traject::LineWriter
       context.output_hash[@idfield].first
     else
       unless @already_threw_warning_about_missing_id
-        context.logger.warn "At least one record (##{context.position}) doesn't define field '#{@idfield}'.
+        context.logger.warn "At least one record (#{context.record_inspect}) doesn't define field '#{@idfield}'.
         All records are assumed to have a unique id. You can set which field to look in via the setting 'debug_writer.idfield'"
         @already_threw_warning_about_missing_id = true
       end
data/lib/traject/experimental_nokogiri_streaming_reader.rb (new file)
@@ -0,0 +1,276 @@
+ module Traject
+   # An EXPERIMENTAL, HALF-FINISHED implementation of a streaming/pull reader using Nokogiri.
+   # Not ready for use, not stable API, could go away.
+   #
+   # This was my first try at a NokogiriReader implementation; it didn't work out, at least without
+   # a lot more work. I think we'd need to re-do it to build the Nokogiri::XML::Nodes by hand as the
+   # source is traversed, instead of relying on #outer_xml -- outer_xml returning a string results in double-parsing,
+   # with the expected 50% performance hit. Peccadillos in Nokogiri JRuby namespace handling don't help.
+   #
+   # All in all, it's possible something could be gotten here with a lot more work; it's also possible
+   # Nokogiri's antipathy to namespaces could keep getting in the way.
+   class ExperimentalNokogiriStreamingReader
+     include Enumerable
+
+     attr_reader :settings, :input_stream, :clipboard, :path_tracker
+
+     def initialize(input_stream, settings)
+       @settings = Traject::Indexer::Settings.new settings
+       @input_stream = input_stream
+       @clipboard = Traject::Util.is_jruby? ? Concurrent::Map.new : Concurrent::Hash.new
+
+       if each_record_xpath
+         @path_tracker = PathTracker.new(each_record_xpath,
+                                         clipboard: self.clipboard,
+                                         namespaces: default_namespaces,
+                                         extra_xpath_hooks: extra_xpath_hooks)
+       end
+
+       default_namespaces # trigger validation
+       validate_limited_xpath(each_record_xpath, key_name: "each_record_xpath")
+     end
+
+     def each_record_xpath
+       @each_record_xpath ||= settings["nokogiri.each_record_xpath"]
+     end
+
+     def extra_xpath_hooks
+       @extra_xpath_hooks ||= begin
+         (settings["nokogiri_reader.extra_xpath_hooks"] || {}).tap do |hash|
+           hash.each_pair do |limited_xpath, callable|
+             validate_limited_xpath(limited_xpath, key_name: "nokogiri_reader.extra_xpath_hooks")
+           end
+         end
+       end
+     end
+
+     protected def validate_limited_xpath(each_record_xpath, key_name:)
+       return unless each_record_xpath
+
+       components = each_record_xpath.split('/')
+       components.each do |component|
+         prefix, element = component.split(':')
+         unless element
+           # there was no namespace
+           prefix, element = nil, prefix
+         end
+
+         # We don't support brackets or any xpath beyond the MOST simple.
+         # Catch a few we can catch.
+         if element =~ /::/ || element =~ /[\[\]]/
+           raise ArgumentError, "#{key_name}: Only very simple xpaths supported. '//some/path' or '/some/path'. Not: #{each_record_xpath.inspect}"
+         end
+
+         if prefix
+           ns_uri = default_namespaces[prefix]
+           if ns_uri.nil?
+             raise ArgumentError, "each_record_xpath: Can't find namespace prefix '#{prefix}' in '#{each_record_xpath}'. To use a namespace in each_record_xpath, it has to be registered with nokogiri.namespaces: #{default_namespaces.inspect}"
+           end
+         end
+       end
+
+       each_record_xpath
+     end
+
+
+     def default_namespaces
+       @default_namespaces ||= (settings["nokogiri.namespaces"] || {}).tap { |ns|
+         unless ns.kind_of?(Hash)
+           raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{ns.inspect}"
+         end
+       }
+     end
+
+     def each
+       unless each_record_xpath
+         # forget streaming, just read it and return it once, done.
+         yield Nokogiri::XML.parse(input_stream)
+         return
+       end
+
+       reader = Nokogiri::XML::Reader(input_stream)
+
+       reader.each do |reader_node|
+         if reader_node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
+           path_tracker.push(reader_node)
+
+           if path_tracker.match?
+             yield path_tracker.current_node_doc
+           end
+           path_tracker.run_extra_xpath_hooks
+
+           if reader_node.self_closing?
+             path_tracker.pop
+           end
+         end
+
+         if reader_node.node_type == Nokogiri::XML::Reader::TYPE_END_ELEMENT
+           path_tracker.pop
+         end
+       end
+     end
+
+     private
+
+     # initialized with the specification (a very small subset of xpath) for
+     # what records to yield-on-each. Tests to see if a Nokogiri::XML::Reader
+     # node matches spec.
+     #
+     #     '//record'
+     # or anchored to root:
+     #     '/body/head/meta' same thing as './body/head/meta' or 'head/meta'
+     #
+     # Elements can (and must, to match) have XML namespaces, if and only if
+     # they are registered with settings nokogiri.namespaces
+     #
+     # sadly JRuby Nokogiri has an incompatibility with true nokogiri, and
+     # doesn't preserve our namespaces on outer_xml,
+     # so in JRuby we have to track them ourselves, and then also do yet ANOTHER
+     # parse in nokogiri. This may make this in Java even LESS performant, I'm afraid.
+     class PathTracker
+       attr_reader :path_spec, :inverted_namespaces, :current_path, :namespaces_stack, :extra_xpath_hooks, :clipboard
+
+       def initialize(str_spec, clipboard:, namespaces: {}, extra_xpath_hooks: {})
+         @inverted_namespaces = namespaces.invert
+         @clipboard = clipboard
+         # We're guessing using a string will be more efficient than an array
+         @current_path = ""
+         @floating = false
+
+         @path_spec, @floating = parse_path(str_spec)
+
+         @namespaces_stack = []
+
+
+         @extra_xpath_hooks = extra_xpath_hooks.collect do |path, callable|
+           bare_path, floating = parse_path(path)
+           {
+             path: bare_path,
+             floating: floating,
+             callable: callable
+           }
+         end
+       end
+
+       # returns [bare_path, is_floating]
+       protected def parse_path(str_spec)
+         floating = false
+
+         if str_spec.start_with?('//')
+           str_spec = str_spec.slice(2..-1)
+           floating = true
+         else
+           str_spec = str_spec.slice(1..-1) if str_spec.start_with?(".")
+           str_spec = "/" + str_spec unless str_spec.start_with?("/")
+         end
+
+         return [str_spec, floating]
+       end
+
+       def is_jruby?
+         Traject::Util.is_jruby?
+       end
+
+       # adds a component to slash-separated current_path, with namespace prefix.
+       def push(reader_node)
+         namespace_prefix = reader_node.namespace_uri && inverted_namespaces[reader_node.namespace_uri]
+
+         # gah, reader_node.name has the namespace prefix in there
+         node_name = reader_node.name.gsub(/[^:]+:/, '')
+
+         node_str = if namespace_prefix
+           namespace_prefix + ":" + node_name
+         else
+           reader_node.name
+         end
+
+         current_path << ("/" + node_str)
+
+         if is_jruby?
+           namespaces_stack << reader_node.namespaces
+         end
+         @current_node = reader_node
+       end
+
+       def current_node_doc
+         return nil unless @current_node
+
+         # yeah, sadly we've got to have nokogiri parse it again
+         fix_namespaces(Nokogiri::XML.parse(@current_node.outer_xml))
+       end
+
+       # removes the last slash-separated component from current_path
+       def pop
+         current_path.slice!( current_path.rindex('/')..-1 )
+         @current_node = nil
+
+         if is_jruby?
+           namespaces_stack.pop
+         end
+       end
+
+       def floating?
+         !!@floating
+       end
+
+       def match?
+         match_path?(path_spec, floating: floating?)
+       end
+
+       def match_path?(path_to_match, floating:)
+         if floating?
+           current_path.end_with?(path_to_match)
+         else
+           current_path == path_to_match
+         end
+       end
+
+       def run_extra_xpath_hooks
+         return unless @current_node
+
+         extra_xpath_hooks.each do |hook_spec|
+           if match_path?(hook_spec[:path], floating: hook_spec[:floating])
+             hook_spec[:callable].call(current_node_doc, clipboard)
+           end
+         end
+       end
+
+       # no-op unless it's jruby, and then we use our namespace stack to
+       # correctly add namespaces to the Nokogiri::XML::Document, cause
+       # in JRuby outer_xml on the Reader doesn't do it for us. :(
+       def fix_namespaces(doc)
+         if is_jruby?
+           # Only needed in jruby; nokogiri's jruby implementation is weird
+           # around namespaces in ways MRI's isn't. We need to keep
+           # track of the namespaces in outer contexts ourselves, and then see
+           # if they are needed ourselves. :(
+           namespaces = namespaces_stack.compact.reduce({}, :merge)
+           default_ns = namespaces.delete("xmlns")
+
+           namespaces.each_pair do |attrib, uri|
+             ns_prefix = attrib.sub(/\Axmlns:/, '')
+
+             # gotta make sure it's actually used in the doc to not add it
+             # unnecessarily. GAH.
+             if doc.xpath("//*[starts-with(name(), '#{ns_prefix}:')][1]").empty? &&
+                doc.xpath("//@*[starts-with(name(), '#{ns_prefix}:')][1]").empty?
+               next
+             end
+             doc.root.add_namespace_definition(ns_prefix, uri)
+           end
+
+           if default_ns
+             doc.root.default_namespace = default_ns
+             # OMG nokogiri, really?
+             default_ns = doc.root.namespace
+             doc.xpath("//*[namespace-uri()='']").each do |node|
+               node.namespace = default_ns
+             end
+           end
+
+         end
+         return doc
+       end
+     end
+   end
+ end
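For orientation, a minimal sketch of driving this experimental reader directly; the file name and namespace URI below are illustrative, not from the gem, and the class itself is explicitly unstable:

    require 'traject'

    io = File.open("records.xml") # hypothetical input; any IO works

    reader = Traject::ExperimentalNokogiriStreamingReader.new(io, {
      # yield one Nokogiri::XML::Document per matching element
      "nokogiri.each_record_xpath" => "//oai:record",
      # namespace prefixes used in the xpath must be registered here
      "nokogiri.namespaces" => { "oai" => "http://www.openarchives.org/OAI/2.0/" }
    })

    reader.each { |record_doc| puts record_doc.root.name }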
data/lib/traject/hashie/indifferent_access_fix.rb (new file)
@@ -0,0 +1,25 @@
+ require 'hashie'
+
+ module Traject
+   module Hashie
+     # Backporting fix from https://github.com/intridea/hashie/commit/a82c594710e1bc9460d3de4d2989cb700f4c3c7f
+     # into Hashie.
+     #
+     # This makes merge(ordinary_hash) on a Hash that has IndifferentAccess included work, without
+     # raising. Which we needed.
+     #
+     # As of this writing that fix is not available in a Hashie release; if it becomes available
+     # later, this monkey-patch may no longer be required and we can just depend on the fixed version.
+     #
+     # See also https://github.com/intridea/hashie/issues/451
+     module IndifferentAccessFix
+       def merge(*args)
+         result = super
+         ::Hashie::Extensions::IndifferentAccess.inject!(result) if hash_lacking_indifference?(result)
+         result.convert!
+       end
+     end
+   end
+ end
+ Hashie::Extensions::IndifferentAccess.include(Traject::Hashie::IndifferentAccessFix)
+
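A quick sketch of the behavior the backport protects; the IndifferentHash class here is illustrative, built from standard Hashie extensions:

    require 'traject/hashie/indifferent_access_fix'

    class IndifferentHash < Hashie::Hash
      include Hashie::Extensions::MergeInitializer
      include Hashie::Extensions::IndifferentAccess
    end

    h = IndifferentHash.new("a" => 1)
    merged = h.merge(b: 2)  # merging a plain Hash should no longer raise
    merged[:a]              # => 1, indifference preserved
    merged["b"]             # => 2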
data/lib/traject/indexer.rb
@@ -11,14 +11,14 @@ require 'traject/marc_reader'
 require 'traject/json_writer'
 require 'traject/solr_json_writer'
 require 'traject/debug_writer'
-
+require 'traject/array_writer'
 
 require 'traject/macros/marc21'
 require 'traject/macros/basic'
+require 'traject/macros/transformation'
+
+require 'traject/indexer/marc_indexer'
 
-if defined? JRUBY_VERSION
-  require 'traject/marc4j_reader'
-end
 
 # This class does indexing for traject: Getting input records from a Reader
 # class, mapping the input records to an output hash, and then sending the output
@@ -157,33 +157,39 @@ end
 # inconvenient for you, we'd like to know your use case and improve things.
 #
 class Traject::Indexer
-
-  # Arity error on a passed block
-  class ArityError < ArgumentError;
-  end
-  class NamingError < ArgumentError;
-  end
-
+  CompletedStateError = Class.new(StandardError)
+  ArityError = Class.new(ArgumentError)
+  NamingError = Class.new(ArgumentError)
 
   include Traject::QualifiedConstGet
+  extend Traject::QualifiedConstGet
 
   attr_writer :reader_class, :writer_class, :writer
 
-  # For now we hard-code these basic macros included.
-  # TODO: make these added with extend per-indexer,
-  # added by default but easily turned off (or have other
-  # default macro modules provided)
-  include Traject::Macros::Marc21
   include Traject::Macros::Basic
+  include Traject::Macros::Transformation
 
 
   # optional hash or Traject::Indexer::Settings object of settings.
-  def initialize(arg_settings = {})
-    @settings = Settings.new(arg_settings)
+  # optionally takes a block which is instance_eval'd in the indexer,
+  # intended for configuration similar to what would be in a config file.
+  def initialize(arg_settings = {}, &block)
+    @writer_class = nil
+    @completed = false
+    @settings = Settings.new(arg_settings).with_defaults(self.class.default_settings)
     @index_steps = []
     @after_processing_steps = []
+
+    instance_eval(&block) if block
+  end
+
+  # Right now just does an `instance_eval`, but encouraged in case we change the underlying
+  # implementation later, and to make intent more clear.
+  def configure(&block)
+    instance_eval(&block)
   end
 
+
   # Pass a string file path, a Pathname, or a File object, for
   # a config file to load into indexer.
   #
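Taken together, the new constructor block and #configure make config-file-style setup available programmatically; a small sketch, with the settings and field logic illustrative:

    indexer = Traject::Indexer.new("solr.url" => "http://localhost:8983/solr/core") do
      to_field "id", lambda { |record, accumulator| accumulator << record["id"] }
    end

    # equivalent after-the-fact configuration:
    indexer.configure do
      to_field "title", lambda { |record, accumulator| accumulator << record["title"] }
    end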
@@ -234,16 +240,81 @@ class Traject::Indexer
   def settings(new_settings = nil, &block)
     @settings.merge!(new_settings) if new_settings
 
-    @settings.instance_eval &block if block_given?
+    @settings.instance_eval(&block) if block_given?
 
     return @settings
   end
 
+  # Hash is frozen to avoid inheritance-mutability confusion.
+  def self.default_settings
+    @default_settings ||= {
+      # Writer defaults
+      "writer_class_name" => "Traject::SolrJsonWriter",
+      "solr_writer.batch_size" => 100,
+      "solr_writer.thread_pool" => 1,
+
+      # Threading and logging
+      "processing_thread_pool" => Traject::Indexer::Settings.default_processing_thread_pool,
+      "log.batch_size.severity" => "info",
+
+      # how to post-process the accumulator
+      "allow_nil_values" => false,
+      "allow_duplicate_values" => true,
+
+      "allow_empty_fields" => false
+    }.freeze
+  end
+
+
+  # Not sure if allowing changing of default_settings is a good idea, but we do
+  # use it in test. For now we make it private to require extreme measures to do it,
+  # and advertise that this API could go away or change without a major version release;
+  # it is experimental and internal.
+  private_class_method def self.default_settings=(settings)
+    @default_settings = settings
+  end
+
+  # Sub-classes should override to return a _proc_ object that takes one arg,
+  # a source record, and returns an identifier for it that can be used in
+  # logged messages. This differs depending on input record format, which is why we
+  # leave it to sub-classes.
+  def source_record_id_proc
+    if defined?(@@legacy_marc_mode) && @@legacy_marc_mode
+      return @source_record_id_proc ||= lambda do |source_marc_record|
+        if ( source_marc_record &&
+             source_marc_record.kind_of?(MARC::Record) &&
+             source_marc_record['001'] )
+          source_marc_record['001'].value
+        end
+      end
+    end
+
+    @source_record_id_proc ||= lambda { |source| nil }
+  end
+
+  def self.legacy_marc_mode!
+    @@legacy_marc_mode = true
+    # include legacy Marc macros
+    include Traject::Macros::Marc21
+
+    # Reader defaults
+    legacy_settings = {
+      "reader_class_name" => "Traject::MarcReader",
+      "marc_source.type" => "binary",
+    }
+
+    default_settings.merge!(legacy_settings)
+
+    self
+  end
+
   # Part of DSL, used to define an indexing mapping. Register logic
   # to be called for each record, and generate values for a particular
-  # output field.
-  def to_field(field_name, aLambda = nil, &block)
-    @index_steps << ToFieldStep.new(field_name, aLambda, block, Traject::Util.extract_caller_location(caller.first))
+  # output field. The first field_name argument can be a single string, or
+  # an array of multiple strings -- in the latter case, the processed values
+  # will be added to each field mentioned.
+  def to_field(field_name, *procs, &block)
+    @index_steps << ToFieldStep.new(field_name, procs, block, Traject::Util.extract_caller_location(caller.first))
   end
 
   # Part of DSL, register logic to be called for each record
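A sketch of what the widened to_field signature allows (field names and lambdas illustrative): multiple target fields plus a chain of procs applied in order:

    indexer.configure do
      to_field ["title_display", "title_sort"],
               lambda { |record, accumulator| accumulator << record["title"] },
               lambda { |record, accumulator| accumulator.collect!(&:strip) }
    end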
@@ -313,14 +384,33 @@ class Traject::Indexer
   # this indexer. Returns the output hash (a hash whose keys are
   # string fields, and values are arrays of one or more values in that field)
   #
+  # If the record is marked `skip` as part of processing, this will return
+  # nil.
+  #
   # This is a convenience shortcut for #map_to_context! -- use that one
   # if you want to provide additional context
   # like position, and/or get back the full context.
   def map_record(record)
-    context = Context.new(:source_record => record, :settings => settings)
+    context = Context.new(:source_record => record, :settings => settings, :source_record_id_proc => source_record_id_proc, :logger => logger)
+    map_to_context!(context)
+    return context.output_hash unless context.skip?
+  end
+
+  # Takes a single record, maps it, and sends it to the instance-configured
+  # writer. No threading, no logging, no error handling. Respects skipped
+  # records by not adding them. Returns the Traject::Indexer::Context.
+  #
+  # Aliased as #<<
+  def process_record(record)
+    check_uncompleted
+
+    context = Context.new(:source_record => record, :settings => settings, :source_record_id_proc => source_record_id_proc, :logger => logger)
     map_to_context!(context)
-    return context.output_hash
+    writer.put( context ) unless context.skip?
+
+    return context
   end
+  alias_method :<<, :process_record
 
   # Maps a single record INTO the second argument, a Traject::Indexer::Context.
   #
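A sketch of the new one-record-at-a-time entry point, using hash records and the new ArrayWriter for illustration:

    indexer = Traject::Indexer.new("writer_class_name" => "Traject::ArrayWriter") do
      to_field "id", lambda { |record, accumulator| accumulator << record["id"] }
    end

    indexer << { "id" => "1" }                     # alias for process_record
    context = indexer.process_record("id" => "2")  # returns the Context
    context.output_hash                            # => {"id" => ["2"]}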
@@ -342,7 +432,7 @@ class Traject::Indexer
 
     # Set the index step for error reporting
     context.index_step = index_step
-    log_mapping_errors(context, index_step) do
+    handle_mapping_errors(context) do
       index_step.execute(context) # will always return [] for an each_record step
     end
 
@@ -353,31 +443,40 @@ class Traject::Indexer
     return context
   end
 
-  # just a wrapper that captures and records any unexpected
-  # errors raised in mapping, along with contextual information
-  # on record and location in source file of mapping rule.
+
+  protected def default_mapping_rescue
+    @default_mapping_rescue ||= lambda do |context, exception|
+      msg = "Unexpected error on record #{context.record_inspect}\n"
+      msg += "    while executing #{context.index_step.inspect}\n"
+
+      msg += begin
+        "\n    Record: #{context.source_record.to_s}\n"
+      rescue StandardError => to_s_exception
+        "\n    (Could not log record, #{to_s_exception})\n"
+      end
+
+      msg += Traject::Util.exception_to_log_message(exception)
+
+      context.logger.error(msg) if context.logger
+
+      raise exception
+    end
+  end
+
+  # just a wrapper that catches any errors, and handles them. By default, logs
+  # and re-raises. But you can set the custom setting `mapping_rescue`
+  # to customize.
   #
-  # Re-raises error at the moment.
   #
-  #     log_mapping_errors(context, index_step) do
+  #     handle_mapping_errors(context, index_step) do
   #       all_sorts_of_stuff # that will have errors logged
   #     end
-  def log_mapping_errors(context, index_step)
+  protected def handle_mapping_errors(context)
     begin
       yield
-    rescue Exception => e
-      msg = "Unexpected error on record id `#{context.source_record_id}` at file position #{context.position}\n"
-      msg += "    while executing #{index_step.inspect}\n"
-      msg += Traject::Util.exception_to_log_message(e)
-
-      logger.error msg
-      begin
-        logger.debug "Record: " + context.source_record.to_s
-      rescue Exception => marc_to_s_exception
-        logger.debug "(Could not log record, #{marc_to_s_exception})"
-      end
-
-      raise e
+    rescue StandardError => e
+      error_handler = settings["mapping_rescue"] || default_mapping_rescue
+      error_handler.call(context, e)
    end
   end
 
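So a custom `mapping_rescue` that, say, logs and skips bad records instead of aborting the run might be sketched like this (the handler body is illustrative; skip! is existing Context API):

    indexer = Traject::Indexer.new(
      "mapping_rescue" => lambda do |context, exception|
        context.logger.warn("Skipping #{context.record_inspect}: #{exception}") if context.logger
        context.skip!
      end
    )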
@@ -385,67 +484,80 @@ class Traject::Indexer
   # mapping according to configured mapping rules, and then writing
   # to configured Writer.
   #
+  # You can instead give it an _array_ of streams, as well.
+  #
   # returns 'false' as a signal to command line to return non-zero exit code
   # for some reason (reason found in logs, presumably). This particular mechanism
   # is open to complexification, starting simple. We do need SOME way to return
   # non-zero to command line.
   #
-  def process(io_stream)
+  # @param [#read, Array<#read>]
+  def process(io_stream_or_array)
+    check_uncompleted
+
     settings.fill_in_defaults!
 
     count = 0
     start_time = batch_start_time = Time.now
-    logger.debug "beginning Indexer#process with settings: #{settings.inspect}"
-
-    reader = self.reader!(io_stream)
+    logger.debug "beginning Traject::Indexer#process with settings: #{settings.inspect}"
 
     processing_threads = settings["processing_thread_pool"].to_i
     thread_pool = Traject::ThreadPool.new(processing_threads)
 
-    logger.info "   Indexer with #{processing_threads} processing threads, reader: #{reader.class.name} and writer: #{writer.class.name}"
+    logger.info "   Traject::Indexer with #{processing_threads} processing threads, reader: #{reader_class.name} and writer: #{writer.class.name}"
 
-    log_batch_size = settings["log.batch_size"] && settings["log.batch_size"].to_i
+    # io_stream can now be an array of io_streams.
+    (io_stream_or_array.kind_of?(Array) ? io_stream_or_array : [io_stream_or_array]).each do |io_stream|
+      reader = self.reader!(io_stream)
+      input_name = Traject::Util.io_name(io_stream)
+      position_in_input = 0
 
-    reader.each do |record; position |
-      count += 1
+      log_batch_size = settings["log.batch_size"] && settings["log.batch_size"].to_i
 
-      # have to use a block local var, so the changing `count` one
-      # doesn't get caught in the closure. Weird, yeah.
-      position = count
+      reader.each do |record; safe_count, safe_position_in_input |
+        count += 1
+        position_in_input += 1
 
-      thread_pool.raise_collected_exception!
+        # have to use block local vars, so the changing `count` one
+        # doesn't get caught in the closure. Don't totally get it, but
+        # I think it's so.
+        safe_count, safe_position_in_input = count, position_in_input
 
-      if settings["debug_ascii_progress"].to_s == "true"
-        $stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
-      end
+        thread_pool.raise_collected_exception!
 
-      context = Context.new(
-        :source_record => record,
-        :settings => settings,
-        :position => position,
-        :logger => logger
-      )
-
-      if log_batch_size && (count % log_batch_size == 0)
-        batch_rps = log_batch_size / (Time.now - batch_start_time)
-        overall_rps = count / (Time.now - start_time)
-        logger.send(settings["log.batch_size.severity"].downcase.to_sym, "Traject::Indexer#process, read #{count} records at id:#{context.source_record_id}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
-        batch_start_time = Time.now
-      end
+        if settings["debug_ascii_progress"].to_s == "true"
+          $stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
+        end
 
-      # We pass context in a block arg to properly 'capture' it, so
-      # we don't accidentally share the local var under closure between
-      # threads.
-      thread_pool.maybe_in_thread_pool(context) do |context|
-        map_to_context!(context)
-        if context.skip?
-          log_skip(context)
-        else
-          writer.put context
+        context = Context.new(
+          :source_record => record,
+          :source_record_id_proc => source_record_id_proc,
+          :settings => settings,
+          :position => safe_count,
+          :input_name => input_name,
+          :position_in_input => safe_position_in_input,
+          :logger => logger
+        )
+
+        if log_batch_size && (count % log_batch_size == 0)
+          batch_rps = log_batch_size / (Time.now - batch_start_time)
+          overall_rps = count / (Time.now - start_time)
+          logger.send(settings["log.batch_size.severity"].downcase.to_sym, "Traject::Indexer#process, read #{count} records at: #{context.source_inspect}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
+          batch_start_time = Time.now
        end
 
+        # We pass context in a block arg to properly 'capture' it, so
+        # we don't accidentally share the local var under closure between
+        # threads.
+        thread_pool.maybe_in_thread_pool(context) do |t_context|
+          map_to_context!(t_context)
+          if context.skip?
+            log_skip(t_context)
+          else
+            writer.put t_context
+          end
+        end
       end
-
     end
     $stderr.write "\n" if settings["debug_ascii_progress"].to_s == "true"
 
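With the array form, one #process run can work through several inputs in sequence; a sketch, with the file names illustrative:

    indexer.process([File.open("batch-1.xml"), File.open("batch-2.xml")])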
@@ -455,39 +567,156 @@ class Traject::Indexer
 
     thread_pool.raise_collected_exception!
 
+    complete
+
+    elapsed = Time.now - start_time
+    avg_rps = (count / elapsed)
+    logger.info "finished Traject::Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
+
+    if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
+      logger.error "Traject::Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
+      return false
+    end
+
+    return true
+  end
+
+  def completed?
+    @completed
+  end
+
+  # Instance variable readers and writers are not generally re-usable.
+  # The writer may have been closed. The reader does its thing and doesn't
+  # rewind. If we're completed, as a sanity check don't let someone do
+  # something with the indexer that uses the reader or writer and isn't gonna work.
+  protected def check_uncompleted
+    if completed?
+      raise CompletedStateError.new("This Traject::Indexer has been completed, and its reader and writer are not in a usable state")
+    end
+  end
 
+  # Closes the writer (which may flush/save/finalize buffered records),
+  # and calls run_after_processing_steps
+  def complete
     writer.close if writer.respond_to?(:close)
+    run_after_processing_steps
 
+    # after an indexer has been completed, it is not really usable anymore,
+    # as the writer has been closed.
+    @completed = true
+  end
+
+  def run_after_processing_steps
     @after_processing_steps.each do |step|
       begin
         step.execute
-      rescue Exception => e
+      rescue StandardError => e
         logger.fatal("Unexpected exception #{e} when executing #{step}")
         raise e
       end
     end
+  end
 
-    elapsed = Time.now - start_time
-    avg_rps = (count / elapsed)
-    logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
+  # A light-weight process method meant for programmatic use, generally
+  # intended for only a "few" (not millions) of records.
+  #
+  # It does _not_ use the instance-configured reader or writer, instead taking
+  # a source/reader and destination/writer as arguments to this call.
+  #
+  # The reader can be anything that has an #each returning source
+  # records. This includes an ordinary array of source records, or any
+  # traject Reader.
+  #
+  # The writer can be anything with a #put method taking a Traject::Indexer::Context.
+  # For convenience, see the Traject::ArrayWriter that just collects output in an array.
+  #
+  # Return value of process_with is the writer passed as second arg, for your convenience.
+  #
+  # This does much less than the full #process method, to be more flexible
+  # and make fewer assumptions:
+  #
+  # * Will never use any additional threads (unless the writer does). Wrap in your own threading if desired.
+  # * Will not do any standard logging or progress bars, regardless of indexer settings.
+  #   Log yourself if desired.
+  # * Will _not_ call any `after_processing` steps. Call yourself with `indexer.run_after_processing_steps` as desired.
+  # * WILL by default call #close on the writer, IF the writer has a #close method.
+  #   Pass `:close_writer => false` to not do so.
+  # * Exceptions will just raise out, unless you pass in a rescue: option; its value is a proc/lambda
+  #   that will receive two args, context and exception. If the rescue proc doesn't re-raise,
+  #   `process_with` will continue to process subsequent records.
+  #
+  # @example
+  #     array_writer_instance = indexer.process_with([record1, record2], Traject::ArrayWriter.new)
+  #
+  # @example With a block, in addition to or instead of a writer.
+  #
+  #     indexer.process_with([record]) do |context|
+  #       do_something_with(context.output_hash)
+  #     end
+  #
+  # @param source [#each]
+  # @param destination [#put]
+  # @param close_writer whether the destination should have #close called on it, if it responds to.
+  # @param rescue_with [Proc] to call on errors, taking two args: a Traject::Indexer::Context and an exception.
+  #   If nil (default), exceptions will be raised out. If set, you can raise or handle otherwise if you like.
+  # @param on_skipped [Proc] will be called for any skipped records, with one arg Traject::Indexer::Context
+  def process_with(source, destination = nil, close_writer: true, rescue_with: nil, on_skipped: nil)
+    unless destination || block_given?
+      raise ArgumentError, "Need either a second arg writer/destination, or a block"
+    end
 
-    if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
-      logger.error "Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
-      return false
+    settings.fill_in_defaults!
+
+    position = 0
+    input_name = Traject::Util.io_name(source)
+    source.each do |record|
+      begin
+        position += 1
+
+        context = Context.new(
+          :source_record => record,
+          :source_record_id_proc => source_record_id_proc,
+          :settings => settings,
+          :position => position,
+          :position_in_input => (position if input_name),
+          :logger => logger
+        )
+
+        map_to_context!(context)
+
+        if context.skip?
+          on_skipped.call(context) if on_skipped
+        else
+          destination.put(context) if destination
+          yield(context) if block_given?
+        end
+      rescue StandardError => e
+        if rescue_with
+          rescue_with.call(context, e)
+        else
+          raise e
+        end
+      end
    end
 
-    return true
+    if close_writer && destination.respond_to?(:close)
+      destination.close
+    end
+
+    return destination
   end
 
   # Log that the current record is being skipped, using
   # data in context.position and context.skipmessage
   def log_skip(context)
-    logger.debug "Skipped record #{context.position}: #{context.skipmessage}"
+    logger.debug "Skipped record #{context.record_inspect}: #{context.skipmessage}"
   end
 
   def reader_class
     unless defined? @reader_class
-      @reader_class = qualified_const_get(settings["reader_class_name"])
+      reader_class_name = settings["reader_class_name"]
+
+      @reader_class = qualified_const_get(reader_class_name)
     end
     return @reader_class
   end