traject 2.3.4 → 3.0.0.alpha.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +16 -9
- data/CHANGES.md +74 -1
- data/Gemfile +2 -1
- data/README.md +104 -53
- data/Rakefile +8 -1
- data/doc/indexing_rules.md +79 -63
- data/doc/programmatic_use.md +218 -0
- data/doc/settings.md +28 -1
- data/doc/xml.md +134 -0
- data/lib/traject.rb +5 -0
- data/lib/traject/array_writer.rb +34 -0
- data/lib/traject/command_line.rb +18 -22
- data/lib/traject/debug_writer.rb +2 -5
- data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
- data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
- data/lib/traject/indexer.rb +321 -92
- data/lib/traject/indexer/context.rb +39 -13
- data/lib/traject/indexer/marc_indexer.rb +30 -0
- data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
- data/lib/traject/indexer/settings.rb +36 -53
- data/lib/traject/indexer/step.rb +27 -33
- data/lib/traject/macros/marc21.rb +37 -12
- data/lib/traject/macros/nokogiri_macros.rb +43 -0
- data/lib/traject/macros/transformation.rb +162 -0
- data/lib/traject/marc_extractor.rb +2 -0
- data/lib/traject/ndj_reader.rb +1 -1
- data/lib/traject/nokogiri_reader.rb +179 -0
- data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
- data/lib/traject/solr_json_writer.rb +19 -12
- data/lib/traject/thread_pool.rb +13 -0
- data/lib/traject/util.rb +14 -2
- data/lib/traject/version.rb +1 -1
- data/test/debug_writer_test.rb +3 -3
- data/test/delimited_writer_test.rb +3 -3
- data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
- data/test/indexer/context_test.rb +23 -13
- data/test/indexer/error_handler_test.rb +59 -0
- data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
- data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
- data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
- data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
- data/test/indexer/macros/to_field_test.rb +2 -2
- data/test/indexer/macros/transformation_test.rb +177 -0
- data/test/indexer/map_record_test.rb +2 -3
- data/test/indexer/nokogiri_indexer_test.rb +103 -0
- data/test/indexer/process_record_test.rb +55 -0
- data/test/indexer/process_with_test.rb +148 -0
- data/test/indexer/read_write_test.rb +52 -2
- data/test/indexer/settings_test.rb +34 -24
- data/test/indexer/to_field_test.rb +27 -2
- data/test/marc_extractor_test.rb +7 -7
- data/test/marc_reader_test.rb +4 -4
- data/test/nokogiri_reader_test.rb +158 -0
- data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
- data/test/solr_json_writer_test.rb +24 -28
- data/test/test_helper.rb +8 -2
- data/test/test_support/namespace-test.xml +7 -0
- data/test/test_support/nokogiri_demo_config.rb +17 -0
- data/test/test_support/oai-pmh-one-record-2.xml +24 -0
- data/test/test_support/oai-pmh-one-record-first.xml +24 -0
- data/test/test_support/sample-oai-no-namespace.xml +197 -0
- data/test/test_support/sample-oai-pmh.xml +197 -0
- data/test/thread_pool_test.rb +38 -0
- data/test/translation_map_test.rb +3 -3
- data/test/translation_maps/ruby_map.rb +2 -1
- data/test/translation_maps/yaml_map.yaml +2 -1
- data/traject.gemspec +4 -11
- metadata +92 -6
data/lib/traject/debug_writer.rb
CHANGED
@@ -40,12 +40,9 @@ class Traject::DebugWriter < Traject::LineWriter
|
|
40
40
|
@idfield = settings["debug_writer.idfield"] || DEFAULT_IDFIELD
|
41
41
|
@format = settings['debug_writer.format'] || DEFAULT_FORMAT
|
42
42
|
|
43
|
-
|
44
|
-
@use_position = true
|
45
|
-
end
|
43
|
+
@use_position = (@idfield == 'record_position')
|
46
44
|
|
47
45
|
@already_threw_warning_about_missing_id = false
|
48
|
-
|
49
46
|
end
|
50
47
|
|
51
48
|
def record_number(context)
|
@@ -54,7 +51,7 @@ class Traject::DebugWriter < Traject::LineWriter
|
|
54
51
|
context.output_hash[@idfield].first
|
55
52
|
else
|
56
53
|
unless @already_threw_warning_about_missing_id
|
57
|
-
context.logger.warn "At least one record (
|
54
|
+
context.logger.warn "At least one record (#{context.record_inspect}) doesn't define field '#{@idfield}'.
|
58
55
|
All records are assumed to have a unique id. You can set which field to look in via the setting 'debug_writer.idfield'"
|
59
56
|
@already_threw_warning_about_missing_id = true
|
60
57
|
end
|
@@ -0,0 +1,276 @@
|
|
1
|
+
module Traject
  # An EXPERIMENTAL HALF-FINISHED implementation of a streaming/pull reader using Nokogiri.
  # Not ready for use, not stable API, could go away.
  #
  # This was my first try at a NokogiriReader implementation, it didn't work out, at least without
  # a lot more work. I think we'd need to re-do it to build the Nokogiri::XML::Nodes by hand as the
  # source is traversed, instead of relying on #outer_xml -- outer_xml returning a string results in
  # a double-parsing, with the expected 50% performance hit. Peccadilloes in Nokogiri JRuby namespace
  # handling don't help.
  #
  # All in all, it's possible something could be gotten here with a lot more work, it's also possible
  # Nokogiri's antipathy to namespaces could keep getting in the way.
  #
  # Settings read by this class:
  #
  # * "nokogiri.each_record_xpath": limited xpath selecting the elements yielded by #each.
  #   When not set, the whole input is parsed and yielded once.
  # * "nokogiri.namespaces": hash of prefix => namespace URI usable in the limited xpaths.
  # * "nokogiri_reader.extra_xpath_hooks": hash of limited xpath => callable; each callable is
  #   called with (document, clipboard) for every matching element.
  class ExperimentalNokogiriStreamingReader
    include Enumerable

    attr_reader :settings, :input_stream, :clipboard, :path_tracker

    # @param input_stream the XML source handed to Nokogiri
    # @param settings settings hash, wrapped in a Traject::Indexer::Settings
    # @raise [ArgumentError] if a configured xpath is beyond the supported subset, uses an
    #   unregistered namespace prefix, or nokogiri.namespaces is not a Hash
    def initialize(input_stream, settings)
      @settings     = Traject::Indexer::Settings.new settings
      @input_stream = input_stream
      @clipboard    = Traject::Util.is_jruby? ? Concurrent::Map.new : Concurrent::Hash.new

      if each_record_xpath
        @path_tracker = PathTracker.new(each_record_xpath,
                                        clipboard: self.clipboard,
                                        namespaces: default_namespaces,
                                        extra_xpath_hooks: extra_xpath_hooks)
      end

      default_namespaces # trigger validation
      validate_limited_xpath(each_record_xpath, key_name: "each_record_xpath")
    end

    # The limited xpath from settings that selects which elements are yielded, or nil.
    def each_record_xpath
      @each_record_xpath ||= settings["nokogiri.each_record_xpath"]
    end

    # Hash of limited xpath => callable from settings (empty hash when unset). Every key is
    # validated the first time this is called.
    def extra_xpath_hooks
      @extra_xpath_hooks ||= begin
        (settings["nokogiri_reader.extra_xpath_hooks"] || {}).tap do |hash|
          hash.each_key do |limited_xpath|
            validate_limited_xpath(limited_xpath, key_name: "nokogiri_reader.extra_xpath_hooks")
          end
        end
      end
    end

    # Checks that a path stays within the tiny xpath subset this reader understands.
    #
    # @param each_record_xpath [String, nil] the path to check; nil is accepted and ignored
    # @param key_name [String] name of the setting being validated, used in error messages
    # @return [String, nil] the path that was passed in
    # @raise [ArgumentError] on axes ("::"), brackets, or an unregistered namespace prefix
    protected def validate_limited_xpath(each_record_xpath, key_name:)
      return unless each_record_xpath

      each_record_xpath.split('/').each do |component|
        # We don't support brackets or any xpath beyond the MOST simple.
        # Catch a few we can catch. The whole component is tested, because
        # splitting on ':' first would swallow the '::' of an axis specifier.
        if component =~ /::|[\[\]]/
          raise ArgumentError, "#{key_name}: Only very simple xpaths supported. '//some/path' or '/some/path'. Not: #{each_record_xpath.inspect}"
        end

        prefix, element = component.split(':')
        # without a second part there was no namespace prefix, nothing more to check
        next unless element

        if default_namespaces[prefix].nil?
          raise ArgumentError, "#{key_name}: Can't find namespace prefix '#{prefix}' in '#{each_record_xpath}'. To use a namespace in #{key_name}, it has to be registered with nokogiri.namespaces: #{default_namespaces.inspect}"
        end
      end

      each_record_xpath
    end

    # Registered namespaces (prefix => URI) from settings, defaulting to an empty hash.
    #
    # @raise [ArgumentError] if the setting is present but not a Hash
    def default_namespaces
      @default_namespaces ||= (settings["nokogiri.namespaces"] || {}).tap { |ns|
        unless ns.kind_of?(Hash)
          raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{ns.inspect}"
        end
      }
    end

    # Yields one Nokogiri document per element matching each_record_xpath, running any
    # extra xpath hooks along the way. Without each_record_xpath the whole input is parsed
    # and yielded once. Returns an Enumerator when called without a block.
    def each
      return enum_for(:each) unless block_given?

      unless each_record_xpath
        # forget streaming, just read it and return it once, done.
        yield Nokogiri::XML.parse(input_stream)
        return
      end

      reader = Nokogiri::XML::Reader(input_stream)

      reader.each do |reader_node|
        if reader_node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
          path_tracker.push(reader_node)

          if path_tracker.match?
            yield path_tracker.current_node_doc
          end
          path_tracker.run_extra_xpath_hooks

          # a self-closing element is popped right here, as the code expects
          # no separate end-element node for it
          if reader_node.self_closing?
            path_tracker.pop
          end
        end

        if reader_node.node_type == Nokogiri::XML::Reader::TYPE_END_ELEMENT
          path_tracker.pop
        end
      end
    end

    private

    # initialized with the specification (a very small subset of xpath) for
    # what records to yield-on-each. Tests to see if a Nokogiri::XML::Reader
    # node matches spec.
    #
    #  '//record'
    # or anchored to root:
    #  '/body/head/meta' same thing as './body/head/meta' or 'head/meta'
    #
    # Elements can (and must, to match) have XML namespaces, if and only if
    # they are registered with settings nokogiri.namespaces
    #
    # sadly JRuby Nokogiri has an incompatibility with true nokogiri, and
    # doesn't preserve our namespaces on outer_xml,
    # so in JRuby we have to track them ourselves, and then also do yet ANOTHER
    # parse in nokogiri. This may make this in Java even LESS performant, I'm afraid.
    class PathTracker
      attr_reader :path_spec, :inverted_namespaces, :current_path, :namespaces_stack, :extra_xpath_hooks, :clipboard

      # @param str_spec [String] limited xpath for the elements to match
      # @param clipboard object handed to every hook callable as its second argument
      # @param namespaces [Hash] prefix => namespace URI
      # @param extra_xpath_hooks [Hash] limited xpath => callable
      def initialize(str_spec, clipboard:, namespaces: {}, extra_xpath_hooks: {})
        @inverted_namespaces = namespaces.invert
        @clipboard           = clipboard
        # We're guessing using a string will be more efficient than an array
        @current_path        = ""

        @path_spec, @floating = parse_path(str_spec)

        @namespaces_stack = []

        @extra_xpath_hooks = extra_xpath_hooks.map do |path, callable|
          bare_path, floating = parse_path(path)
          {
            path:     bare_path,
            floating: floating,
            callable: callable
          }
        end
      end

      # returns [bare_path, is_floating]
      #
      # A leading '//' makes the path floating and is stripped; any other path is
      # normalized to start with a single '/'.
      protected def parse_path(str_spec)
        floating = false

        if str_spec.start_with?('//')
          str_spec = str_spec.slice(2..-1)
          floating = true
        else
          str_spec = str_spec.slice(1..-1) if str_spec.start_with?(".")
          str_spec = "/" + str_spec unless str_spec.start_with?("/")
        end

        return [str_spec, floating]
      end

      def is_jruby?
        Traject::Util.is_jruby?
      end

      # adds a component to slash-separated current_path, with namespace prefix.
      def push(reader_node)
        namespace_prefix = reader_node.namespace_uri && inverted_namespaces[reader_node.namespace_uri]

        # gah, reader_node.name has the namespace prefix in there
        node_name = reader_node.name.gsub(/[^:]+:/, '')

        node_str = if namespace_prefix
          namespace_prefix + ":" + node_name
        else
          reader_node.name
        end

        current_path << ("/" + node_str)

        if is_jruby?
          namespaces_stack << reader_node.namespaces
        end
        @current_node = reader_node
      end

      # The current element re-parsed as its own Nokogiri document, or nil when
      # there is no current element.
      def current_node_doc
        return nil unless @current_node

        # yeah, sadly we got to have nokogiri parse it again
        fix_namespaces(Nokogiri::XML.parse(@current_node.outer_xml))
      end

      # removes the last slash-separated component from current_path
      def pop
        current_path.slice!( current_path.rindex('/')..-1 )
        @current_node = nil

        if is_jruby?
          namespaces_stack.pop
        end
      end

      # true when the record path spec was given with a leading '//'
      def floating?
        !!@floating
      end

      # Does the current path match the record path spec?
      def match?
        match_path?(path_spec, floating: floating?)
      end

      # Does the current path match +path_to_match+?
      #
      # @param path_to_match [String] bare path, as returned by parse_path
      # @param floating [Boolean] true to match as a suffix of the current path,
      #   false to require the whole current path to be equal
      def match_path?(path_to_match, floating:)
        # Honor the argument rather than the tracker's own #floating?, so a hook
        # keeps its own anchoring regardless of how the record path was written.
        if floating
          current_path.end_with?(path_to_match)
        else
          current_path == path_to_match
        end
      end

      # Calls every hook whose path matches the current path, passing it the
      # current element's document and the clipboard.
      def run_extra_xpath_hooks
        return unless @current_node

        extra_xpath_hooks.each do |hook_spec|
          if match_path?(hook_spec[:path], floating: hook_spec[:floating])
            hook_spec[:callable].call(current_node_doc, clipboard)
          end
        end
      end

      # no-op unless it's jruby, and then we use our namespace stack to
      # correctly add namespaces to the Nokogiri::XML::Document, cause
      # in Jruby outer_xml on the Reader doesn't do it for us. :(
      def fix_namespaces(doc)
        if is_jruby?
          # Only needed in jruby, nokogiri's jruby implementation isn't weird
          # around namespaces in exactly the same way as MRI. We need to keep
          # track of the namespaces in outer contexts ourselves, and then see
          # if they are needed ourselves. :(
          namespaces = namespaces_stack.compact.reduce({}, :merge)
          default_ns = namespaces.delete("xmlns")

          namespaces.each_pair do |attrib, uri|
            ns_prefix = attrib.sub(/\Axmlns:/, '')

            # gotta make sure it's actually used in the doc to not add it
            # unnecessarily. GAH.
            if doc.xpath("//*[starts-with(name(), '#{ns_prefix}:')][1]").empty? &&
               doc.xpath("//@*[starts-with(name(), '#{ns_prefix}:')][1]").empty?
              next
            end
            doc.root.add_namespace_definition(ns_prefix, uri)
          end

          if default_ns
            doc.root.default_namespace = default_ns
            # OMG nokogiri, really?
            default_ns = doc.root.namespace
            doc.xpath("//*[namespace-uri()='']").each do |node|
              node.namespace = default_ns
            end
          end

        end
        return doc
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'hashie'
|
2
|
+
|
3
|
+
module Traject
  module Hashie
    # Backport of the upstream Hashie fix in
    # https://github.com/intridea/hashie/commit/a82c594710e1bc9460d3de4d2989cb700f4c3c7f
    #
    # With it, merge(ordinary_hash) on a Hash that has IndifferentAccess included works
    # without raising, which traject needs.
    #
    # At the time of writing the fix was not part of a Hashie release. Once it is, this
    # monkey-patch may no longer be required and we can just depend on the fixed version.
    #
    # See also https://github.com/intridea/hashie/issues/451
    module IndifferentAccessFix
      # Merges as usual, then injects IndifferentAccess into the merge result when
      # hash_lacking_indifference? reports it is missing, and converts the result.
      def merge(*args)
        merged = super
        if hash_lacking_indifference?(merged)
          ::Hashie::Extensions::IndifferentAccess.inject!(merged)
        end
        merged.convert!
      end
    end
  end
end
Hashie::Extensions::IndifferentAccess.include(Traject::Hashie::IndifferentAccessFix)
|
25
|
+
|
data/lib/traject/indexer.rb
CHANGED
@@ -11,14 +11,14 @@ require 'traject/marc_reader'
|
|
11
11
|
require 'traject/json_writer'
|
12
12
|
require 'traject/solr_json_writer'
|
13
13
|
require 'traject/debug_writer'
|
14
|
-
|
14
|
+
require 'traject/array_writer'
|
15
15
|
|
16
16
|
require 'traject/macros/marc21'
|
17
17
|
require 'traject/macros/basic'
|
18
|
+
require 'traject/macros/transformation'
|
19
|
+
|
20
|
+
require 'traject/indexer/marc_indexer'
|
18
21
|
|
19
|
-
if defined? JRUBY_VERSION
|
20
|
-
require 'traject/marc4j_reader'
|
21
|
-
end
|
22
22
|
|
23
23
|
# This class does indexing for traject: Getting input records from a Reader
|
24
24
|
# class, mapping the input records to an output hash, and then sending the output
|
@@ -157,33 +157,39 @@ end
|
|
157
157
|
# inconvenient for you, we'd like to know your use case and improve things.
|
158
158
|
#
|
159
159
|
class Traject::Indexer
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
end
|
164
|
-
class NamingError < ArgumentError;
|
165
|
-
end
|
166
|
-
|
160
|
+
CompletedStateError = Class.new(StandardError)
|
161
|
+
ArityError = Class.new(ArgumentError)
|
162
|
+
NamingError = Class.new(ArgumentError)
|
167
163
|
|
168
164
|
include Traject::QualifiedConstGet
|
165
|
+
extend Traject::QualifiedConstGet
|
169
166
|
|
170
167
|
attr_writer :reader_class, :writer_class, :writer
|
171
168
|
|
172
|
-
# For now we hard-code these basic macro's included
|
173
|
-
# TODO, make these added with extend per-indexer,
|
174
|
-
# added by default but easily turned off (or have other
|
175
|
-
# default macro modules provided)
|
176
|
-
include Traject::Macros::Marc21
|
177
169
|
include Traject::Macros::Basic
|
170
|
+
include Traject::Macros::Transformation
|
178
171
|
|
179
172
|
|
180
173
|
# optional hash or Traject::Indexer::Settings object of settings.
|
181
|
-
|
182
|
-
|
174
|
+
# optionally takes a block which is instance_eval'd in the indexer,
|
175
|
+
# intended for configuration similar to what would be in a config file.
|
176
|
+
def initialize(arg_settings = {}, &block)
  # The step lists have to exist before the configuration block runs
  # (to_field, for one, appends to @index_steps).
  @index_steps            = []
  @after_processing_steps = []

  @completed    = false
  @writer_class = nil

  # Caller-supplied settings combined with the class-level defaults via with_defaults.
  @settings = Settings.new(arg_settings).with_defaults(self.class.default_settings)

  # An optional block is evaluated with the new indexer as self.
  instance_eval(&block) if block
end
|
185
|
+
|
186
|
+
# Right now just does an `instance_eval`, but encouraged in case we change the underlying
|
187
|
+
# implementation later, and to make intent more clear.
|
188
|
+
def configure(&block)
  # Evaluate the given block with this object as self; the block's value is returned.
  instance_eval(&block)
end
|
186
191
|
|
192
|
+
|
187
193
|
# Pass a string file path, a Pathname, or a File object, for
|
188
194
|
# a config file to load into indexer.
|
189
195
|
#
|
@@ -234,16 +240,81 @@ class Traject::Indexer
|
|
234
240
|
def settings(new_settings = nil, &block)
  # Fold explicitly passed settings into the existing settings object first...
  if new_settings
    @settings.merge!(new_settings)
  end

  # ...then let an optional block run with the settings object as self.
  if block_given?
    @settings.instance_eval(&block)
  end

  @settings
end
|
241
247
|
|
248
|
+
# Hash is frozen to avoid inheritance-mutability confusion.
|
249
|
+
def self.default_settings
  return @default_settings if @default_settings

  defaults = {}

  # Writer defaults
  defaults["writer_class_name"]       = "Traject::SolrJsonWriter"
  defaults["solr_writer.batch_size"]  = 100
  defaults["solr_writer.thread_pool"] = 1

  # Threading and logging
  defaults["processing_thread_pool"]  = Traject::Indexer::Settings.default_processing_thread_pool
  defaults["log.batch_size.severity"] = "info"

  # how to post-process the accumulator
  defaults["allow_nil_values"]        = false
  defaults["allow_duplicate_values"]  = true
  defaults["allow_empty_fields"]      = false

  # Built once, then frozen and memoized on the receiver.
  @default_settings = defaults.freeze
end
|
267
|
+
|
268
|
+
|
269
|
+
# Not sure if allowing changing of default_settings is a good idea, but we do
|
270
|
+
# use it in test. For now we make it private to require extreme measures to do it,
|
271
|
+
# and advertise that this API could go away or change without a major version release,
|
272
|
+
# it is experimental and internal.
|
273
|
+
def self.default_settings=(settings)
  # Replaces the memoized defaults outright; no copy or freeze is made here.
  @default_settings = settings
end
private_class_method :default_settings=
|
276
|
+
|
277
|
+
# Sub-classes should override to return a _proc_ object that takes one arg,
|
278
|
+
# a source record, and returns an identifier for it that can be used in
|
279
|
+
# logged messages. This differs depending on input record format, is why we
|
280
|
+
# leave it to sub-classes.
|
281
|
+
def source_record_id_proc
  legacy_mode = defined?(@@legacy_marc_mode) && @@legacy_marc_mode

  # Outside legacy MARC mode there is no generic way to identify a record,
  # so the proc always answers nil.
  unless legacy_mode
    return @source_record_id_proc ||= lambda { |source| nil }
  end

  # Legacy MARC mode: use the value of the 001 field when the source really
  # is a MARC::Record that has one, nil otherwise.
  @source_record_id_proc ||= lambda do |source_marc_record|
    next nil unless source_marc_record
    next nil unless source_marc_record.kind_of?(MARC::Record)
    next nil unless source_marc_record['001']

    source_marc_record['001'].value
  end
end
|
294
|
+
|
295
|
+
def self.legacy_marc_mode!
  # Switches the indexer into legacy MARC behavior: sets the legacy flag,
  # mixes in the Marc21 macros, and adds the MARC reader defaults to
  # default_settings. Returns self.
  @@legacy_marc_mode = true
  # include legacy Marc macros
  include Traject::Macros::Marc21

  # Reader defaults
  legacy_settings = {
    "reader_class_name" => "Traject::MarcReader",
    "marc_source.type"  => "binary",
  }

  # default_settings hands back a frozen hash, so an in-place merge! would
  # raise; build a new frozen hash and memoize that instead.
  @default_settings = default_settings.merge(legacy_settings).freeze

  self
end
|
310
|
+
|
242
311
|
# Part of DSL, used to define an indexing mapping. Register logic
|
243
312
|
# to be called for each record, and generate values for a particular
|
244
|
-
# output field.
|
245
|
-
|
246
|
-
|
313
|
+
# output field. The first field_name argument can be a single string, or
|
314
|
+
# an array of multiple strings -- in the latter case, the processed values
|
315
|
+
# will be added to each field mentioned.
|
316
|
+
def to_field(field_name, *procs, &block)
  # Record where the rule was defined (taken from the caller's stack frame)
  # and hand it to the step along with the field name, procs and block.
  defined_at = Traject::Util.extract_caller_location(caller.first)
  step       = ToFieldStep.new(field_name, procs, block, defined_at)

  @index_steps << step
end
|
248
319
|
|
249
320
|
# Part of DSL, register logic to be called for each record
|
@@ -313,14 +384,33 @@ class Traject::Indexer
|
|
313
384
|
# this indexer. Returns the output hash (a hash whose keys are
|
314
385
|
# string fields, and values are arrays of one or more values in that field)
|
315
386
|
#
|
387
|
+
# If the record is marked `skip` as part of processing, this will return
|
388
|
+
# nil.
|
389
|
+
#
|
316
390
|
# This is a convenience shortcut for #map_to_context! -- use that one
|
317
391
|
# if you want to provide additional context
|
318
392
|
# like position, and/or get back the full context.
|
319
393
|
def map_record(record)
  context = Context.new(
    :source_record         => record,
    :settings              => settings,
    :source_record_id_proc => source_record_id_proc,
    :logger                => logger
  )
  map_to_context!(context)

  # A record marked as skipped yields nil instead of an output hash.
  context.skip? ? nil : context.output_hash
end
|
398
|
+
|
399
|
+
# Takes a single record, maps it, and sends it to the instance-configured
|
400
|
+
# writer. No threading, no logging, no error handling. Respects skipped
|
401
|
+
# records by not adding them. Returns the Traject::Indexer::Context.
|
402
|
+
#
|
403
|
+
# Aliased as #<<
|
404
|
+
def process_record(record)
  # Refuse to run once the indexer has been completed.
  check_uncompleted

  context = Context.new(
    :source_record         => record,
    :settings              => settings,
    :source_record_id_proc => source_record_id_proc,
    :logger                => logger
  )
  map_to_context!(context)

  # Skipped records are mapped but never handed to the writer.
  unless context.skip?
    writer.put(context)
  end

  context
end
alias_method :<<, :process_record
|
324
414
|
|
325
415
|
# Maps a single record INTO the second argument, a Traject::Indexer::Context.
|
326
416
|
#
|
@@ -342,7 +432,7 @@ class Traject::Indexer
|
|
342
432
|
|
343
433
|
# Set the index step for error reporting
|
344
434
|
context.index_step = index_step
|
345
|
-
|
435
|
+
handle_mapping_errors(context) do
|
346
436
|
index_step.execute(context) # will always return [] for an each_record step
|
347
437
|
end
|
348
438
|
|
@@ -353,31 +443,40 @@ class Traject::Indexer
|
|
353
443
|
return context
|
354
444
|
end
|
355
445
|
|
356
|
-
|
357
|
-
|
358
|
-
|
446
|
+
|
447
|
+
protected def default_mapping_rescue
  # Memoized lambda taking (context, exception): logs a description of the
  # failure to the context's logger, when it has one, then re-raises.
  @default_mapping_rescue ||= lambda do |context, exception|
    msg = "Unexpected error on record #{context.record_inspect}\n" \
          " while executing #{context.index_step.inspect}\n"

    # Serializing the record can itself fail; report that instead of losing
    # the original error.
    record_dump =
      begin
        "\n Record: #{context.source_record.to_s}\n"
      rescue StandardError => to_s_exception
        "\n (Could not log record, #{to_s_exception})\n"
      end

    msg = msg + record_dump + Traject::Util.exception_to_log_message(exception)

    if context.logger
      context.logger.error(msg)
    end

    raise exception
  end
end
|
465
|
+
|
466
|
+
# just a wrapper that catches any errors, and handles them. By default, logs
|
467
|
+
# and re-raises. But you can set custom setting `mapping_rescue`
|
468
|
+
# to customize
|
359
469
|
#
|
360
|
-
# Re-raises error at the moment.
|
361
470
|
#
|
362
|
-
#
|
471
|
+
# handle_mapping_errors(context) do
|
363
472
|
# all_sorts_of_stuff # that will have errors logged
|
364
473
|
# end
|
365
|
-
def
|
474
|
+
protected def handle_mapping_errors(context)
  yield
rescue StandardError => error
  # A callable in the "mapping_rescue" setting takes precedence over the
  # default handler; either one is called with the context and the error.
  (settings["mapping_rescue"] || default_mapping_rescue).call(context, error)
end
|
383
482
|
|
@@ -385,67 +484,80 @@ class Traject::Indexer
|
|
385
484
|
# mapping according to configured mapping rules, and then writing
|
386
485
|
# to configured Writer.
|
387
486
|
#
|
487
|
+
# You can instead give it an _array_ of streams, as well.
|
488
|
+
#
|
388
489
|
# returns 'false' as a signal to command line to return non-zero exit code
|
389
490
|
# for some reason (reason found in logs, presumably). This particular mechanism
|
390
491
|
# is open to complexification, starting simple. We do need SOME way to return
|
391
492
|
# non-zero to command line.
|
392
493
|
#
|
393
|
-
|
494
|
+
# @param [#read, Array<#read>]
|
495
|
+
def process(io_stream_or_array)
|
496
|
+
check_uncompleted
|
497
|
+
|
394
498
|
settings.fill_in_defaults!
|
395
499
|
|
396
500
|
count = 0
|
397
501
|
start_time = batch_start_time = Time.now
|
398
|
-
logger.debug "beginning Indexer#process with settings: #{settings.inspect}"
|
399
|
-
|
400
|
-
reader = self.reader!(io_stream)
|
502
|
+
logger.debug "beginning Traject::Indexer#process with settings: #{settings.inspect}"
|
401
503
|
|
402
504
|
processing_threads = settings["processing_thread_pool"].to_i
|
403
505
|
thread_pool = Traject::ThreadPool.new(processing_threads)
|
404
506
|
|
405
|
-
logger.info " Indexer with #{processing_threads} processing threads, reader: #{
|
507
|
+
logger.info " Traject::Indexer with #{processing_threads} processing threads, reader: #{reader_class.name} and writer: #{writer.class.name}"
|
406
508
|
|
407
|
-
|
509
|
+
#io_stream can now be an array of io_streams.
|
510
|
+
(io_stream_or_array.kind_of?(Array) ? io_stream_or_array : [io_stream_or_array]).each do |io_stream|
|
511
|
+
reader = self.reader!(io_stream)
|
512
|
+
input_name = Traject::Util.io_name(io_stream)
|
513
|
+
position_in_input = 0
|
408
514
|
|
409
|
-
|
410
|
-
count += 1
|
515
|
+
log_batch_size = settings["log.batch_size"] && settings["log.batch_size"].to_i
|
411
516
|
|
412
|
-
|
413
|
-
|
414
|
-
|
517
|
+
reader.each do |record; safe_count, safe_position_in_input |
|
518
|
+
count += 1
|
519
|
+
position_in_input += 1
|
415
520
|
|
416
|
-
|
521
|
+
# have to use a block local var, so the changing `count` one
|
522
|
+
# doesn't get caught in the closure. Don't totally get it, but
|
523
|
+
# I think it's so.
|
524
|
+
safe_count, safe_position_in_input = count, position_in_input
|
417
525
|
|
418
|
-
|
419
|
-
$stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
|
420
|
-
end
|
526
|
+
thread_pool.raise_collected_exception!
|
421
527
|
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
:position => position,
|
426
|
-
:logger => logger
|
427
|
-
)
|
428
|
-
|
429
|
-
if log_batch_size && (count % log_batch_size == 0)
|
430
|
-
batch_rps = log_batch_size / (Time.now - batch_start_time)
|
431
|
-
overall_rps = count / (Time.now - start_time)
|
432
|
-
logger.send(settings["log.batch_size.severity"].downcase.to_sym, "Traject::Indexer#process, read #{count} records at id:#{context.source_record_id}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
|
433
|
-
batch_start_time = Time.now
|
434
|
-
end
|
528
|
+
if settings["debug_ascii_progress"].to_s == "true"
|
529
|
+
$stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
|
530
|
+
end
|
435
531
|
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
532
|
+
context = Context.new(
|
533
|
+
:source_record => record,
|
534
|
+
:source_record_id_proc => source_record_id_proc,
|
535
|
+
:settings => settings,
|
536
|
+
:position => safe_count,
|
537
|
+
:input_name => input_name,
|
538
|
+
:position_in_input => safe_position_in_input,
|
539
|
+
:logger => logger
|
540
|
+
)
|
541
|
+
|
542
|
+
if log_batch_size && (count % log_batch_size == 0)
|
543
|
+
batch_rps = log_batch_size / (Time.now - batch_start_time)
|
544
|
+
overall_rps = count / (Time.now - start_time)
|
545
|
+
logger.send(settings["log.batch_size.severity"].downcase.to_sym, "Traject::Indexer#process, read #{count} records at: #{context.source_inspect}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
|
546
|
+
batch_start_time = Time.now
|
445
547
|
end
|
446
548
|
|
549
|
+
# We pass context in a block arg to properly 'capture' it, so
|
550
|
+
# we don't accidentally share the local var under closure between
|
551
|
+
# threads.
|
552
|
+
thread_pool.maybe_in_thread_pool(context) do |t_context|
|
553
|
+
map_to_context!(t_context)
|
554
|
+
if context.skip?
|
555
|
+
log_skip(t_context)
|
556
|
+
else
|
557
|
+
writer.put t_context
|
558
|
+
end
|
559
|
+
end
|
447
560
|
end
|
448
|
-
|
449
561
|
end
|
450
562
|
$stderr.write "\n" if settings["debug_ascii_progress"].to_s == "true"
|
451
563
|
|
@@ -455,39 +567,156 @@ class Traject::Indexer
|
|
455
567
|
|
456
568
|
thread_pool.raise_collected_exception!
|
457
569
|
|
570
|
+
complete
|
571
|
+
|
572
|
+
elapsed = Time.now - start_time
|
573
|
+
avg_rps = (count / elapsed)
|
574
|
+
logger.info "finished Traject::Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
|
575
|
+
|
576
|
+
if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
|
577
|
+
logger.error "Traject::Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
|
578
|
+
return false
|
579
|
+
end
|
580
|
+
|
581
|
+
return true
|
582
|
+
end
|
583
|
+
|
584
|
+
def completed?
  # Plain reader for the @completed flag.
  @completed
end
|
587
|
+
|
588
|
+
# Instance variable readers and writers are not generally re-usable.
|
589
|
+
# The writer may have been closed. The reader does its thing and doesn't
|
590
|
+
# rewind. If we're completed, as a sanity check don't let someone do
|
591
|
+
# something with the indexer that uses the reader or writer and isn't gonna work.
|
592
|
+
protected def check_uncompleted
  # Nothing to do while the indexer is still open for processing.
  return unless completed?

  raise CompletedStateError, "This Traject::Indexer has been completed, and it's reader and writer are not in a usable state"
end
|
458
597
|
|
598
|
+
# Closes the writer (which may flush/save/finalize buffered records),
|
599
|
+
# and calls run_after_processing_steps
|
600
|
+
def complete
|
459
601
|
writer.close if writer.respond_to?(:close)
|
602
|
+
run_after_processing_steps
|
460
603
|
|
604
|
+
# after an indexer has been completed, it is not really usable anymore,
|
605
|
+
# as the writer has been closed.
|
606
|
+
@completed = true
|
607
|
+
end
|
608
|
+
|
609
|
+
def run_after_processing_steps
|
461
610
|
@after_processing_steps.each do |step|
|
462
611
|
begin
|
463
612
|
step.execute
|
464
|
-
rescue
|
613
|
+
rescue StandardError => e
|
465
614
|
logger.fatal("Unexpected exception #{e} when executing #{step}")
|
466
615
|
raise e
|
467
616
|
end
|
468
617
|
end
|
618
|
+
end
|
469
619
|
|
470
|
-
|
471
|
-
|
472
|
-
|
620
|
+
# A light-weight process method meant for programmatic use, generally
|
621
|
+
# intended for only a "few" (not millions) of records.
|
622
|
+
#
|
623
|
+
# It does _not_ use instance-configured reader or writer, instead taking
|
624
|
+
# a source/reader and destination/writer as arguments to this call.
|
625
|
+
#
|
626
|
+
# The reader can be anything that has an #each returning source
|
627
|
+
# records. This includes an ordinary array of source records, or any
|
628
|
+
# traject Reader.
|
629
|
+
#
|
630
|
+
# The writer can be anything with a #put method taking a Traject::Indexer::Context.
|
631
|
+
# For convenience, see the Traject::ArrayWriter that just collects output in an array.
|
632
|
+
#
|
633
|
+
# Return value of process_with is the writer passed as second arg, for your convenience.
|
634
|
+
#
|
635
|
+
# This does much less than the full #process method, to be more flexible
|
636
|
+
# and make fewer assumptions:
|
637
|
+
#
|
638
|
+
# * Will never use any additional threads (unless writer does). Wrap in your own threading if desired.
|
639
|
+
# * Will not do any standard logging or progress bars, regardless of indexer settings.
|
640
|
+
# Log yourself if desired.
|
641
|
+
# * Will _not_ call any `after_processing` steps. Call yourself with `indexer.run_after_processing_steps` as desired.
|
642
|
+
# * WILL by default call #close on the writer, IF the writer has a #close method.
|
643
|
+
# pass `:close_writer => false` to not do so.
|
644
|
+
# * exceptions will just raise out, unless you pass in a rescue_with: option, value is a proc/lambda
|
645
|
+
# that will receive two args, context and exception. If the rescue proc doesn't re-raise,
|
646
|
+
# `process_with` will continue to process subsequent records.
|
647
|
+
#
|
648
|
+
# @example
|
649
|
+
# array_writer_instance = indexer.process_with([record1, record2], Traject::ArrayWriter.new)
|
650
|
+
#
|
651
|
+
# @example With a block, in addition to or instead of a writer.
|
652
|
+
#
|
653
|
+
# indexer.process_with([record]) do |context|
|
654
|
+
# do_something_with(context.output_hash)
|
655
|
+
# end
|
656
|
+
#
|
657
|
+
# @param source [#each]
|
658
|
+
# @param destination [#put]
|
659
|
+
# @param close_writer whether the destination should have #close called on it, if it responds to #close.
|
660
|
+
# @param rescue_with [Proc] to call on errors, taking two args: A Traject::Indexer::Context and an exception.
|
661
|
+
# If nil (default), exceptions will be raised out. If set, you can raise or handle otherwise if you like.
|
662
|
+
# @param on_skipped [Proc] will be called for any skipped records, with one arg Traject::Indexer::Context
|
663
|
+
def process_with(source, destination = nil, close_writer: true, rescue_with: nil, on_skipped: nil)
|
664
|
+
unless destination || block_given?
|
665
|
+
raise ArgumentError, "Need either a second arg writer/destination, or a block"
|
666
|
+
end
|
473
667
|
|
474
|
-
|
475
|
-
|
476
|
-
|
668
|
+
settings.fill_in_defaults!
|
669
|
+
|
670
|
+
position = 0
|
671
|
+
input_name = Traject::Util.io_name(source)
|
672
|
+
source.each do |record |
|
673
|
+
begin
|
674
|
+
position += 1
|
675
|
+
|
676
|
+
context = Context.new(
|
677
|
+
:source_record => record,
|
678
|
+
:source_record_id_proc => source_record_id_proc,
|
679
|
+
:settings => settings,
|
680
|
+
:position => position,
|
681
|
+
:position_in_input => (position if input_name),
|
682
|
+
:logger => logger
|
683
|
+
)
|
684
|
+
|
685
|
+
map_to_context!(context)
|
686
|
+
|
687
|
+
if context.skip?
|
688
|
+
on_skipped.call(context) if on_skipped
|
689
|
+
else
|
690
|
+
destination.put(context) if destination
|
691
|
+
yield(context) if block_given?
|
692
|
+
end
|
693
|
+
rescue StandardError => e
|
694
|
+
if rescue_with
|
695
|
+
rescue_with.call(context, e)
|
696
|
+
else
|
697
|
+
raise e
|
698
|
+
end
|
699
|
+
end
|
477
700
|
end
|
478
701
|
|
479
|
-
|
702
|
+
if close_writer && destination.respond_to?(:close)
|
703
|
+
destination.close
|
704
|
+
end
|
705
|
+
|
706
|
+
return destination
|
480
707
|
end
|
481
708
|
|
482
709
|
# Log that the current record is being skipped, using
|
483
710
|
# data in context.record_inspect and context.skipmessage
|
484
711
|
def log_skip(context)
|
485
|
-
logger.debug "Skipped record #{context.
|
712
|
+
logger.debug "Skipped record #{context.record_inspect}: #{context.skipmessage}"
|
486
713
|
end
|
487
714
|
|
488
715
|
def reader_class
|
489
716
|
unless defined? @reader_class
|
490
|
-
|
717
|
+
reader_class_name = settings["reader_class_name"]
|
718
|
+
|
719
|
+
@reader_class = qualified_const_get(reader_class_name)
|
491
720
|
end
|
492
721
|
return @reader_class
|
493
722
|
end
|