traject 2.3.4 → 3.0.0.alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +16 -9
- data/CHANGES.md +74 -1
- data/Gemfile +2 -1
- data/README.md +104 -53
- data/Rakefile +8 -1
- data/doc/indexing_rules.md +79 -63
- data/doc/programmatic_use.md +218 -0
- data/doc/settings.md +28 -1
- data/doc/xml.md +134 -0
- data/lib/traject.rb +5 -0
- data/lib/traject/array_writer.rb +34 -0
- data/lib/traject/command_line.rb +18 -22
- data/lib/traject/debug_writer.rb +2 -5
- data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
- data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
- data/lib/traject/indexer.rb +321 -92
- data/lib/traject/indexer/context.rb +39 -13
- data/lib/traject/indexer/marc_indexer.rb +30 -0
- data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
- data/lib/traject/indexer/settings.rb +36 -53
- data/lib/traject/indexer/step.rb +27 -33
- data/lib/traject/macros/marc21.rb +37 -12
- data/lib/traject/macros/nokogiri_macros.rb +43 -0
- data/lib/traject/macros/transformation.rb +162 -0
- data/lib/traject/marc_extractor.rb +2 -0
- data/lib/traject/ndj_reader.rb +1 -1
- data/lib/traject/nokogiri_reader.rb +179 -0
- data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
- data/lib/traject/solr_json_writer.rb +19 -12
- data/lib/traject/thread_pool.rb +13 -0
- data/lib/traject/util.rb +14 -2
- data/lib/traject/version.rb +1 -1
- data/test/debug_writer_test.rb +3 -3
- data/test/delimited_writer_test.rb +3 -3
- data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
- data/test/indexer/context_test.rb +23 -13
- data/test/indexer/error_handler_test.rb +59 -0
- data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
- data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
- data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
- data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
- data/test/indexer/macros/to_field_test.rb +2 -2
- data/test/indexer/macros/transformation_test.rb +177 -0
- data/test/indexer/map_record_test.rb +2 -3
- data/test/indexer/nokogiri_indexer_test.rb +103 -0
- data/test/indexer/process_record_test.rb +55 -0
- data/test/indexer/process_with_test.rb +148 -0
- data/test/indexer/read_write_test.rb +52 -2
- data/test/indexer/settings_test.rb +34 -24
- data/test/indexer/to_field_test.rb +27 -2
- data/test/marc_extractor_test.rb +7 -7
- data/test/marc_reader_test.rb +4 -4
- data/test/nokogiri_reader_test.rb +158 -0
- data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
- data/test/solr_json_writer_test.rb +24 -28
- data/test/test_helper.rb +8 -2
- data/test/test_support/namespace-test.xml +7 -0
- data/test/test_support/nokogiri_demo_config.rb +17 -0
- data/test/test_support/oai-pmh-one-record-2.xml +24 -0
- data/test/test_support/oai-pmh-one-record-first.xml +24 -0
- data/test/test_support/sample-oai-no-namespace.xml +197 -0
- data/test/test_support/sample-oai-pmh.xml +197 -0
- data/test/thread_pool_test.rb +38 -0
- data/test/translation_map_test.rb +3 -3
- data/test/translation_maps/ruby_map.rb +2 -1
- data/test/translation_maps/yaml_map.yaml +2 -1
- data/traject.gemspec +4 -11
- metadata +92 -6
data/lib/traject/debug_writer.rb
CHANGED
@@ -40,12 +40,9 @@ class Traject::DebugWriter < Traject::LineWriter
|
|
40
40
|
@idfield = settings["debug_writer.idfield"] || DEFAULT_IDFIELD
|
41
41
|
@format = settings['debug_writer.format'] || DEFAULT_FORMAT
|
42
42
|
|
43
|
-
|
44
|
-
@use_position = true
|
45
|
-
end
|
43
|
+
@use_position = (@idfield == 'record_position')
|
46
44
|
|
47
45
|
@already_threw_warning_about_missing_id = false
|
48
|
-
|
49
46
|
end
|
50
47
|
|
51
48
|
def record_number(context)
|
@@ -54,7 +51,7 @@ class Traject::DebugWriter < Traject::LineWriter
|
|
54
51
|
context.output_hash[@idfield].first
|
55
52
|
else
|
56
53
|
unless @already_threw_warning_about_missing_id
|
57
|
-
context.logger.warn "At least one record (
|
54
|
+
context.logger.warn "At least one record (#{context.record_inspect}) doesn't define field '#{@idfield}'.
|
58
55
|
All records are assumed to have a unique id. You can set which field to look in via the setting 'debug_writer.idfield'"
|
59
56
|
@already_threw_warning_about_missing_id = true
|
60
57
|
end
|
@@ -0,0 +1,276 @@
|
|
1
|
+
module Traject
|
2
|
+
# An EXPERIMENTAL HALF-FINISHED implementation of a streaming/pull reader using Nokogiri.
|
3
|
+
# Not ready for use, not stable API, could go away.
|
4
|
+
#
|
5
|
+
# This was my first try at a NokogiriReader implementation, it didn't work out, at least without
|
6
|
+
# a lot more work. I think we'd need to re-do it to build the Nokogiri::XML::Nodes by hand as the
|
7
|
+
# source is traversed, instead of relying on #outer_xml -- outer_xml returning a string results in a double-parsing,
|
8
|
+
# with the expected 50% performance hit. Peccadilloes in Nokogiri JRuby namespace handling don't help.
|
9
|
+
#
|
10
|
+
# All in all, it's possible something could be gotten here with a lot more work, it's also possible
|
11
|
+
# Nokogiri's antipathy to namespaces could keep getting in the way.
|
12
|
+
class ExperimentalNokogiriStreamingReader
|
13
|
+
include Enumerable
|
14
|
+
|
15
|
+
attr_reader :settings, :input_stream, :clipboard, :path_tracker
|
16
|
+
|
17
|
+
def initialize(input_stream, settings)
|
18
|
+
@settings = Traject::Indexer::Settings.new settings
|
19
|
+
@input_stream = input_stream
|
20
|
+
@clipboard = Traject::Util.is_jruby? ? Concurrent::Map.new : Concurrent::Hash.new
|
21
|
+
|
22
|
+
if each_record_xpath
|
23
|
+
@path_tracker = PathTracker.new(each_record_xpath,
|
24
|
+
clipboard: self.clipboard,
|
25
|
+
namespaces: default_namespaces,
|
26
|
+
extra_xpath_hooks: extra_xpath_hooks)
|
27
|
+
end
|
28
|
+
|
29
|
+
default_namespaces # trigger validation
|
30
|
+
validate_limited_xpath(each_record_xpath, key_name: "each_record_xpath")
|
31
|
+
|
32
|
+
end
|
33
|
+
|
34
|
+
def each_record_xpath
|
35
|
+
@each_record_xpath ||= settings["nokogiri.each_record_xpath"]
|
36
|
+
end
|
37
|
+
|
38
|
+
def extra_xpath_hooks
|
39
|
+
@extra_xpath_hooks ||= begin
|
40
|
+
(settings["nokogiri_reader.extra_xpath_hooks"] || {}).tap do |hash|
|
41
|
+
hash.each_pair do |limited_xpath, callable|
|
42
|
+
validate_limited_xpath(limited_xpath, key_name: "nokogiri_reader.extra_xpath_hooks")
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
protected def validate_limited_xpath(each_record_xpath, key_name:)
|
49
|
+
return unless each_record_xpath
|
50
|
+
|
51
|
+
components = each_record_xpath.split('/')
|
52
|
+
components.each do |component|
|
53
|
+
prefix, element = component.split(':')
|
54
|
+
unless element
|
55
|
+
# there was no namespace
|
56
|
+
prefix, element = nil, prefix
|
57
|
+
end
|
58
|
+
|
59
|
+
# We don't support brackets or any xpath beyond the MOST simple.
|
60
|
+
# Catch a few we can catch.
|
61
|
+
if element =~ /::/ || element =~ /[\[\]]/
|
62
|
+
raise ArgumentError, "#{key_name}: Only very simple xpaths supported. '//some/path' or '/some/path'. Not: #{each_record_xpath.inspect}"
|
63
|
+
end
|
64
|
+
|
65
|
+
if prefix
|
66
|
+
ns_uri = default_namespaces[prefix]
|
67
|
+
if ns_uri.nil?
|
68
|
+
raise ArgumentError, "each_record_xpath: Can't find namespace prefix '#{prefix}' in '#{each_record_xpath}'. To use a namespace in each_record_xpath, it has to be registered with nokogiri.namespaces: #{default_namespaces.inspect}"
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
each_record_xpath
|
74
|
+
end
|
75
|
+
|
76
|
+
|
77
|
+
def default_namespaces
|
78
|
+
@default_namespaces ||= (settings["nokogiri.namespaces"] || {}).tap { |ns|
|
79
|
+
unless ns.kind_of?(Hash)
|
80
|
+
raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{ns.inspect}"
|
81
|
+
end
|
82
|
+
}
|
83
|
+
end
|
84
|
+
|
85
|
+
def each
|
86
|
+
unless each_record_xpath
|
87
|
+
# forget streaming, just read it and return it once, done.
|
88
|
+
yield Nokogiri::XML.parse(input_stream)
|
89
|
+
return
|
90
|
+
end
|
91
|
+
|
92
|
+
reader = Nokogiri::XML::Reader(input_stream)
|
93
|
+
|
94
|
+
reader.each do |reader_node|
|
95
|
+
if reader_node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
|
96
|
+
path_tracker.push(reader_node)
|
97
|
+
|
98
|
+
if path_tracker.match?
|
99
|
+
yield path_tracker.current_node_doc
|
100
|
+
end
|
101
|
+
path_tracker.run_extra_xpath_hooks
|
102
|
+
|
103
|
+
if reader_node.self_closing?
|
104
|
+
path_tracker.pop
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
if reader_node.node_type == Nokogiri::XML::Reader::TYPE_END_ELEMENT
|
109
|
+
path_tracker.pop
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
private
|
115
|
+
|
116
|
+
# initialized with the specification (a very small subset of xpath) for
|
117
|
+
# what records to yield-on-each. Tests to see if a Nokogiri::XML::Reader
|
118
|
+
# node matches spec.
|
119
|
+
#
|
120
|
+
# '//record'
|
121
|
+
# or anchored to root:
|
122
|
+
# '/body/head/meta' same thing as './body/head/meta' or 'head/meta'
|
123
|
+
#
|
124
|
+
# Elements can (and must, to match) have XML namespaces, if and only if
|
125
|
+
# they are registered with settings nokogiri.namespaces
|
126
|
+
#
|
127
|
+
# sadly JRuby Nokogiri has an incompatibility with true nokogiri, and
|
128
|
+
# doesn't preserve our namespaces on outer_xml,
|
129
|
+
# so in JRuby we have to track them ourselves, and then also do yet ANOTHER
|
130
|
+
# parse in nokogiri. This may make this in Java even LESS performant, I'm afraid.
|
131
|
+
class PathTracker
|
132
|
+
attr_reader :path_spec, :inverted_namespaces, :current_path, :namespaces_stack, :extra_xpath_hooks, :clipboard
|
133
|
+
def initialize(str_spec, clipboard:, namespaces: {}, extra_xpath_hooks: {})
|
134
|
+
@inverted_namespaces = namespaces.invert
|
135
|
+
@clipboard = clipboard
|
136
|
+
# We're guessing using a string will be more efficient than an array
|
137
|
+
@current_path = ""
|
138
|
+
@floating = false
|
139
|
+
|
140
|
+
@path_spec, @floating = parse_path(str_spec)
|
141
|
+
|
142
|
+
@namespaces_stack = []
|
143
|
+
|
144
|
+
|
145
|
+
@extra_xpath_hooks = extra_xpath_hooks.collect do |path, callable|
|
146
|
+
bare_path, floating = parse_path(path)
|
147
|
+
{
|
148
|
+
path: bare_path,
|
149
|
+
floating: floating,
|
150
|
+
callable: callable
|
151
|
+
}
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
# returns [bare_path, is_floating]
|
156
|
+
protected def parse_path(str_spec)
|
157
|
+
floating = false
|
158
|
+
|
159
|
+
if str_spec.start_with?('//')
|
160
|
+
str_spec = str_spec.slice(2..-1)
|
161
|
+
floating = true
|
162
|
+
else
|
163
|
+
str_spec = str_spec.slice(1..-1) if str_spec.start_with?(".")
|
164
|
+
str_spec = "/" + str_spec unless str_spec.start_with?("/")
|
165
|
+
end
|
166
|
+
|
167
|
+
return [str_spec, floating]
|
168
|
+
end
|
169
|
+
|
170
|
+
def is_jruby?
|
171
|
+
Traject::Util.is_jruby?
|
172
|
+
end
|
173
|
+
|
174
|
+
# adds a component to slash-separated current_path, with namespace prefix.
|
175
|
+
def push(reader_node)
|
176
|
+
namespace_prefix = reader_node.namespace_uri && inverted_namespaces[reader_node.namespace_uri]
|
177
|
+
|
178
|
+
# gah, reader_node.name has the namespace prefix in there
|
179
|
+
node_name = reader_node.name.gsub(/[^:]+:/, '')
|
180
|
+
|
181
|
+
node_str = if namespace_prefix
|
182
|
+
namespace_prefix + ":" + node_name
|
183
|
+
else
|
184
|
+
reader_node.name
|
185
|
+
end
|
186
|
+
|
187
|
+
current_path << ("/" + node_str)
|
188
|
+
|
189
|
+
if is_jruby?
|
190
|
+
namespaces_stack << reader_node.namespaces
|
191
|
+
end
|
192
|
+
@current_node = reader_node
|
193
|
+
end
|
194
|
+
|
195
|
+
def current_node_doc
|
196
|
+
return nil unless @current_node
|
197
|
+
|
198
|
+
# yeah, sadly we got to have nokogiri parse it again
|
199
|
+
fix_namespaces(Nokogiri::XML.parse(@current_node.outer_xml))
|
200
|
+
end
|
201
|
+
|
202
|
+
# removes the last slash-separated component from current_path
|
203
|
+
def pop
|
204
|
+
current_path.slice!( current_path.rindex('/')..-1 )
|
205
|
+
@current_node = nil
|
206
|
+
|
207
|
+
if is_jruby?
|
208
|
+
namespaces_stack.pop
|
209
|
+
end
|
210
|
+
end
|
211
|
+
|
212
|
+
def floating?
|
213
|
+
!!@floating
|
214
|
+
end
|
215
|
+
|
216
|
+
def match?
|
217
|
+
match_path?(path_spec, floating: floating?)
|
218
|
+
end
|
219
|
+
|
220
|
+
def match_path?(path_to_match, floating:)
|
221
|
+
if floating?
|
222
|
+
current_path.end_with?(path_to_match)
|
223
|
+
else
|
224
|
+
current_path == path_to_match
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
def run_extra_xpath_hooks
|
229
|
+
return unless @current_node
|
230
|
+
|
231
|
+
extra_xpath_hooks.each do |hook_spec|
|
232
|
+
if match_path?(hook_spec[:path], floating: hook_spec[:floating])
|
233
|
+
hook_spec[:callable].call(current_node_doc, clipboard)
|
234
|
+
end
|
235
|
+
end
|
236
|
+
end
|
237
|
+
|
238
|
+
# no-op unless it's jruby, and then we use our namespace stack to
|
239
|
+
# correctly add namespaces to the Nokogiri::XML::Document, cause
|
240
|
+
# in Jruby outer_xml on the Reader doesn't do it for us. :(
|
241
|
+
def fix_namespaces(doc)
|
242
|
+
if is_jruby?
|
243
|
+
# Only needed in jruby, nokogiri's jruby implementation isn't weird
|
244
|
+
# around namespaces in exactly the same way as MRI. We need to keep
|
245
|
+
# track of the namespaces in outer contexts ourselves, and then see
|
246
|
+
# if they are needed ourselves. :(
|
247
|
+
namespaces = namespaces_stack.compact.reduce({}, :merge)
|
248
|
+
default_ns = namespaces.delete("xmlns")
|
249
|
+
|
250
|
+
namespaces.each_pair do |attrib, uri|
|
251
|
+
ns_prefix = attrib.sub(/\Axmlns:/, '')
|
252
|
+
|
253
|
+
# gotta make sure it's actually used in the doc to not add it
|
254
|
+
# unnecessarily. GAH.
|
255
|
+
if doc.xpath("//*[starts-with(name(), '#{ns_prefix}:')][1]").empty? &&
|
256
|
+
doc.xpath("//@*[starts-with(name(), '#{ns_prefix}:')][1]").empty?
|
257
|
+
next
|
258
|
+
end
|
259
|
+
doc.root.add_namespace_definition(ns_prefix, uri)
|
260
|
+
end
|
261
|
+
|
262
|
+
if default_ns
|
263
|
+
doc.root.default_namespace = default_ns
|
264
|
+
# OMG nokogiri, really?
|
265
|
+
default_ns = doc.root.namespace
|
266
|
+
doc.xpath("//*[namespace-uri()='']").each do |node|
|
267
|
+
node.namespace = default_ns
|
268
|
+
end
|
269
|
+
end
|
270
|
+
|
271
|
+
end
|
272
|
+
return doc
|
273
|
+
end
|
274
|
+
end
|
275
|
+
end
|
276
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'hashie'
|
2
|
+
|
3
|
+
module Traject
|
4
|
+
module Hashie
|
5
|
+
# Backporting fix from https://github.com/intridea/hashie/commit/a82c594710e1bc9460d3de4d2989cb700f4c3c7f
|
6
|
+
# into Hashie.
|
7
|
+
#
|
8
|
+
# This makes merge(ordinary_hash) on a Hash that has IndifferentAccess included work, without
|
9
|
+
# raising. Which we needed.
|
10
|
+
#
|
11
|
+
# As of this writing that fix is not available in a Hashie release, if it becomes so
|
12
|
+
# later, then this monkey-patch may no longer be required; we can just depend on the fixed version.
|
13
|
+
#
|
14
|
+
# See also https://github.com/intridea/hashie/issues/451
|
15
|
+
module IndifferentAccessFix
|
16
|
+
def merge(*args)
|
17
|
+
result = super
|
18
|
+
::Hashie::Extensions::IndifferentAccess.inject!(result) if hash_lacking_indifference?(result)
|
19
|
+
result.convert!
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
Hashie::Extensions::IndifferentAccess.include(Traject::Hashie::IndifferentAccessFix)
|
25
|
+
|
data/lib/traject/indexer.rb
CHANGED
@@ -11,14 +11,14 @@ require 'traject/marc_reader'
|
|
11
11
|
require 'traject/json_writer'
|
12
12
|
require 'traject/solr_json_writer'
|
13
13
|
require 'traject/debug_writer'
|
14
|
-
|
14
|
+
require 'traject/array_writer'
|
15
15
|
|
16
16
|
require 'traject/macros/marc21'
|
17
17
|
require 'traject/macros/basic'
|
18
|
+
require 'traject/macros/transformation'
|
19
|
+
|
20
|
+
require 'traject/indexer/marc_indexer'
|
18
21
|
|
19
|
-
if defined? JRUBY_VERSION
|
20
|
-
require 'traject/marc4j_reader'
|
21
|
-
end
|
22
22
|
|
23
23
|
# This class does indexing for traject: Getting input records from a Reader
|
24
24
|
# class, mapping the input records to an output hash, and then sending the output
|
@@ -157,33 +157,39 @@ end
|
|
157
157
|
# inconvenient for you, we'd like to know your use case and improve things.
|
158
158
|
#
|
159
159
|
class Traject::Indexer
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
end
|
164
|
-
class NamingError < ArgumentError;
|
165
|
-
end
|
166
|
-
|
160
|
+
CompletedStateError = Class.new(StandardError)
|
161
|
+
ArityError = Class.new(ArgumentError)
|
162
|
+
NamingError = Class.new(ArgumentError)
|
167
163
|
|
168
164
|
include Traject::QualifiedConstGet
|
165
|
+
extend Traject::QualifiedConstGet
|
169
166
|
|
170
167
|
attr_writer :reader_class, :writer_class, :writer
|
171
168
|
|
172
|
-
# For now we hard-code these basic macro's included
|
173
|
-
# TODO, make these added with extend per-indexer,
|
174
|
-
# added by default but easily turned off (or have other
|
175
|
-
# default macro modules provided)
|
176
|
-
include Traject::Macros::Marc21
|
177
169
|
include Traject::Macros::Basic
|
170
|
+
include Traject::Macros::Transformation
|
178
171
|
|
179
172
|
|
180
173
|
# optional hash or Traject::Indexer::Settings object of settings.
|
181
|
-
|
182
|
-
|
174
|
+
# optionally takes a block which is instance_eval'd in the indexer,
|
175
|
+
# intended for configuration similar to what would be in a config file.
|
176
|
+
def initialize(arg_settings = {}, &block)
|
177
|
+
@writer_class = nil
|
178
|
+
@completed = false
|
179
|
+
@settings = Settings.new(arg_settings).with_defaults(self.class.default_settings)
|
183
180
|
@index_steps = []
|
184
181
|
@after_processing_steps = []
|
182
|
+
|
183
|
+
instance_eval(&block) if block
|
184
|
+
end
|
185
|
+
|
186
|
+
# Right now just does an `instance_eval`, but encouraged in case we change the underlying
|
187
|
+
# implementation later, and to make intent more clear.
|
188
|
+
def configure(&block)
|
189
|
+
instance_eval(&block)
|
185
190
|
end
|
186
191
|
|
192
|
+
|
187
193
|
# Pass a string file path, a Pathname, or a File object, for
|
188
194
|
# a config file to load into indexer.
|
189
195
|
#
|
@@ -234,16 +240,81 @@ class Traject::Indexer
|
|
234
240
|
def settings(new_settings = nil, &block)
|
235
241
|
@settings.merge!(new_settings) if new_settings
|
236
242
|
|
237
|
-
@settings.instance_eval
|
243
|
+
@settings.instance_eval(&block) if block_given?
|
238
244
|
|
239
245
|
return @settings
|
240
246
|
end
|
241
247
|
|
248
|
+
# Hash is frozen to avoid inheritance-mutability confusion.
|
249
|
+
def self.default_settings
|
250
|
+
@default_settings ||= {
|
251
|
+
# Writer defaults
|
252
|
+
"writer_class_name" => "Traject::SolrJsonWriter",
|
253
|
+
"solr_writer.batch_size" => 100,
|
254
|
+
"solr_writer.thread_pool" => 1,
|
255
|
+
|
256
|
+
# Threading and logging
|
257
|
+
"processing_thread_pool" => Traject::Indexer::Settings.default_processing_thread_pool,
|
258
|
+
"log.batch_size.severity" => "info",
|
259
|
+
|
260
|
+
# how to post-process the accumulator
|
261
|
+
"allow_nil_values" => false,
|
262
|
+
"allow_duplicate_values" => true,
|
263
|
+
|
264
|
+
"allow_empty_fields" => false
|
265
|
+
}.freeze
|
266
|
+
end
|
267
|
+
|
268
|
+
|
269
|
+
# Not sure if allowing changing of default_settings is a good idea, but we do
|
270
|
+
# use it in test. For now we make it private to require extreme measures to do it,
|
271
|
+
# and advertise that this API could go away or change without a major version release,
|
272
|
+
# it is experimental and internal.
|
273
|
+
private_class_method def self.default_settings=(settings)
|
274
|
+
@default_settings = settings
|
275
|
+
end
|
276
|
+
|
277
|
+
# Sub-classes should override to return a _proc_ object that takes one arg,
|
278
|
+
# a source record, and returns an identifier for it that can be used in
|
279
|
+
# logged messages. This differs depending on input record format, is why we
|
280
|
+
# leave it to sub-classes.
|
281
|
+
def source_record_id_proc
|
282
|
+
if defined?(@@legacy_marc_mode) && @@legacy_marc_mode
|
283
|
+
return @source_record_id_proc ||= lambda do |source_marc_record|
|
284
|
+
if ( source_marc_record &&
|
285
|
+
source_marc_record.kind_of?(MARC::Record) &&
|
286
|
+
source_marc_record['001'] )
|
287
|
+
source_marc_record['001'].value
|
288
|
+
end
|
289
|
+
end
|
290
|
+
end
|
291
|
+
|
292
|
+
@source_record_id_proc ||= lambda { |source| nil }
|
293
|
+
end
|
294
|
+
|
295
|
+
def self.legacy_marc_mode!
|
296
|
+
@@legacy_marc_mode = true
|
297
|
+
# include legacy Marc macros
|
298
|
+
include Traject::Macros::Marc21
|
299
|
+
|
300
|
+
# Reader defaults
|
301
|
+
legacy_settings = {
|
302
|
+
"reader_class_name" => "Traject::MarcReader",
|
303
|
+
"marc_source.type" => "binary",
|
304
|
+
}
|
305
|
+
|
306
|
+
default_settings.merge!(legacy_settings)
|
307
|
+
|
308
|
+
self
|
309
|
+
end
|
310
|
+
|
242
311
|
# Part of DSL, used to define an indexing mapping. Register logic
|
243
312
|
# to be called for each record, and generate values for a particular
|
244
|
-
# output field.
|
245
|
-
|
246
|
-
|
313
|
+
# output field. The first field_name argument can be a single string, or
|
314
|
+
# an array of multiple strings -- in the latter case, the processed values
|
315
|
+
# will be added to each field mentioned.
|
316
|
+
def to_field(field_name, *procs, &block)
|
317
|
+
@index_steps << ToFieldStep.new(field_name, procs, block, Traject::Util.extract_caller_location(caller.first))
|
247
318
|
end
|
248
319
|
|
249
320
|
# Part of DSL, register logic to be called for each record
|
@@ -313,14 +384,33 @@ class Traject::Indexer
|
|
313
384
|
# this indexer. Returns the output hash (a hash whose keys are
|
314
385
|
# string fields, and values are arrays of one or more values in that field)
|
315
386
|
#
|
387
|
+
# If the record is marked `skip` as part of processing, this will return
|
388
|
+
# nil.
|
389
|
+
#
|
316
390
|
# This is a convenience shortcut for #map_to_context! -- use that one
|
317
391
|
# if you want to provide additional context
|
318
392
|
# like position, and/or get back the full context.
|
319
393
|
def map_record(record)
|
320
|
-
context = Context.new(:source_record => record, :settings => settings)
|
394
|
+
context = Context.new(:source_record => record, :settings => settings, :source_record_id_proc => source_record_id_proc, :logger => logger)
|
395
|
+
map_to_context!(context)
|
396
|
+
return context.output_hash unless context.skip?
|
397
|
+
end
|
398
|
+
|
399
|
+
# Takes a single record, maps it, and sends it to the instance-configured
|
400
|
+
# writer. No threading, no logging, no error handling. Respects skipped
|
401
|
+
# records by not adding them. Returns the Traject::Indexer::Context.
|
402
|
+
#
|
403
|
+
# Aliased as #<<
|
404
|
+
def process_record(record)
|
405
|
+
check_uncompleted
|
406
|
+
|
407
|
+
context = Context.new(:source_record => record, :settings => settings, :source_record_id_proc => source_record_id_proc, :logger => logger)
|
321
408
|
map_to_context!(context)
|
322
|
-
|
409
|
+
writer.put( context ) unless context.skip?
|
410
|
+
|
411
|
+
return context
|
323
412
|
end
|
413
|
+
alias_method :<<, :process_record
|
324
414
|
|
325
415
|
# Maps a single record INTO the second argument, a Traject::Indexer::Context.
|
326
416
|
#
|
@@ -342,7 +432,7 @@ class Traject::Indexer
|
|
342
432
|
|
343
433
|
# Set the index step for error reporting
|
344
434
|
context.index_step = index_step
|
345
|
-
|
435
|
+
handle_mapping_errors(context) do
|
346
436
|
index_step.execute(context) # will always return [] for an each_record step
|
347
437
|
end
|
348
438
|
|
@@ -353,31 +443,40 @@ class Traject::Indexer
|
|
353
443
|
return context
|
354
444
|
end
|
355
445
|
|
356
|
-
|
357
|
-
|
358
|
-
|
446
|
+
|
447
|
+
protected def default_mapping_rescue
|
448
|
+
@default_mapping_rescue ||= lambda do |context, exception|
|
449
|
+
msg = "Unexpected error on record #{context.record_inspect}\n"
|
450
|
+
msg += " while executing #{context.index_step.inspect}\n"
|
451
|
+
|
452
|
+
msg += begin
|
453
|
+
"\n Record: #{context.source_record.to_s}\n"
|
454
|
+
rescue StandardError => to_s_exception
|
455
|
+
"\n (Could not log record, #{to_s_exception})\n"
|
456
|
+
end
|
457
|
+
|
458
|
+
msg += Traject::Util.exception_to_log_message(exception)
|
459
|
+
|
460
|
+
context.logger.error(msg) if context.logger
|
461
|
+
|
462
|
+
raise exception
|
463
|
+
end
|
464
|
+
end
|
465
|
+
|
466
|
+
# just a wrapper that catches any errors, and handles them. By default, logs
|
467
|
+
# and re-raises. But you can set custom setting `mapping_rescue`
|
468
|
+
# to customize
|
359
469
|
#
|
360
|
-
# Re-raises error at the moment.
|
361
470
|
#
|
362
|
-
#
|
471
|
+
# handle_mapping_errors(context) do
|
363
472
|
# all_sorts_of_stuff # that will have errors logged
|
364
473
|
# end
|
365
|
-
def
|
474
|
+
protected def handle_mapping_errors(context)
|
366
475
|
begin
|
367
476
|
yield
|
368
|
-
rescue
|
369
|
-
|
370
|
-
|
371
|
-
msg += Traject::Util.exception_to_log_message(e)
|
372
|
-
|
373
|
-
logger.error msg
|
374
|
-
begin
|
375
|
-
logger.debug "Record: " + context.source_record.to_s
|
376
|
-
rescue Exception => marc_to_s_exception
|
377
|
-
logger.debug "(Could not log record, #{marc_to_s_exception})"
|
378
|
-
end
|
379
|
-
|
380
|
-
raise e
|
477
|
+
rescue StandardError => e
|
478
|
+
error_handler = settings["mapping_rescue"] || default_mapping_rescue
|
479
|
+
error_handler.call(context, e)
|
381
480
|
end
|
382
481
|
end
|
383
482
|
|
@@ -385,67 +484,80 @@ class Traject::Indexer
|
|
385
484
|
# mapping according to configured mapping rules, and then writing
|
386
485
|
# to configured Writer.
|
387
486
|
#
|
487
|
+
# You can instead give it an _array_ of streams, as well.
|
488
|
+
#
|
388
489
|
# returns 'false' as a signal to command line to return non-zero exit code
|
389
490
|
# for some reason (reason found in logs, presumably). This particular mechanism
|
390
491
|
# is open to complexification, starting simple. We do need SOME way to return
|
391
492
|
# non-zero to command line.
|
392
493
|
#
|
393
|
-
|
494
|
+
# @param [#read, Array<#read>]
|
495
|
+
def process(io_stream_or_array)
|
496
|
+
check_uncompleted
|
497
|
+
|
394
498
|
settings.fill_in_defaults!
|
395
499
|
|
396
500
|
count = 0
|
397
501
|
start_time = batch_start_time = Time.now
|
398
|
-
logger.debug "beginning Indexer#process with settings: #{settings.inspect}"
|
399
|
-
|
400
|
-
reader = self.reader!(io_stream)
|
502
|
+
logger.debug "beginning Traject::Indexer#process with settings: #{settings.inspect}"
|
401
503
|
|
402
504
|
processing_threads = settings["processing_thread_pool"].to_i
|
403
505
|
thread_pool = Traject::ThreadPool.new(processing_threads)
|
404
506
|
|
405
|
-
logger.info " Indexer with #{processing_threads} processing threads, reader: #{
|
507
|
+
logger.info " Traject::Indexer with #{processing_threads} processing threads, reader: #{reader_class.name} and writer: #{writer.class.name}"
|
406
508
|
|
407
|
-
|
509
|
+
#io_stream can now be an array of io_streams.
|
510
|
+
(io_stream_or_array.kind_of?(Array) ? io_stream_or_array : [io_stream_or_array]).each do |io_stream|
|
511
|
+
reader = self.reader!(io_stream)
|
512
|
+
input_name = Traject::Util.io_name(io_stream)
|
513
|
+
position_in_input = 0
|
408
514
|
|
409
|
-
|
410
|
-
count += 1
|
515
|
+
log_batch_size = settings["log.batch_size"] && settings["log.batch_size"].to_i
|
411
516
|
|
412
|
-
|
413
|
-
|
414
|
-
|
517
|
+
reader.each do |record; safe_count, safe_position_in_input |
|
518
|
+
count += 1
|
519
|
+
position_in_input += 1
|
415
520
|
|
416
|
-
|
521
|
+
# have to use a block local var, so the changing `count` one
|
522
|
+
# doesn't get caught in the closure. Don't totally get it, but
|
523
|
+
# I think it's so.
|
524
|
+
safe_count, safe_position_in_input = count, position_in_input
|
417
525
|
|
418
|
-
|
419
|
-
$stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
|
420
|
-
end
|
526
|
+
thread_pool.raise_collected_exception!
|
421
527
|
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
:position => position,
|
426
|
-
:logger => logger
|
427
|
-
)
|
428
|
-
|
429
|
-
if log_batch_size && (count % log_batch_size == 0)
|
430
|
-
batch_rps = log_batch_size / (Time.now - batch_start_time)
|
431
|
-
overall_rps = count / (Time.now - start_time)
|
432
|
-
logger.send(settings["log.batch_size.severity"].downcase.to_sym, "Traject::Indexer#process, read #{count} records at id:#{context.source_record_id}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
|
433
|
-
batch_start_time = Time.now
|
434
|
-
end
|
528
|
+
if settings["debug_ascii_progress"].to_s == "true"
|
529
|
+
$stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
|
530
|
+
end
|
435
531
|
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
532
|
+
context = Context.new(
|
533
|
+
:source_record => record,
|
534
|
+
:source_record_id_proc => source_record_id_proc,
|
535
|
+
:settings => settings,
|
536
|
+
:position => safe_count,
|
537
|
+
:input_name => input_name,
|
538
|
+
:position_in_input => safe_position_in_input,
|
539
|
+
:logger => logger
|
540
|
+
)
|
541
|
+
|
542
|
+
if log_batch_size && (count % log_batch_size == 0)
|
543
|
+
batch_rps = log_batch_size / (Time.now - batch_start_time)
|
544
|
+
overall_rps = count / (Time.now - start_time)
|
545
|
+
logger.send(settings["log.batch_size.severity"].downcase.to_sym, "Traject::Indexer#process, read #{count} records at: #{context.source_inspect}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
|
546
|
+
batch_start_time = Time.now
|
445
547
|
end
|
446
548
|
|
549
|
+
# We pass context in a block arg to properly 'capture' it, so
|
550
|
+
# we don't accidentally share the local var under closure between
|
551
|
+
# threads.
|
552
|
+
thread_pool.maybe_in_thread_pool(context) do |t_context|
|
553
|
+
map_to_context!(t_context)
|
554
|
+
if context.skip?
|
555
|
+
log_skip(t_context)
|
556
|
+
else
|
557
|
+
writer.put t_context
|
558
|
+
end
|
559
|
+
end
|
447
560
|
end
|
448
|
-
|
449
561
|
end
|
450
562
|
$stderr.write "\n" if settings["debug_ascii_progress"].to_s == "true"
|
451
563
|
|
@@ -455,39 +567,156 @@ class Traject::Indexer
|
|
455
567
|
|
456
568
|
thread_pool.raise_collected_exception!
|
457
569
|
|
570
|
+
complete
|
571
|
+
|
572
|
+
elapsed = Time.now - start_time
|
573
|
+
avg_rps = (count / elapsed)
|
574
|
+
logger.info "finished Traject::Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
|
575
|
+
|
576
|
+
if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
|
577
|
+
logger.error "Traject::Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
|
578
|
+
return false
|
579
|
+
end
|
580
|
+
|
581
|
+
return true
|
582
|
+
end
|
583
|
+
|
584
|
+
def completed?
|
585
|
+
@completed
|
586
|
+
end
|
587
|
+
|
588
|
+
# Instance variable readers and writers are not generally re-usable.
|
589
|
+
# The writer may have been closed. The reader does its thing and doesn't
|
590
|
+
# rewind. If we're completed, as a sanity check don't let someone do
|
591
|
+
# something with the indexer that uses the reader or writer and isn't gonna work.
|
592
|
+
protected def check_uncompleted
|
593
|
+
if completed?
|
594
|
+
raise CompletedStateError.new("This Traject::Indexer has been completed, and it's reader and writer are not in a usable state")
|
595
|
+
end
|
596
|
+
end
|
458
597
|
|
598
|
+
# Closes the writer (which may flush/save/finalize buffered records),
|
599
|
+
# and calls run_after_processing_steps
|
600
|
+
def complete
|
459
601
|
writer.close if writer.respond_to?(:close)
|
602
|
+
run_after_processing_steps
|
460
603
|
|
604
|
+
# after an indexer has been completed, it is not really usable anymore,
|
605
|
+
# as the writer has been closed.
|
606
|
+
@completed = true
|
607
|
+
end
|
608
|
+
|
609
|
+
def run_after_processing_steps
|
461
610
|
@after_processing_steps.each do |step|
|
462
611
|
begin
|
463
612
|
step.execute
|
464
|
-
rescue
|
613
|
+
rescue StandardError => e
|
465
614
|
logger.fatal("Unexpected exception #{e} when executing #{step}")
|
466
615
|
raise e
|
467
616
|
end
|
468
617
|
end
|
618
|
+
end
|
469
619
|
|
470
|
-
|
471
|
-
|
472
|
-
|
620
|
+
# A light-weight process method meant for programmatic use, generally
|
621
|
+
# intended for only a "few" (not millions of) records.
|
622
|
+
#
|
623
|
+
# It does _not_ use instance-configured reader or writer, instead taking
|
624
|
+
# a source/reader and destination/writer as arguments to this call.
|
625
|
+
#
|
626
|
+
# The reader can be anything that has an #each yielding source
|
627
|
+
# records. This includes an ordinary array of source records, or any
|
628
|
+
# traject Reader.
|
629
|
+
#
|
630
|
+
# The writer can be anything with a #put method taking a Traject::Indexer::Context.
|
631
|
+
# For convenience, see the Traject::ArrayWriter that just collects output in an array.
|
632
|
+
#
|
633
|
+
# Return value of process_with is the writer passed as second arg, for your convenience.
|
634
|
+
#
|
635
|
+
# This does much less than the full #process method, to be more flexible
|
636
|
+
# and make fewer assumptions:
|
637
|
+
#
|
638
|
+
# * Will never use any additional threads (unless writer does). Wrap in your own threading if desired.
|
639
|
+
# * Will not do any standard logging or progress bars, regardless of indexer settings.
|
640
|
+
# Log yourself if desired.
|
641
|
+
# * Will _not_ call any `after_processing` steps. Call yourself with `indexer.run_after_processing_steps` as desired.
|
642
|
+
# * WILL by default call #close on the writer, IF the writer has a #close method.
|
643
|
+
# pass `:close_writer => false` to not do so.
|
644
|
+
# * exceptions will just raise out, unless you pass in a `rescue_with:` option, whose value is a proc/lambda
|
645
|
+
# that will receive two args, context and exception. If the rescue proc doesn't re-raise,
|
646
|
+
# `process_with` will continue to process subsequent records.
|
647
|
+
#
|
648
|
+
# @example
|
649
|
+
# array_writer_instance = indexer.process_with([record1, record2], Traject::ArrayWriter.new)
|
650
|
+
#
|
651
|
+
# @example With a block, in addition to or instead of a writer.
|
652
|
+
#
|
653
|
+
# indexer.process_with([record]) do |context|
|
654
|
+
# do_something_with(context.output_hash)
|
655
|
+
# end
|
656
|
+
#
|
657
|
+
# @param source [#each]
|
658
|
+
# @param destination [#put]
|
659
|
+
# @param close_writer whether the destination should have #close called on it, if it responds to #close.
|
660
|
+
# @param rescue_with [Proc] to call on errors, taking two args: A Traject::Indexer::Context and an exception.
|
661
|
+
# If nil (default), exceptions will be raised out. If set, you can raise or handle otherwise if you like.
|
662
|
+
# @param on_skipped [Proc] will be called for any skipped records, with one arg Traject::Indexer::Context
|
663
|
+
def process_with(source, destination = nil, close_writer: true, rescue_with: nil, on_skipped: nil)
|
664
|
+
unless destination || block_given?
|
665
|
+
raise ArgumentError, "Need either a second arg writer/destination, or a block"
|
666
|
+
end
|
473
667
|
|
474
|
-
|
475
|
-
|
476
|
-
|
668
|
+
settings.fill_in_defaults!
|
669
|
+
|
670
|
+
position = 0
|
671
|
+
input_name = Traject::Util.io_name(source)
|
672
|
+
source.each do |record |
|
673
|
+
begin
|
674
|
+
position += 1
|
675
|
+
|
676
|
+
context = Context.new(
|
677
|
+
:source_record => record,
|
678
|
+
:source_record_id_proc => source_record_id_proc,
|
679
|
+
:settings => settings,
|
680
|
+
:position => position,
|
681
|
+
:position_in_input => (position if input_name),
|
682
|
+
:logger => logger
|
683
|
+
)
|
684
|
+
|
685
|
+
map_to_context!(context)
|
686
|
+
|
687
|
+
if context.skip?
|
688
|
+
on_skipped.call(context) if on_skipped
|
689
|
+
else
|
690
|
+
destination.put(context) if destination
|
691
|
+
yield(context) if block_given?
|
692
|
+
end
|
693
|
+
rescue StandardError => e
|
694
|
+
if rescue_with
|
695
|
+
rescue_with.call(context, e)
|
696
|
+
else
|
697
|
+
raise e
|
698
|
+
end
|
699
|
+
end
|
477
700
|
end
|
478
701
|
|
479
|
-
|
702
|
+
if close_writer && destination.respond_to?(:close)
|
703
|
+
destination.close
|
704
|
+
end
|
705
|
+
|
706
|
+
return destination
|
480
707
|
end
|
481
708
|
|
482
709
|
# Log that the current record is being skipped, using
|
483
710
|
# data in context.record_inspect and context.skipmessage
|
484
711
|
def log_skip(context)
|
485
|
-
logger.debug "Skipped record #{context.
|
712
|
+
logger.debug "Skipped record #{context.record_inspect}: #{context.skipmessage}"
|
486
713
|
end
|
487
714
|
|
488
715
|
def reader_class
|
489
716
|
unless defined? @reader_class
|
490
|
-
|
717
|
+
reader_class_name = settings["reader_class_name"]
|
718
|
+
|
719
|
+
@reader_class = qualified_const_get(reader_class_name)
|
491
720
|
end
|
492
721
|
return @reader_class
|
493
722
|
end
|