traject 1.1.0 → 2.0.0.rc.1
- checksums.yaml +4 -4
- data/.travis.yml +20 -0
- data/README.md +85 -73
- data/doc/batch_execution.md +2 -6
- data/doc/other_commands.md +3 -5
- data/doc/settings.md +27 -38
- data/lib/traject/command_line.rb +1 -1
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +29 -11
- data/lib/traject/indexer/settings.rb +39 -13
- data/lib/traject/line_writer.rb +10 -6
- data/lib/traject/marc_reader.rb +2 -1
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +38 -48
- data/lib/traject/translation_map.rb +3 -0
- data/lib/traject/util.rb +13 -51
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/marc_geographic.yaml +2 -2
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/read_write_test.rb +0 -22
- data/test/indexer/settings_test.rb +24 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +5 -3
- data/test/test_support/demo_config.rb +0 -5
- data/test/translation_map_test.rb +9 -0
- data/traject.gemspec +18 -5
- metadata +77 -87
- data/lib/traject/marc4j_reader.rb +0 -153
- data/lib/traject/solrj_writer.rb +0 -351
- data/test/marc4j_reader_test.rb +0 -136
- data/test/solrj_writer_test.rb +0 -209
- data/vendor/solrj/README +0 -8
- data/vendor/solrj/build.xml +0 -39
- data/vendor/solrj/ivy.xml +0 -16
- data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
- data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
- data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
- data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
- data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
- data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
- data/vendor/solrj/lib/noggit-0.5.jar +0 -0
- data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
- data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
- data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
- data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
data/lib/traject/command_line.rb
CHANGED
@@ -12,7 +12,7 @@ module Traject
   #
   # A CommandLine object has a single persistent Indexer object it uses
   class CommandLine
-    # orig_argv is
+    # orig_argv is original one passed in, remaining_argv is after destructive
     # processing by slop, still has file args in it etc.
     attr_accessor :orig_argv, :remaining_argv
     attr_accessor :slop, :options
data/lib/traject/csv_writer.rb
ADDED
@@ -0,0 +1,34 @@
+require 'traject/delimited_writer'
+require 'csv'
+
+# A CSV-writer, for folks who like that sort of thing.
+# Use DelimitedWriter for non-CSV lines (e.g., tab-delimited)
+#
+#
+
+class Traject::CSVWriter < Traject::DelimitedWriter
+
+  def initialize(*args)
+    super
+    self.delimiter = nil # Let CSV take care of it
+  end
+
+  def _write(data)
+    @output_file << data
+  end
+
+  # Turn the output file into a CSV writer
+  def open_output_file
+    of = super
+    CSV.new(of)
+  end
+
+  # Let CSV take care of the comma escaping
+  def escape(x)
+    x = x.to_s
+    x.gsub! internal_delimiter, @eidelim
+    x
+  end
+
+
+end
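For orientation, here is a minimal sketch (not part of the gem) of a traject configuration that selects the new CSV writer by name; the file names, field names, and MARC tags below are illustrative only:

```ruby
# csv_out.rb -- hypothetical configuration; run as: traject -c csv_out.rb records.mrc
settings do
  provide "writer_class_name", "Traject::CSVWriter"
  provide "output_file", "out.csv"
  # which output_hash keys become columns, in order
  provide "delimited_writer.fields", "id,title"
end

to_field "id",    extract_marc("001", :first => true)
to_field "title", extract_marc("245a")
```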
data/lib/traject/delimited_writer.rb
ADDED
@@ -0,0 +1,110 @@
+require 'traject/line_writer'
+
+# A simple line writer that uses configuration to determine
+# how to produce a tab-delimited file
+#
+# Appropos settings:
+#
+# * output_file -- the file to write to
+# * output_stream -- the stream to write to, if defined and output_file is not
+# * delimited_writer.delimiter -- What to separate fields with; default is tab
+# * delimited_writer.internal_delimiter -- Delimiter _within_ a field, for multiple
+#   values. Default is pipe ( | )
+# * delimited_writer.fields -- comma-separated list of the fields to output
+# * delimited_writer.header (true/false) -- boolean that determines if we should output a header row. Default is true
+# * delimited_writer.escape -- If a value actually contains the delimited or internal_delimiter, what to do?
+#   If unset, will follow the procedure below. If set, will turn it into the character(s) given
+#
+#
+# If `delimited_writer.escape` is not set, the writer will automatically
+# escape delimiters/internal_delimiters in the following way:
+# * If the delimiter is a tab, replace tabs in values with a single space
+# * If the delimiter is anything else, prefix it with a backslash
+
+class Traject::DelimitedWriter < Traject::LineWriter
+
+  attr_reader :delimiter, :internal_delimiter, :edelim, :eidelim
+  attr_accessor :header
+
+  def initialize(settings)
+    super
+
+    # fields to output
+
+    begin
+      @fields = settings['delimited_writer.fields'].split(",")
+    rescue NoMethodError => e
+    end
+
+    if e or @fields.empty?
+      raise ArgumentError.new("#{self.class.name} must have a comma-delimited list of field names to output set in setting 'delimited_writer.fields'")
+    end
+
+    self.delimiter = settings['delimited_writer.delimiter'] || "\t"
+    self.internal_delimiter = settings['delimited_writer.internal_delimiter'] || '|'
+    self.header = settings['delimited_writer.header'].to_s != 'false'
+
+    # Output the header if need be
+    write_header if @header
+  end
+
+  def escaped_delimiter(d)
+    return nil if d.nil?
+    d == "\t" ? ' ' : '\\' + d
+  end
+
+  def delimiter=(d)
+    @delimiter = d
+    @edelim = escaped_delimiter(d)
+    self
+  end
+
+  def internal_delimiter=(d)
+    @internal_delimiter = d
+    @eidelim = escaped_delimiter(d)
+  end
+
+
+
+
+  def write_header
+    _write(@fields)
+  end
+
+  def _write(data)
+    output_file.puts(data.join(delimiter))
+  end
+
+  # Get the output values out of the context
+  def raw_output_values(context)
+    context.output_hash.values_at(*@fields)
+  end
+
+  # Escape the delimiters in whatever way has been defined
+  def escape(x)
+    x = x.to_s
+    x.gsub! @delimiter, @edelim if @delimiter
+    x.gsub! @internal_delimiter, @eidelim
+    x
+  end
+
+
+  # Derive actual output field values from the raw values
+  def output_values(raw)
+    raw.map do |x|
+      if x.is_a? Array
+        x.map!{|s| escape(s)}
+        x.join(@internal_delimiter)
+      else
+        escape(x)
+      end
+    end
+  end
+
+  # Spit out the escaped values joined by the delimiter
+  def serialize(context)
+    output_values(raw_output_values(context))
+  end
+
+
+end
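To illustrate the escaping rules documented above, a hedged sketch of driving the writer directly with a plain settings hash (the file name and values are examples, not from the gem):

```ruby
require 'traject'
require 'traject/delimited_writer'

# With the default tab delimiter, tabs inside values are replaced by a
# single space; the pipe internal delimiter gets a backslash prefix.
writer = Traject::DelimitedWriter.new(
  "delimited_writer.fields" => "id,title",  # required, else ArgumentError
  "output_file"             => "out.tsv"
)

writer.escape("has\ttab")  # => "has tab"
writer.escape("a|b")       # => "a\\|b"
```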
data/lib/traject/indexer.rb
CHANGED
@@ -6,13 +6,16 @@ require 'traject/thread_pool'
 
 require 'traject/indexer/settings'
 require 'traject/marc_reader'
-require 'traject/marc4j_reader'
 require 'traject/json_writer'
-require 'traject/
+require 'traject/solr_json_writer'
 
 require 'traject/macros/marc21'
 require 'traject/macros/basic'
 
+if defined? JRUBY_VERSION
+  require 'traject/marc4j_reader'
+end
+
 # This class does indexing for traject: Getting input records from a Reader
 # class, mapping the input records to an output hash, and then sending the output
 # hash off somewhere (usually Solr) with a Writer class.
@@ -53,8 +56,9 @@ require 'traject/macros/basic'
 # 2) Responds to the usual ruby #each, returning a source record from each #each.
 #    (Including Enumerable is prob a good idea too)
 #
-# The default reader is the Traject::
-# further customized by several settings in the Settings hash.
+# The default reader is the Traject::MarcReader, who's behavior is
+# further customized by several settings in the Settings hash. Jruby users
+# with specialized needs may want to look at the gem traject-marc4j_reader.
 #
 # Alternate readers can be set directly with the #reader_class= method, or
 # with the "reader_class_name" Setting, a String name of a class
@@ -72,14 +76,22 @@ require 'traject/macros/basic'
 # 4) Optionally implements a #skipped_record_count method, returning int count of records
 #    that were skipped due to errors (and presumably logged)
 #
-#
-#
-#
-#
+# Traject packages one solr writer: traject/solr_json_writer, which sends
+# in json format and works under both ruby and jruby, but only with solr version
+# >= 3.2. To index to an older solr installation, you'll need to use jruby and
+# install the gem traject-solrj_writer, which uses the solrj .jar underneath.
 #
 # You can set alternate writers by setting a Class object directly
 # with the #writer_class method, or by the 'writer_class_name' Setting,
-# with a String name of class meeting the Writer contract.
+# with a String name of class meeting the Writer contract. There are several
+# that ship with traject itself:
+#
+# * traject/json_writer (Traject::JsonWriter) -- write newline-delimied json files.
+# * traject/yaml_writer (Traject::YamlWriter) -- write pretty yaml file; very human-readable
+# * traject/debug_writer (Traject::DebugWriter) -- write a tab-delimited file where
+#   each line consists of the id, field, and value(s).
+# * traject/delimited_writer and traject/csv_writer -- write character-delimited files
+#   (default is tab-delimited) or comma-separated-value files.
 #
 class Traject::Indexer
 
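As a sketch of how one of these bundled writers gets selected in a configuration file (the output filename is hypothetical):

```ruby
settings do
  # Any class meeting the Writer contract can be named here; this picks
  # the newline-delimited JSON writer that ships with traject.
  provide "writer_class_name", "Traject::JsonWriter"
  provide "output_file", "records.ndj"
end
```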
@@ -310,7 +322,13 @@ class Traject::Indexer
     reader = self.reader!(io_stream)
     writer = self.writer!
 
-
+
+    processing_threads = settings["processing_thread_pool"].to_i
+    if processing_threads > 0 and !(defined? JRuby)
+      processing_threads = 0
+      logger.warn "Processing threads set to 0 because we're not running under JRuby"
+    end
+    thread_pool = Traject::ThreadPool.new(processing_threads)
 
     logger.info "   Indexer with reader: #{reader.class.name} and writer: #{writer.class.name}"
 
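A brief sketch of the setting this guard reads; per the warning in the code above, a nonzero value only has an effect under JRuby:

```ruby
settings do
  # Forced to 0 (inline processing) on MRI; under JRuby the map step
  # runs in a pool of three worker threads.
  provide "processing_thread_pool", 3
end
```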
@@ -326,7 +344,7 @@ class Traject::Indexer
     thread_pool.raise_collected_exception!
 
     if settings["debug_ascii_progress"].to_s == "true"
-      $stderr.write "." if count % settings["
+      $stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
     end
 
     if log_batch_size && (count % log_batch_size == 0)
data/lib/traject/indexer/settings.rb
CHANGED
@@ -1,4 +1,5 @@
 require 'hashie'
+require 'concurrent'
 
 class Traject::Indexer
 
@@ -22,9 +23,6 @@ class Traject::Indexer
     include Hashie::Extensions::MergeInitializer # can init with hash
     include Hashie::Extensions::IndifferentAccess
 
-    # Hashie bug Issue #100 https://github.com/intridea/hashie/pull/100
-    alias_method :store, :indifferent_writer
-
     def initialize(*args)
       super
       self.default_proc = lambda do |hash, key|
@@ -59,19 +57,37 @@ class Traject::Indexer
     def fill_in_defaults!
       self.reverse_merge!(self.class.defaults)
     end
+
+
+    def self.mri_defaults
+      {
+        "reader_class_name" => "Traject::MarcReader",
+        "writer_class_name" => "Traject::SolrJsonWriter",
+        "marc_source.type" => "binary",
+        "solrj_writer.batch_size" => 200,
+        "solrj_writer.thread_pool" => 1,
+        "processing_thread_pool" => self.default_processing_thread_pool,
+        "log.batch_size.severity" => "info"
+      }
+    end
 
-    def self.
-
-
-
-        "marc_source.type" => "binary",
-        "marc4j_reader.permissive" => true,
-        "solrj_writer.batch_size" => 200,
-        "solrj_writer.thread_pool" => 1,
-        "processing_thread_pool" => 3,
-        "log.batch_size.severity" => "info"
+    def self.jruby_defaults
+      {
+        'reader_class_name' => "Traject::Marc4JReader",
+        'marc4j_reader.permissive' => true
       }
     end
+
+
+    def self.defaults
+      return @@defaults if defined? @@defaults
+      default_settings = self.mri_defaults
+      if defined? JRUBY_VERSION
+        default_settings.merge! self.jruby_defaults
+      end
+
+      @@defaults = default_settings
+    end
 
     def inspect
       # Keep any key ending in password out of the inspect
@@ -80,5 +96,15 @@ class Traject::Indexer
         hash
       end.inspect
     end
+
+    protected
+    def self.default_processing_thread_pool
+      if ["jruby", "rbx"].include? ENV["RUBY_ENGINE"]
+        [1, Concurrent.processor_count - 1].max
+      else
+        1
+      end
+    end
+
   end
 end
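To make the defaults plumbing concrete, a sketch of how the merged defaults surface (the solr URL is a placeholder; results shown assume MRI):

```ruby
require 'traject'

settings = Traject::Indexer::Settings.new(
  "solr.url" => "http://localhost:8983/solr/example_core"
)
settings.fill_in_defaults!

settings["writer_class_name"]       # => "Traject::SolrJsonWriter"
settings["reader_class_name"]       # => "Traject::MarcReader" (Marc4JReader under JRuby)
settings["processing_thread_pool"]  # => 1 on MRI; [1, processors - 1].max on jruby/rbx
```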
data/lib/traject/line_writer.rb
CHANGED
@@ -16,14 +16,18 @@ require 'thread'
 # method. For instance, see JsonWriter.
 class Traject::LineWriter
   attr_reader :settings
-  attr_reader :write_mutex
+  attr_reader :write_mutex, :output_file
 
   def initialize(argSettings)
     @settings     = argSettings
     @write_mutex  = Mutex.new
 
     # trigger lazy loading now for thread-safety
-    output_file
+    @output_file = open_output_file
+  end
+
+  def _write(data)
+    output_file.puts(data)
   end
 
 
@@ -34,13 +38,13 @@ class Traject::LineWriter
   def put(context)
     serialized = serialize(context)
     write_mutex.synchronize do
-
+      _write(serialized)
     end
   end
 
-  def
+  def open_output_file
     unless defined? @output_file
-
+      of =
       if settings["output_file"]
         File.open(settings["output_file"], 'w:UTF-8')
       elsif settings["output_stream"]
@@ -49,7 +53,7 @@ class Traject::LineWriter
         $stdout
       end
     end
-    return
+    return of
   end
 
   def close
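With serialization now split from IO, a subclass only needs to supply #serialize; a hypothetical example (TitleOnlyWriter is not part of the gem):

```ruby
require 'traject/line_writer'

# Hypothetical writer emitting one line per record: LineWriter#put
# calls #serialize, takes the write mutex, and hands the result to
# the new #_write hook.
class TitleOnlyWriter < Traject::LineWriter
  def serialize(context)
    Array(context.output_hash["title"]).join("; ")
  end
end
```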
data/lib/traject/marc_reader.rb
CHANGED
@@ -5,7 +5,8 @@ require 'traject/ndj_reader'
 # can read MARC ISO 2709 ('binary'), MARC-XML, and Marc-in-json (newline-delimited-json).
 #
 # Marc4JReader is an alternative to this class, powered by Marc4J. You may be interested
-# in comparing for performance, under your particular use case.
+# in comparing for performance, under your particular use case. To use it, you'll need
+# the gem traject-marc4j_reader.
 #
 # By default assumes binary MARC encoding, please set marc_source.type setting
 # for XML or json. If binary, please set marc_source.encoding with char encoding.
data/lib/traject/solr_json_writer.rb
ADDED
@@ -0,0 +1,277 @@
+require 'yell'
+
+require 'traject'
+require 'traject/util'
+require 'traject/qualified_const_get'
+require 'traject/thread_pool'
+
+require 'json'
+require 'httpclient'
+
+require 'uri'
+require 'thread' # for Mutex/Queue
+require 'concurrent' # for atomic_fixnum
+
+# Write to Solr using the JSON interface; only works for Solr >= 3.2
+#
+# This should work under both MRI and JRuby, with JRuby getting much
+# better performance due to the threading model.
+#
+# Relevant settings
+#
+# * solr.url (optional if solr.update_url is set) The URL to the solr core to index into
+#
+# * solr.update_url: The actual update url. If unset, we'll first see if
+#   "#{solr.url}/update/json" exists, and if not use "#{solr.url}/update"
+#
+# * solr_writer.batch_size: How big a batch to send to solr. Default is 100.
+#   My tests indicate that this setting doesn't change overall index speed by a ton.
+#
+# * solr_writer.thread_pool: How many threads to use for the writer. Default is 1.
+#   Likely useful even under MRI since thread will be waiting on Solr for some time.
+#
+# * solr_writer.max_skipped: How many records skipped due to errors before we
+#   bail out with a fatal error? Set to -1 for unlimited skips. Default 0,
+#   raise and abort on a single record that could not be added to Solr.
+#
+# * solr_writer.commit_on_close: Set to true (or "true") if you want to commit at the
+#   end of the indexing run. (Old "solrj_writer.commit_on_close" supported for backwards
+#   compat only.)
+#
+# * solr_writer.commit_timeout: If commit_on_close, how long to wait for Solr before
+#   giving up as a timeout. Default 10 minutes. Solr can be slow.
+#
+# * solr_json_writer.http_client Mainly intended for testing, set your own HTTPClient
+#   or mock object to be used for HTTP.
+
+
+class Traject::SolrJsonWriter
+  include Traject::QualifiedConstGet
+
+  DEFAULT_MAX_SKIPPED = 0
+  DEFAULT_BATCH_SIZE  = 100
+
+  # The passed-in settings
+  attr_reader :settings, :thread_pool_size
+
+  # A queue to hold documents before sending to solr
+  attr_reader :batched_queue
+
+  def initialize(argSettings)
+    @settings = Traject::Indexer::Settings.new(argSettings)
+
+    # Set max errors
+    @max_skipped = (@settings['solr_writer.max_skipped'] || DEFAULT_MAX_SKIPPED).to_i
+    if @max_skipped < 0
+      @max_skipped = nil
+    end
+
+    @http_client = @settings["solr_json_writer.http_client"] || HTTPClient.new
+
+    @batch_size = (settings["solr_writer.batch_size"] || DEFAULT_BATCH_SIZE).to_i
+    @batch_size = 1 if @batch_size < 1
+
+    # Store error count in an AtomicInteger, so multi threads can increment
+    # it safely, if we're threaded.
+    @skipped_record_incrementer = Concurrent::AtomicFixnum.new(0)
+
+
+    # How many threads to use for the writer?
+    # if our thread pool settings are 0, it'll just create a null threadpool that
+    # executes in calling context.
+    @thread_pool_size = (@settings["solr_writer.thread_pool"] || 1).to_i
+
+    @batched_queue = Queue.new
+    @thread_pool = Traject::ThreadPool.new(@thread_pool_size)
+
+    # old setting solrj_writer supported for backwards compat, as we make
+    # this the new default writer.
+    @commit_on_close = (settings["solr_writer.commit_on_close"] || settings["solrj_writer.commit_on_close"]).to_s == "true"
+
+    # Figure out where to send updates
+    @solr_update_url = self.determine_solr_update_url
+
+    logger.info("   #{self.class.name} writing to '#{@solr_update_url}' in batches of #{@batch_size} with #{@thread_pool_size} bg threads")
+  end
+
+
+  # Add a single context to the queue, ready to be sent to solr
+  def put(context)
+    @thread_pool.raise_collected_exception!
+
+    @batched_queue << context
+    if @batched_queue.size >= @batch_size
+      batch = Traject::Util.drain_queue(@batched_queue)
+      @thread_pool.maybe_in_thread_pool(batch) {|batch_arg| send_batch(batch_arg) }
+    end
+  end
+
+  # Send the given batch of contexts. If something goes wrong, send
+  # them one at a time.
+  # @param [Array<Traject::Indexer::Context>] an array of contexts
+  def send_batch(batch)
+    return if batch.empty?
+    json_package = JSON.generate(batch.map { |c| c.output_hash })
+    begin
+      resp = @http_client.post @solr_update_url, json_package, "Content-type" => "application/json"
+    rescue StandardError => exception
+    end
+
+    if exception || resp.status != 200
+      error_message = exception ?
+        Traject::Util.exception_to_log_message(exception) :
+        "Solr response: #{resp.status}: #{resp.body}"
+
+      logger.error "Error in Solr batch add. Will retry documents individually at performance penalty: #{error_message}"
+
+      batch.each do |c|
+        send_single(c)
+      end
+    end
+  end
+
+
+  # Send a single context to Solr, logging an error if need be
+  # @param [Traject::Indexer::Context] c The context whose document you want to send
+  def send_single(c)
+    json_package = JSON.generate([c.output_hash])
+    begin
+      resp = @http_client.post @solr_update_url, json_package, "Content-type" => "application/json"
+      # Catch Timeouts and network errors as skipped records, but otherwise
+      # allow unexpected errors to propagate up.
+    rescue HTTPClient::TimeoutError, SocketError, Errno::ECONNREFUSED => exception
+    end
+
+    if exception || resp.status != 200
+      if exception
+        msg = Traject::Util.exception_to_log_message(e)
+      else
+        msg = "Solr error response: #{resp.status}: #{resp.body}"
+      end
+      logger.error "Could not add record #{record_id_from_context c} at source file position #{c.position}: #{msg}"
+      logger.debug(c.source_record.to_s)
+
+      @skipped_record_incrementer.increment
+      if @max_skipped and skipped_record_count > @max_skipped
+        raise RuntimeError.new("#{self.class.name}: Exceeded maximum number of skipped records (#{@max_skipped}): aborting")
+      end
+
+    end
+
+  end
+
+
+  # Get the logger from the settings, or default to an effectively null logger
+  def logger
+    settings["logger"] ||= Yell.new(STDERR, :level => "gt.fatal") # null logger
+  end
+
+  # Returns MARC 001, then a slash, then output_hash["id"] -- if both
+  # are present. Otherwise may return just one, or even an empty string.
+  def record_id_from_context(context)
+    marc_id = context.source_record && context.source_record['001'] && context.source_record['001'].value
+    output_id = context.output_hash["id"]
+
+    return [marc_id, output_id].compact.join("/")
+  end
+
+
+  # On close, we need to (a) raise any exceptions we might have, (b) send off
+  # the last (possibly empty) batch, and (c) commit if instructed to do so
+  # via the solr_writer.commit_on_close setting.
+  def close
+    @thread_pool.raise_collected_exception!
+
+    # Finish off whatever's left. Do it in the thread pool for
+    # consistency, and to ensure expected order of operations, so
+    # it goes to the end of the queue behind any other work.
+    batch = Traject::Util.drain_queue(@batched_queue)
+    if batch.length > 0
+      @thread_pool.maybe_in_thread_pool { send_batch(batch) }
+    end
+
+    # Wait for shutdown, and time it.
+    logger.debug "#{self.class.name}: Shutting down thread pool, waiting if needed..."
+    elapsed = @thread_pool.shutdown_and_wait
+    if elapsed > 60
+      logger.warn "Waited #{elapsed} seconds for all threads, you may want to increase solr_writer.thread_pool (currently #{@settings["solr_writer.thread_pool"]})"
+    end
+    logger.debug "#{self.class.name}: Thread pool shutdown complete"
+    logger.warn "#{self.class.name}: #{skipped_record_count} skipped records" if skipped_record_count > 0
+
+    # check again now that we've waited, there could still be some
+    # that didn't show up before.
+    @thread_pool.raise_collected_exception!
+
+    # Commit if we're supposed to
+    if @commit_on_close
+      commit
+    end
+  end
+
+
+  # Send a commit
+  def commit
+    logger.info "#{self.class.name} sending commit to solr at url #{@solr_update_url}..."
+
+    original_timeout = @http_client.receive_timeout
+
+    @http_client.receive_timeout = (settings["commit_timeout"] || (10 * 60)).to_i
+
+    resp = @http_client.get(@solr_update_url, {"commit" => 'true'})
+    unless resp.status == 200
+      raise RuntimeError.new("Could not commit to Solr: #{resp.status} #{resp.body}")
+    end
+
+    @http_client.receive_timeout = original_timeout
+  end
+
+
+  # Return count of encountered skipped records. Most accurate to call
+  # it after #close, in which case it should include full count, even
+  # under async thread_pool.
+  def skipped_record_count
+    @skipped_record_incrementer.value
+  end
+
+
+  # Relatively complex logic to determine if we have a valid URL and what it is
+  def determine_solr_update_url
+    if settings['solr.update_url']
+      check_solr_update_url(settings['solr.update_url'])
+    else
+      derive_solr_update_url_from_solr_url(settings['solr.url'])
+    end
+  end
+
+
+  # If we've got a solr.update_url, make sure it's ok
+  def check_solr_update_url(url)
+    unless url =~ /^#{URI::regexp}$/
+      raise ArgumentError.new("#{self.class.name} setting `solr.update_url` doesn't look like a URL: `#{url}`")
+    end
+    url
+  end
+
+  def derive_solr_update_url_from_solr_url(url)
+    # Nil? Then we bail
+    if url.nil?
+      raise ArgumentError.new("#{self.class.name}: Neither solr.update_url nor solr.url set; need at least one")
+    end
+
+    # Not a URL? Bail
+    unless url =~ /^#{URI::regexp}$/
+      raise ArgumentError.new("#{self.class.name} setting `solr.url` doesn't look like a URL: `#{url}`")
+    end
+
+    # First, try the /update/json handler
+    candidate = [url.chomp('/'), 'update', 'json'].join('/')
+    resp = @http_client.get(candidate)
+    if resp.status == 404
+      candidate = [url.chomp('/'), 'update'].join('/')
+    end
+    candidate
+  end
+
+
+end
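Pulling those settings together, a hedged example configuration for the new default writer (the URL, batch size, and pool size are placeholders):

```ruby
settings do
  provide "solr.url", "http://localhost:8983/solr/example_core"
  # update URL is derived: tries .../update/json first, falls back to .../update
  provide "solr_writer.batch_size", 100
  provide "solr_writer.thread_pool", 2
  provide "solr_writer.max_skipped", 10       # abort after 10 failed records
  provide "solr_writer.commit_on_close", "true"
end
```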