traject 1.1.0 → 2.0.0.rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +20 -0
- data/README.md +85 -73
- data/doc/batch_execution.md +2 -6
- data/doc/other_commands.md +3 -5
- data/doc/settings.md +27 -38
- data/lib/traject/command_line.rb +1 -1
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +29 -11
- data/lib/traject/indexer/settings.rb +39 -13
- data/lib/traject/line_writer.rb +10 -6
- data/lib/traject/marc_reader.rb +2 -1
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +38 -48
- data/lib/traject/translation_map.rb +3 -0
- data/lib/traject/util.rb +13 -51
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/marc_geographic.yaml +2 -2
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/read_write_test.rb +0 -22
- data/test/indexer/settings_test.rb +24 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +5 -3
- data/test/test_support/demo_config.rb +0 -5
- data/test/translation_map_test.rb +9 -0
- data/traject.gemspec +18 -5
- metadata +77 -87
- data/lib/traject/marc4j_reader.rb +0 -153
- data/lib/traject/solrj_writer.rb +0 -351
- data/test/marc4j_reader_test.rb +0 -136
- data/test/solrj_writer_test.rb +0 -209
- data/vendor/solrj/README +0 -8
- data/vendor/solrj/build.xml +0 -39
- data/vendor/solrj/ivy.xml +0 -16
- data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
- data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
- data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
- data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
- data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
- data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
- data/vendor/solrj/lib/noggit-0.5.jar +0 -0
- data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
- data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
- data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
- data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
data/lib/traject/command_line.rb
CHANGED
@@ -12,7 +12,7 @@ module Traject
   #
   # A CommandLine object has a single persistent Indexer object it uses
   class CommandLine
-    # orig_argv is
+    # orig_argv is original one passed in, remaining_argv is after destructive
     # processing by slop, still has file args in it etc.
     attr_accessor :orig_argv, :remaining_argv
     attr_accessor :slop, :options
data/lib/traject/csv_writer.rb
ADDED
@@ -0,0 +1,34 @@
+require 'traject/delimited_writer'
+require 'csv'
+
+# A CSV-writer, for folks who like that sort of thing.
+# Use DelimitedWriter for non-CSV lines (e.g., tab-delimited)
+#
+#
+
+class Traject::CSVWriter < Traject::DelimitedWriter
+
+  def initialize(*args)
+    super
+    self.delimiter = nil # Let CSV take care of it
+  end
+
+  def _write(data)
+    @output_file << data
+  end
+
+  # Turn the output file into a CSV writer
+  def open_output_file
+    of = super
+    CSV.new(of)
+  end
+
+  # Let CSV take care of the comma escaping
+  def escape(x)
+    x = x.to_s
+    x.gsub! internal_delimiter, @eidelim
+    x
+  end
+
+
+end
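For orientation, a writer like this is normally selected through traject settings rather than instantiated by hand. A minimal configuration sketch, assuming a hypothetical output path and field list (only `delimited_writer.fields` is required, per the DelimitedWriter docs below):

```ruby
# Hypothetical traject config using the new CSV writer.
require 'traject'
require 'traject/csv_writer'

settings do
  provide "writer_class_name", "Traject::CSVWriter"
  provide "output_file", "records.csv"                 # example path
  provide "delimited_writer.fields", "id,title,author" # example field list; required
end
```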
data/lib/traject/delimited_writer.rb
ADDED
@@ -0,0 +1,110 @@
+require 'traject/line_writer'
+
+# A simple line writer that uses configuration to determine
+# how to produce a tab-delimited file
+#
+# Appropos settings:
+#
+# * output_file -- the file to write to
+# * output_stream -- the stream to write to, if defined and output_file is not
+# * delimited_writer.delimiter -- What to separate fields with; default is tab
+# * delimited_writer.internal_delimiter -- Delimiter _within_ a field, for multiple
+#   values. Default is pipe ( | )
+# * delimited_writer.fields -- comma-separated list of the fields to output
+# * delimited_writer.header (true/false) -- boolean that determines if we should output a header row. Default is true
+# * delimited_writer.escape -- If a value actually contains the delimited or internal_delimiter, what to do?
+#   If unset, will follow the procedure below. If set, will turn it into the character(s) given
+#
+#
+# If `delimited_writer.escape` is not set, the writer will automatically
+# escape delimiters/internal_delimiters in the following way:
+# * If the delimiter is a tab, replace tabs in values with a single space
+# * If the delimiter is anything else, prefix it with a backslash
+
+class Traject::DelimitedWriter < Traject::LineWriter
+
+  attr_reader :delimiter, :internal_delimiter, :edelim, :eidelim
+  attr_accessor :header
+
+  def initialize(settings)
+    super
+
+    # fields to output
+
+    begin
+      @fields = settings['delimited_writer.fields'].split(",")
+    rescue NoMethodError => e
+    end
+
+    if e or @fields.empty?
+      raise ArgumentError.new("#{self.class.name} must have a comma-delimited list of field names to output set in setting 'delimited_writer.fields'")
+    end
+
+    self.delimiter = settings['delimited_writer.delimiter'] || "\t"
+    self.internal_delimiter = settings['delimited_writer.internal_delimiter'] || '|'
+    self.header = settings['delimited_writer.header'].to_s != 'false'
+
+    # Output the header if need be
+    write_header if @header
+  end
+
+  def escaped_delimiter(d)
+    return nil if d.nil?
+    d == "\t" ? ' ' : '\\' + d
+  end
+
+  def delimiter=(d)
+    @delimiter = d
+    @edelim = escaped_delimiter(d)
+    self
+  end
+
+  def internal_delimiter=(d)
+    @internal_delimiter = d
+    @eidelim = escaped_delimiter(d)
+  end
+
+
+
+
+  def write_header
+    _write(@fields)
+  end
+
+  def _write(data)
+    output_file.puts(data.join(delimiter))
+  end
+
+  # Get the output values out of the context
+  def raw_output_values(context)
+    context.output_hash.values_at(*@fields)
+  end
+
+  # Escape the delimiters in whatever way has been defined
+  def escape(x)
+    x = x.to_s
+    x.gsub! @delimiter, @edelim if @delimiter
+    x.gsub! @internal_delimiter, @eidelim
+    x
+  end
+
+
+  # Derive actual output field values from the raw values
+  def output_values(raw)
+    raw.map do |x|
+      if x.is_a? Array
+        x.map!{|s| escape(s)}
+        x.join(@internal_delimiter)
+      else
+        escape(x)
+      end
+    end
+  end
+
+  # Spit out the escaped values joined by the delimiter
+  def serialize(context)
+    output_values(raw_output_values(context))
+  end
+
+
+end
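To make the default escaping rules concrete, here is a standalone sketch in plain Ruby that mirrors `escaped_delimiter` rather than calling the class itself:

```ruby
# A non-tab delimiter embedded in a value gets a backslash prefix;
# a tab delimiter would instead be replaced with a single space.
delimiter = ','
edelim    = (delimiter == "\t") ? ' ' : '\\' + delimiter

puts "Doe, Jane".gsub(delimiter, edelim) # => Doe\, Jane
```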
data/lib/traject/indexer.rb
CHANGED
@@ -6,13 +6,16 @@ require 'traject/thread_pool'
 
 require 'traject/indexer/settings'
 require 'traject/marc_reader'
-require 'traject/marc4j_reader'
 require 'traject/json_writer'
-require 'traject/
+require 'traject/solr_json_writer'
 
 require 'traject/macros/marc21'
 require 'traject/macros/basic'
 
+if defined? JRUBY_VERSION
+  require 'traject/marc4j_reader'
+end
+
 # This class does indexing for traject: Getting input records from a Reader
 # class, mapping the input records to an output hash, and then sending the output
 # hash off somewhere (usually Solr) with a Writer class.
@@ -53,8 +56,9 @@ require 'traject/macros/basic'
 # 2) Responds to the usual ruby #each, returning a source record from each #each.
 #    (Including Enumerable is prob a good idea too)
 #
-# The default reader is the Traject::
-# further customized by several settings in the Settings hash.
+# The default reader is the Traject::MarcReader, who's behavior is
+# further customized by several settings in the Settings hash. Jruby users
+# with specialized needs may want to look at the gem traject-marc4j_reader.
 #
 # Alternate readers can be set directly with the #reader_class= method, or
 # with the "reader_class_name" Setting, a String name of a class
@@ -72,14 +76,22 @@ require 'traject/macros/basic'
 # 4) Optionally implements a #skipped_record_count method, returning int count of records
 #    that were skipped due to errors (and presumably logged)
 #
-#
-#
-#
-#
+# Traject packages one solr writer: traject/solr_json_writer, which sends
+# in json format and works under both ruby and jruby, but only with solr version
+# >= 3.2. To index to an older solr installation, you'll need to use jruby and
+# install the gem traject-solrj_writer, which uses the solrj .jar underneath.
 #
 # You can set alternate writers by setting a Class object directly
 # with the #writer_class method, or by the 'writer_class_name' Setting,
-# with a String name of class meeting the Writer contract.
+# with a String name of class meeting the Writer contract. There are several
+# that ship with traject itself:
+#
+# * traject/json_writer (Traject::JsonWriter) -- write newline-delimied json files.
+# * traject/yaml_writer (Traject::YamlWriter) -- write pretty yaml file; very human-readable
+# * traject/debug_writer (Traject::DebugWriter) -- write a tab-delimited file where
+#   each line consists of the id, field, and value(s).
+# * traject/delimited_writer and traject/csv_writer -- write character-delimited files
+#   (default is tab-delimited) or comma-separated-value files.
 #
 class Traject::Indexer
 
@@ -310,7 +322,13 @@ class Traject::Indexer
     reader  = self.reader!(io_stream)
     writer  = self.writer!
 
-
+
+    processing_threads = settings["processing_thread_pool"].to_i
+    if processing_threads > 0 and !(defined? JRuby)
+      processing_threads = 0
+      logger.warn "Processing threads set to 0 because we're not running under JRuby"
+    end
+    thread_pool = Traject::ThreadPool.new(processing_threads)
 
     logger.info "   Indexer with reader: #{reader.class.name} and writer: #{writer.class.name}"
 
@@ -326,7 +344,7 @@ class Traject::Indexer
       thread_pool.raise_collected_exception!
 
       if settings["debug_ascii_progress"].to_s == "true"
-        $stderr.write "." if count % settings["
+        $stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
       end
 
       if log_batch_size && (count % log_batch_size == 0)
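The hunk above forces `processing_thread_pool` to 0 outside JRuby. Where threads actually help, the setting can be raised in a config file; a sketch:

```ruby
# Request a larger processing pool. Effective under JRuby; under MRI
# the indexer resets it to 0 and logs the warning shown above.
settings do
  provide "processing_thread_pool", 3
end
```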
data/lib/traject/indexer/settings.rb
CHANGED
@@ -1,4 +1,5 @@
 require 'hashie'
+require 'concurrent'
 
 class Traject::Indexer
 
@@ -22,9 +23,6 @@ class Traject::Indexer
     include Hashie::Extensions::MergeInitializer # can init with hash
     include Hashie::Extensions::IndifferentAccess
 
-    # Hashie bug Issue #100 https://github.com/intridea/hashie/pull/100
-    alias_method :store, :indifferent_writer
-
     def initialize(*args)
       super
       self.default_proc = lambda do |hash, key|
@@ -59,19 +57,37 @@ class Traject::Indexer
     def fill_in_defaults!
       self.reverse_merge!(self.class.defaults)
     end
+
+
+    def self.mri_defaults
+      {
+        "reader_class_name" => "Traject::MarcReader",
+        "writer_class_name" => "Traject::SolrJsonWriter",
+        "marc_source.type" => "binary",
+        "solrj_writer.batch_size" => 200,
+        "solrj_writer.thread_pool" => 1,
+        "processing_thread_pool" => self.default_processing_thread_pool,
+        "log.batch_size.severity" => "info"
+      }
+    end
 
-    def self.
-
-
-
-      "marc_source.type" => "binary",
-      "marc4j_reader.permissive" => true,
-      "solrj_writer.batch_size" => 200,
-      "solrj_writer.thread_pool" => 1,
-      "processing_thread_pool" => 3,
-      "log.batch_size.severity" => "info"
+    def self.jruby_defaults
+      {
+        'reader_class_name' => "Traject::Marc4JReader",
+        'marc4j_reader.permissive' => true
       }
     end
+
+
+    def self.defaults
+      return @@defaults if defined? @@defaults
+      default_settings = self.mri_defaults
+      if defined? JRUBY_VERSION
+        default_settings.merge! self.jruby_defaults
+      end
+
+      @@defaults = default_settings
+    end
 
     def inspect
       # Keep any key ending in password out of the inspect
@@ -80,5 +96,15 @@ class Traject::Indexer
         hash
       end.inspect
     end
+
+    protected
+    def self.default_processing_thread_pool
+      if ["jruby", "rbx"].include? ENV["RUBY_ENGINE"]
+        [1, Concurrent.processor_count - 1].max
+      else
+        1
+      end
+    end
+
   end
 end
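The memoized defaults are observable directly. A quick sketch; the values shown assume MRI, where `default_processing_thread_pool` returns 1:

```ruby
require 'traject'

# Computed once and cached in @@defaults; platform-dependent.
puts Traject::Indexer::Settings.defaults["writer_class_name"]
# => Traject::SolrJsonWriter
puts Traject::Indexer::Settings.defaults["processing_thread_pool"]
# => 1 (JRuby/rbx get processor_count - 1, minimum 1)
```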
data/lib/traject/line_writer.rb
CHANGED
@@ -16,14 +16,18 @@ require 'thread'
 #    method. For instance, see JsonWriter.
 class Traject::LineWriter
   attr_reader :settings
-  attr_reader :write_mutex
+  attr_reader :write_mutex, :output_file
 
   def initialize(argSettings)
     @settings     = argSettings
     @write_mutex  = Mutex.new
 
     # trigger lazy loading now for thread-safety
-    output_file
+    @output_file = open_output_file
+  end
+
+  def _write(data)
+    output_file.puts(data)
   end
 
 
@@ -34,13 +38,13 @@ class Traject::LineWriter
   def put(context)
     serialized = serialize(context)
     write_mutex.synchronize do
-
+      _write(serialized)
     end
   end
 
-  def
+  def open_output_file
     unless defined? @output_file
-
+      of =
       if settings["output_file"]
         File.open(settings["output_file"], 'w:UTF-8')
       elsif settings["output_stream"]
@@ -49,7 +53,7 @@ class Traject::LineWriter
         $stdout
       end
     end
-    return
+    return of
   end
 
   def close
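Since `put` now synchronizes and delegates to `_write`, a subclass typically only supplies `serialize`. A hypothetical example (not part of the gem):

```ruby
require 'traject/line_writer'

# Hypothetical writer emitting one "id: title" line per record.
class MyLabelWriter < Traject::LineWriter
  def serialize(context)
    h = context.output_hash
    "#{Array(h['id']).first}: #{Array(h['title']).first}"
  end
end
```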
data/lib/traject/marc_reader.rb
CHANGED
@@ -5,7 +5,8 @@ require 'traject/ndj_reader'
 # can read MARC ISO 2709 ('binary'), MARC-XML, and Marc-in-json (newline-delimited-json).
 #
 # Marc4JReader is an alternative to this class, powered by Marc4J. You may be interested
-# in comparing for performance, under your particular use case.
+# in comparing for performance, under your particular use case. To use it, you'll need
+# the gem traject-marc4j_reader.
 #
 # By default assumes binary MARC encoding, please set marc_source.type setting
 # for XML or json. If binary, please set marc_source.encoding with char encoding.
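As the comment says, the source format is setting-driven; a config sketch for reading MARC-XML instead of the default binary:

```ruby
settings do
  provide "reader_class_name", "Traject::MarcReader"
  provide "marc_source.type", "xml" # default is "binary"; "json" also supported
end
```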
data/lib/traject/solr_json_writer.rb
ADDED
@@ -0,0 +1,277 @@
+require 'yell'
+
+require 'traject'
+require 'traject/util'
+require 'traject/qualified_const_get'
+require 'traject/thread_pool'
+
+require 'json'
+require 'httpclient'
+
+require 'uri'
+require 'thread' # for Mutex/Queue
+require 'concurrent' # for atomic_fixnum
+
+# Write to Solr using the JSON interface; only works for Solr >= 3.2
+#
+# This should work under both MRI and JRuby, with JRuby getting much
+# better performance due to the threading model.
+#
+# Relevant settings
+#
+# * solr.url (optional if solr.update_url is set) The URL to the solr core to index into
+#
+# * solr.update_url: The actual update url. If unset, we'll first see if
+#   "#{solr.url}/update/json" exists, and if not use "#{solr.url}/update"
+#
+# * solr_writer.batch_size: How big a batch to send to solr. Default is 100.
+#   My tests indicate that this setting doesn't change overall index speed by a ton.
+#
+# * solr_writer.thread_pool: How many threads to use for the writer. Default is 1.
+#   Likely useful even under MRI since thread will be waiting on Solr for some time.
+#
+# * solr_writer.max_skipped: How many records skipped due to errors before we
+#   bail out with a fatal error? Set to -1 for unlimited skips. Default 0,
+#   raise and abort on a single record that could not be added to Solr.
+#
+# * solr_writer.commit_on_close: Set to true (or "true") if you want to commit at the
+#   end of the indexing run. (Old "solrj_writer.commit_on_close" supported for backwards
+#   compat only.)
+#
+# * solr_writer.commit_timeout: If commit_on_close, how long to wait for Solr before
+#   giving up as a timeout. Default 10 minutes. Solr can be slow.
+#
+# * solr_json_writer.http_client Mainly intended for testing, set your own HTTPClient
+#   or mock object to be used for HTTP.
+
+
+class Traject::SolrJsonWriter
+  include Traject::QualifiedConstGet
+
+  DEFAULT_MAX_SKIPPED = 0
+  DEFAULT_BATCH_SIZE = 100
+
+  # The passed-in settings
+  attr_reader :settings, :thread_pool_size
+
+  # A queue to hold documents before sending to solr
+  attr_reader :batched_queue
+
+  def initialize(argSettings)
+    @settings = Traject::Indexer::Settings.new(argSettings)
+
+    # Set max errors
+    @max_skipped = (@settings['solr_writer.max_skipped'] || DEFAULT_MAX_SKIPPED).to_i
+    if @max_skipped < 0
+      @max_skipped = nil
+    end
+
+    @http_client = @settings["solr_json_writer.http_client"] || HTTPClient.new
+
+    @batch_size = (settings["solr_writer.batch_size"] || DEFAULT_BATCH_SIZE).to_i
+    @batch_size = 1 if @batch_size < 1
+
+    # Store error count in an AtomicInteger, so multi threads can increment
+    # it safely, if we're threaded.
+    @skipped_record_incrementer = Concurrent::AtomicFixnum.new(0)
+
+
+    # How many threads to use for the writer?
+    # if our thread pool settings are 0, it'll just create a null threadpool that
+    # executes in calling context.
+    @thread_pool_size = (@settings["solr_writer.thread_pool"] || 1).to_i
+
+    @batched_queue = Queue.new
+    @thread_pool = Traject::ThreadPool.new(@thread_pool_size)
+
+    # old setting solrj_writer supported for backwards compat, as we make
+    # this the new default writer.
+    @commit_on_close = (settings["solr_writer.commit_on_close"] || settings["solrj_writer.commit_on_close"]).to_s == "true"
+
+    # Figure out where to send updates
+    @solr_update_url = self.determine_solr_update_url
+
+    logger.info("   #{self.class.name} writing to '#{@solr_update_url}' in batches of #{@batch_size} with #{@thread_pool_size} bg threads")
+  end
+
+
+  # Add a single context to the queue, ready to be sent to solr
+  def put(context)
+    @thread_pool.raise_collected_exception!
+
+    @batched_queue << context
+    if @batched_queue.size >= @batch_size
+      batch = Traject::Util.drain_queue(@batched_queue)
+      @thread_pool.maybe_in_thread_pool(batch) {|batch_arg| send_batch(batch_arg) }
+    end
+  end
+
+  # Send the given batch of contexts. If something goes wrong, send
+  # them one at a time.
+  # @param [Array<Traject::Indexer::Context>] an array of contexts
+  def send_batch(batch)
+    return if batch.empty?
+    json_package = JSON.generate(batch.map { |c| c.output_hash })
+    begin
+      resp = @http_client.post @solr_update_url, json_package, "Content-type" => "application/json"
+    rescue StandardError => exception
+    end
+
+    if exception || resp.status != 200
+      error_message = exception ?
+        Traject::Util.exception_to_log_message(exception) :
+        "Solr response: #{resp.status}: #{resp.body}"
+
+      logger.error "Error in Solr batch add. Will retry documents individually at performance penalty: #{error_message}"
+
+      batch.each do |c|
+        send_single(c)
+      end
+    end
+  end
+
+
+  # Send a single context to Solr, logging an error if need be
+  # @param [Traject::Indexer::Context] c The context whose document you want to send
+  def send_single(c)
+    json_package = JSON.generate([c.output_hash])
+    begin
+      resp = @http_client.post @solr_update_url, json_package, "Content-type" => "application/json"
+    # Catch Timeouts and network errors as skipped records, but otherwise
+    # allow unexpected errors to propagate up.
+    rescue HTTPClient::TimeoutError, SocketError, Errno::ECONNREFUSED => exception
+    end
+
+    if exception || resp.status != 200
+      if exception
+        msg = Traject::Util.exception_to_log_message(e)
+      else
+        msg = "Solr error response: #{resp.status}: #{resp.body}"
+      end
+      logger.error "Could not add record #{record_id_from_context c} at source file position #{c.position}: #{msg}"
+      logger.debug(c.source_record.to_s)
+
+      @skipped_record_incrementer.increment
+      if @max_skipped and skipped_record_count > @max_skipped
+        raise RuntimeError.new("#{self.class.name}: Exceeded maximum number of skipped records (#{@max_skipped}): aborting")
+      end
+
+    end
+
+  end
+
+
+  # Get the logger from the settings, or default to an effectively null logger
+  def logger
+    settings["logger"] ||= Yell.new(STDERR, :level => "gt.fatal") # null logger
+  end
+
+  # Returns MARC 001, then a slash, then output_hash["id"] -- if both
+  # are present. Otherwise may return just one, or even an empty string.
+  def record_id_from_context(context)
+    marc_id = context.source_record && context.source_record['001'] && context.source_record['001'].value
+    output_id = context.output_hash["id"]
+
+    return [marc_id, output_id].compact.join("/")
+  end
+
+
+  # On close, we need to (a) raise any exceptions we might have, (b) send off
+  # the last (possibly empty) batch, and (c) commit if instructed to do so
+  # via the solr_writer.commit_on_close setting.
+  def close
+    @thread_pool.raise_collected_exception!
+
+    # Finish off whatever's left. Do it in the thread pool for
+    # consistency, and to ensure expected order of operations, so
+    # it goes to the end of the queue behind any other work.
+    batch = Traject::Util.drain_queue(@batched_queue)
+    if batch.length > 0
+      @thread_pool.maybe_in_thread_pool { send_batch(batch) }
+    end
+
+    # Wait for shutdown, and time it.
+    logger.debug "#{self.class.name}: Shutting down thread pool, waiting if needed..."
+    elapsed = @thread_pool.shutdown_and_wait
+    if elapsed > 60
+      logger.warn "Waited #{elapsed} seconds for all threads, you may want to increase solr_writer.thread_pool (currently #{@settings["solr_writer.thread_pool"]})"
+    end
+    logger.debug "#{self.class.name}: Thread pool shutdown complete"
+    logger.warn "#{self.class.name}: #{skipped_record_count} skipped records" if skipped_record_count > 0
+
+    # check again now that we've waited, there could still be some
+    # that didn't show up before.
+    @thread_pool.raise_collected_exception!
+
+    # Commit if we're supposed to
+    if @commit_on_close
+      commit
+    end
+  end
+
+
+  # Send a commit
+  def commit
+    logger.info "#{self.class.name} sending commit to solr at url #{@solr_update_url}..."
+
+    original_timeout = @http_client.receive_timeout
+
+    @http_client.receive_timeout = (settings["commit_timeout"] || (10 * 60)).to_i
+
+    resp = @http_client.get(@solr_update_url, {"commit" => 'true'})
+    unless resp.status == 200
+      raise RuntimeError.new("Could not commit to Solr: #{resp.status} #{resp.body}")
+    end
+
+    @http_client.receive_timeout = original_timeout
+  end
+
+
+  # Return count of encountered skipped records. Most accurate to call
+  # it after #close, in which case it should include full count, even
+  # under async thread_pool.
+  def skipped_record_count
+    @skipped_record_incrementer.value
+  end
+
+
+  # Relatively complex logic to determine if we have a valid URL and what it is
+  def determine_solr_update_url
+    if settings['solr.update_url']
+      check_solr_update_url(settings['solr.update_url'])
+    else
+      derive_solr_update_url_from_solr_url(settings['solr.url'])
+    end
+  end
+
+
+  # If we've got a solr.update_url, make sure it's ok
+  def check_solr_update_url(url)
+    unless url =~ /^#{URI::regexp}$/
+      raise ArgumentError.new("#{self.class.name} setting `solr.update_url` doesn't look like a URL: `#{url}`")
+    end
+    url
+  end
+
+  def derive_solr_update_url_from_solr_url(url)
+    # Nil? Then we bail
+    if url.nil?
+      raise ArgumentError.new("#{self.class.name}: Neither solr.update_url nor solr.url set; need at least one")
+    end
+
+    # Not a URL? Bail
+    unless url =~ /^#{URI::regexp}$/
+      raise ArgumentError.new("#{self.class.name} setting `solr.url` doesn't look like a URL: `#{url}`")
+    end
+
+    # First, try the /update/json handler
+    candidate = [url.chomp('/'), 'update', 'json'].join('/')
+    resp = @http_client.get(candidate)
+    if resp.status == 404
+      candidate = [url.chomp('/'), 'update'].join('/')
+    end
+    candidate
+  end
+
+
+end
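A configuration sketch exercising the settings documented above; the Solr URL is a placeholder:

```ruby
require 'traject'

settings do
  provide "writer_class_name", "Traject::SolrJsonWriter"
  provide "solr.url", "http://localhost:8983/solr/my_core" # placeholder
  provide "solr_writer.batch_size", 100
  provide "solr_writer.thread_pool", 1
  provide "solr_writer.max_skipped", 10
  provide "solr_writer.commit_on_close", "true"
end
```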