traject 1.1.0 → 2.0.0.rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +20 -0
- data/README.md +85 -73
- data/doc/batch_execution.md +2 -6
- data/doc/other_commands.md +3 -5
- data/doc/settings.md +27 -38
- data/lib/traject/command_line.rb +1 -1
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +29 -11
- data/lib/traject/indexer/settings.rb +39 -13
- data/lib/traject/line_writer.rb +10 -6
- data/lib/traject/marc_reader.rb +2 -1
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +38 -48
- data/lib/traject/translation_map.rb +3 -0
- data/lib/traject/util.rb +13 -51
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/marc_geographic.yaml +2 -2
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/read_write_test.rb +0 -22
- data/test/indexer/settings_test.rb +24 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +5 -3
- data/test/test_support/demo_config.rb +0 -5
- data/test/translation_map_test.rb +9 -0
- data/traject.gemspec +18 -5
- metadata +77 -87
- data/lib/traject/marc4j_reader.rb +0 -153
- data/lib/traject/solrj_writer.rb +0 -351
- data/test/marc4j_reader_test.rb +0 -136
- data/test/solrj_writer_test.rb +0 -209
- data/vendor/solrj/README +0 -8
- data/vendor/solrj/build.xml +0 -39
- data/vendor/solrj/ivy.xml +0 -16
- data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
- data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
- data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
- data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
- data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
- data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
- data/vendor/solrj/lib/noggit-0.5.jar +0 -0
- data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
- data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
- data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
- data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
data/lib/traject/thread_pool.rb
CHANGED

```diff
@@ -1,28 +1,33 @@
+require 'concurrent'
+require 'thread' # for Queue
+
 module Traject
-  # An abstraction wrapping a
-  # and other apparatus.
+  # An abstraction wrapping a Concurrent::ThreadPool in some configuration choices
+  # and other apparatus. Concurrent::ThreadPool is a Java ThreadPool executor on
+  # jruby for performance, and is ruby-concurrent's own ruby implementation otherwise.
   #
   # 1) Initialize with chosen pool size -- we create fixed size pools, where
   # core and max sizes are the same.
   #
-  # 2) If initialized with nil for threadcount, no thread pool will actually
-  # be created, and
-  # the
-  #
-  #
+  # 2) If initialized with nil or 0 for threadcount, no thread pool will actually
+  # be created, and work sent to the Traject::ThreadPool will just be executed
+  # in the caller thread. We call this a nil threadpool. One situation it can be useful
+  # is if you are running under MRI, where multi-core parallelism isn't available, so
+  # an actual threadpool may not be useful. (Although in some cases a thread pool,
+  # especially one with size 1, can be useful in MRI for I/O blocking operations)
   #
   # 3) Use the #maybe_in_threadpool method to send blocks to thread pool for
-  # execution -- if
+  # execution -- if configurred with a nil threadcount, your block will just be
   # executed in calling thread. Be careful to not refer to any non-local
   # variables in the block, unless the variable has an object you can
   # use thread-safely!
   #
-  # 4)
-  # with a work queue that will buffer up to (pool_size*3) tasks. If queue is full,
-  # the
+  # 4) We configure our underlying Concurrent::ThreadPool
+  # with a work queue that will buffer up to (pool_size*3) tasks. If the queue is full,
+  # the underlying Concurrent::ThreadPool is set up to use the :caller_runs policy
   # meaning the block will end up executing in caller's own thread. With the kind
   # of work we're doing, where each unit of work is small and there are many of them--
-  # the
+  # the :caller_runs policy serves as an effective 'back pressure' mechanism to keep
   # the work queue from getting too large and exhausting memory, when producers are
   # faster than consumers.
   #
@@ -34,8 +39,8 @@ module Traject
   # #shutdown_and_wait, which will wait for all current queued work
   # to complete, then return. You can not give any more work to the pool
   # after you do this. By default it'll wait pretty much forever, which should
-  # be fine. If you never call shutdown,
-  #
+  # be fine. If you never call shutdown, then queued or in-progress work
+  # may be abandoned when the program ends, which would be bad.
   #
   # 7) We will keep track of total times a block is run in thread pool, and
   # total elapsed (wall) time of running all blocks, so an average_execution_ms
@@ -43,33 +48,27 @@ module Traject
   # threads are still executing, as it's not entirely thread safe (may get
   # an off by one as to total iterations)
   class ThreadPool
-    attr_reader :pool_size, :
+    attr_reader :pool_size, :queue_capacity
 
-    # First arg is pool size, 0 or nil and we'll be a null/no-op pool
+    # First arg is pool size, 0 or nil and we'll be a null/no-op pool which executes
+    # work in caller thread.
     def initialize(pool_size)
       unless pool_size.nil? || pool_size == 0
-
-
-        @label = label
-
-        @pool_size = pool_size.to_i # just for reflection, we don't really need it again
+        @pool_size = pool_size.to_i
         @queue_capacity = pool_size * 3
 
-
-
-
-
-
-
-        @thread_pool = java.util.concurrent.ThreadPoolExecutor.new(
-          @pool_size, @pool_size, 0, java.util.concurrent.TimeUnit::MILLISECONDS,
-          blockingQueue, rejectedExecutionHandler)
+        @thread_pool = Concurrent::ThreadPoolExecutor.new(
+          :min_threads => @pool_size,
+          :max_threads => @pool_size,
+          :max_queue => @queue_capacity,
+          :fallback_policy => :caller_runs
+        )
 
         # A thread-safe queue to collect exceptions cross-threads.
-        # We
-        #
-        #
-        @
+        # We really only need to save the first exception, but a queue
+        # is a convenient way to store a value concurrency-safely, and
+        # might as well store all of them.
+        @exceptions_caught_queue = Queue.new
       end
     end
 
@@ -106,7 +105,7 @@ module Traject
       start_t = Time.now
 
      if @thread_pool
-        @thread_pool.
+        @thread_pool.post do
          begin
            yield(*args)
          rescue Exception => e
@@ -119,21 +118,13 @@ module Traject
 
    end
 
-    # Just for monitoring/debugging purposes, we'll return the work queue
-    # used by the threadpool. Don't recommend you do anything with it, as
-    # the original java.util.concurrent docs make the same recommendation.
-    def queue
-      @thread_pool && @thread_pool.queue
-    end
 
    # thread-safe way of storing an exception, to raise
    # later in a different thread. We don't guarantee
    # that we can store more than one at a time, only
    # the first one recorded may be stored.
    def collect_exception(e)
-
-      # with us.
-      @async_exception_queue.offer(e)
+      @exceptions_caught_queue.push(e)
    end
 
    # If there's a stored collected exception, raise it
@@ -144,7 +135,8 @@ module Traject
    # as a non-functioning threadpool -- then this method is just
    # a no-op.
    def raise_collected_exception!
-      if @
+      if @exceptions_caught_queue && (! @exceptions_caught_queue.empty?)
+        e = @exceptions_caught_queue.pop
        raise e
      end
    end
@@ -159,9 +151,7 @@ module Traject
 
      if @thread_pool
        @thread_pool.shutdown
-
-        # a timeout. Okay, one day!
-        @thread_pool.awaitTermination(1, java.util.concurrent.TimeUnit::DAYS)
+        @thread_pool.wait_for_termination
      end
 
      return (Time.now - start_t)
```
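Taken together, the comments in this file describe the pool's whole lifecycle. A minimal usage sketch based on them -- the pool size and the `do_work` helper are hypothetical, while `maybe_in_threadpool`, `raise_collected_exception!`, and `shutdown_and_wait` are the methods documented above:

```ruby
require 'traject/thread_pool'

# Hypothetical size; nil or 0 would make a no-op pool that runs work inline.
pool = Traject::ThreadPool.new(4)

100.times do |i|
  # Pass data in as block arguments rather than closing over shared
  # variables, per the thread-safety warning in the comments above.
  pool.maybe_in_threadpool(i) do |n|
    do_work(n) # hypothetical unit of work
  end
end

pool.raise_collected_exception! # re-raise the first exception a worker caught, if any
pool.shutdown_and_wait          # block until all queued work has finished
```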
data/lib/traject/translation_map.rb
CHANGED

```diff
@@ -171,6 +171,9 @@ module Traject
     def initialize(defn, options = {})
       if defn.kind_of? Hash
         @hash = defn
+      elsif defn.kind_of? self.class
+        @hash = defn.to_hash
+        @default = defn.default
       else
         @hash = self.class.cache.lookup(defn)
         raise NotFound.new(defn) if @hash.nil?
```
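The three lines added above let `TranslationMap#initialize` accept another `TranslationMap`, copying its hash and default. A small sketch of what that enables, assuming the `marc_geographic` map bundled with the gem (its entries appear later in this diff):

```ruby
require 'traject/translation_map'

original = Traject::TranslationMap.new("marc_geographic")
copy     = Traject::TranslationMap.new(original) # new in 2.0: build a map from another map
copy["a"] # => "Asia"
```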
data/lib/traject/util.rb
CHANGED

```diff
@@ -27,63 +27,25 @@ module Traject
   end
 
 
-
-  #
-  #
-  # Have to pass in a settings arg, so we can check it for specified jar dir.
-  #
-  # Tries not to do the dirglob and require if solrj has already been loaded.
-  # Will define global constants with classes HttpSolrServer and SolrInputDocument
-  # if not already defined.
+
+  # Ruby stdlib queue lacks a 'drain' function, we write one.
   #
-  #
-  #
-  #
-
-
-  def self.require_solrj_jars(settings)
-    jruby_ensure_init!
+  # Removes everything currently in the ruby stdlib queue, and returns
+  # it an array. Should be concurrent-safe, but queue may still have
+  # some things in it after drain, if there are concurrent writers.
+  def self.drain_queue(queue)
+    result = []
 
-
+    queue_size = queue.size
    begin
-
-
-      org.apache.solr
-      org.apache.solr.client.solrj
-
-      # java_import which we'd normally use weirdly doesn't work
-      # from a class method. https://github.com/jruby/jruby/issues/975
-      Object.const_set("HttpSolrServer", org.apache.solr.client.solrj.impl.HttpSolrServer) unless defined? ::HttpSolrServer
-      Object.const_set("SolrInputDocument", org.apache.solr.common.SolrInputDocument) unless defined? ::SolrInputDocument
-    rescue NameError => e
-      included_jar_dir = File.expand_path("../../vendor/solrj/lib", File.dirname(__FILE__))
-
-      jardir = settings["solrj.jar_dir"] || included_jar_dir
-      Dir.glob("#{jardir}/*.jar") do |x|
-        require x
-      end
-      if tries > 1
-        raise LoadError.new("Can not find SolrJ java classes")
-      else
-        retry
+      queue_size.times do
+        result << queue.deq(:raise_if_empty)
      end
+    rescue ThreadError
+      # Need do nothing, queue was concurrently popped, no biggie
    end
-  end
 
-
-  # aren't jruby, and raises a better error message.
-  # Pass in a developer-presentable name of a feature to include in the error
-  # message if you want.
-  def self.jruby_ensure_init!(feature = nil)
-    begin
-      require 'java'
-    rescue LoadError => e
-      feature ||= "A traject feature is in use that"
-      msg = if feature
-        "#{feature} requires jruby, but you do not appear to be running under jruby. We recommend `chruby` for managing multiple ruby installs."
-      end
-      raise LoadError.new(msg)
-    end
+    return result
  end
 
 end
```
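With the SolrJ jar loading removed, the one helper this file gains is `drain_queue`. A quick sketch of its contract as documented in the comments above, using only the stdlib `Queue` (assuming the method is reachable as `Traject::Util.drain_queue`):

```ruby
require 'thread'
require 'traject/util'

queue = Queue.new
5.times { |i| queue << i }

drained = Traject::Util.drain_queue(queue)
drained    # => [0, 1, 2, 3, 4]
queue.size # => 0, unless concurrent writers pushed more in the meantime
```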
data/lib/translation_maps/marc_geographic.yaml
CHANGED

```diff
@@ -1,5 +1,5 @@
 # Translation map for marc geographic codes constructed by `rake load_maps:marc_geographic` task
-# Scraped from http://www.loc.gov/marc/geoareas/gacs_code.html at
+# Scraped from http://www.loc.gov/marc/geoareas/gacs_code.html at 2015-01-27 23:00:08 -0500
 # Intentionally includes discontinued codes.
 
 'a': 'Asia'
@@ -327,7 +327,7 @@
 'lnaz': 'Azores'
 'lnbm': 'Bermuda Islands'
 'lnca': 'Canary Islands'
-'lncv': '
+'lncv': 'Cabo Verde'
 'lnfa': 'Faroe Islands'
 'lnjn': 'Jan Mayen Island'
 'lnma': 'Madeira Islands'
```
data/test/delimited_writer_test.rb
ADDED

```diff
@@ -0,0 +1,104 @@
+# Encoding: UTF-8
+
+require 'test_helper'
+require 'stringio'
+require 'traject/delimited_writer'
+require 'traject/csv_writer'
+
+require 'csv'
+
+describe "Delimited/CSV Writers" do
+
+  before do
+    @out = StringIO.new
+    @settings = {'output_stream' => @out, 'delimited_writer.fields' => 'four,one,two'}
+    @context = Struct.new(:output_hash).new
+    @context.output_hash = {'one' => 'one', 'two' => %w[two1 two2], 'three' => 'three', 'four' => 'four'}
+  end
+
+  after do
+    @out.close
+  end
+
+  describe "Traject::DelimitedWriter" do
+
+    it "creates a dw with defaults" do
+      dw = Traject::DelimitedWriter.new(@settings)
+      dw.delimiter.must_equal "\t"
+      dw.internal_delimiter.must_equal '|'
+      dw.edelim.must_equal ' '
+      dw.eidelim.must_equal '\\|'
+    end
+
+    it "respects different delimiter" do
+      @settings['delimited_writer.delimiter'] = '^'
+      dw = Traject::DelimitedWriter.new(@settings)
+      dw.delimiter.must_equal '^'
+      dw.edelim.must_equal '\\^'
+      dw.internal_delimiter.must_equal '|'
+    end
+
+    it "outputs a header if asked to" do
+      dw = Traject::DelimitedWriter.new(@settings)
+      @out.string.chomp.must_equal %w[four one two].join("\t")
+    end
+
+    it "doesn't output a header if asked not to" do
+      @settings['delimited_writer.header'] = 'false'
+      dw = Traject::DelimitedWriter.new(@settings)
+      @out.string.must_be_empty
+    end
+
+    it "deals with multiple values" do
+      dw = Traject::DelimitedWriter.new(@settings)
+      dw.put @context
+      @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(dw.delimiter)
+    end
+
+    it "bails if delimited_writer.fields isn't set" do
+      @settings.delete 'delimited_writer.fields'
+      proc { Traject::DelimitedWriter.new(@settings) }.must_raise(ArgumentError)
+    end
+
+  end
+
+  describe "Traject::CSVWriter" do
+    it "unsets the delimiter" do
+      cw = Traject::CSVWriter.new(@settings)
+      cw.delimiter.must_be_nil
+    end
+
+    it "writes the header" do
+      cw = Traject::CSVWriter.new(@settings)
+      @out.string.chomp.must_equal 'four,one,two'
+    end
+
+    it "uses the internal delimiter" do
+      cw = Traject::CSVWriter.new(@settings)
+      cw.put @context
+      @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(',')
+    end
+
+    it "produces complex output" do
+      @context.output_hash = {
+        'four' => ['Bill Clinton, Jr.', 'Jesse "the Body" Ventura'],
+        'one' => 'Willard "Mitt" Romney',
+        'two' => 'Dueber, Bill'
+      }
+      canonical = StringIO.new
+      csv = CSV.new(canonical)
+
+      csv_vals = [@context.output_hash['four'].join('|'), @context.output_hash['one'], @context.output_hash['two']]
+      csv << csv_vals
+      csv_output = canonical.string.chomp
+
+      cw = Traject::CSVWriter.new(@settings)
+      cw.put @context
+      traject_csvwriter_output = @out.string.split("\n").last.chomp
+
+      assert_equal(csv_output, traject_csvwriter_output)
+
+    end
+
+  end
+end
```
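These tests double as documentation for the new writer's settings keys. A hedged sketch of driving `Traject::DelimitedWriter` directly with the same keys the tests exercise; the field names and output hash here are invented:

```ruby
require 'stringio'
require 'traject/delimited_writer'

out    = StringIO.new
writer = Traject::DelimitedWriter.new(
  'output_stream'           => out,
  'delimited_writer.fields' => 'id,title' # required; omitting it raises ArgumentError per the tests
)

# A context only needs to respond to #output_hash for this sketch.
context = Struct.new(:output_hash).new('id' => '123', 'title' => %w[one two])
writer.put(context)

out.string # tab-separated header row, then "123\tone|two" -- multiple values joined by '|'
```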
data/test/indexer/read_write_test.rb
CHANGED

```diff
@@ -62,29 +62,9 @@ describe "Traject::Indexer#process" do
     assert writer_settings["memory_writer.closed"]
   end
 
-  it "returns false if skipped records" do
-    @indexer = Traject::Indexer.new(
-      "solrj_writer.server_class_name" => "MockSolrServer",
-      "solr.url" => "http://example.org",
-      "writer_class_name" => "Traject::SolrJWriter"
-    )
-    @file = File.open(support_file_path "manufacturing_consent.marc")
-
-
-    @indexer.to_field("id") do |record, accumulator|
-      # intentionally make error
-      accumulator.concat ["one_id", "two_id"]
-    end
-    return_value = @indexer.process(@file)
-
-    assert ! return_value, "returns false on skipped record errors"
-  end
-
   require 'traject/null_writer'
   it "calls after_processing after processing" do
     @indexer = Traject::Indexer.new(
-      "solrj_writer.server_class_name" => "MockSolrServer",
-      "solr.url" => "http://example.org",
       "writer_class_name" => "Traject::NullWriter"
     )
     @file = File.open(support_file_path "test_data.utf8.mrc")
@@ -106,8 +86,6 @@ describe "Traject::Indexer#process" do
   describe "demo_config.rb" do
     before do
       @indexer = Traject::Indexer.new(
-        "solrj_writer.server_class_name" => "MockSolrServer",
-        "solr.url" => "http://example.org",
         "writer_class_name" => "Traject::NullWriter"
       )
     end
```

data/test/indexer/settings_test.rb
CHANGED

```diff
@@ -124,5 +124,29 @@ describe "Traject::Indexer#settings" do
       assert_equal( {"a" => "a", "password" => "[hidden]", "some_password" => "[hidden]", "some.password" => "[hidden]"}, parsed)
     end
   end
+
+  describe "JRuby / MRI" do
+    before do
+      @indexer = Traject::Indexer.new
+    end
+
+    it "has the right indexer name" do
+      if defined? JRUBY_VERSION
+        assert_equal "Traject::Marc4JReader", @indexer.settings['reader_class_name']
+      else
+        assert_equal "Traject::MarcReader", @indexer.settings['reader_class_name']
+      end
+    end
+
+    # This next one has the added effect of making sure the correct class
+    # has actually been loaded -- otherwise the constant wouldn't be available
+    it "has the correct default indexer class based on platform" do
+      if defined? JRUBY_VERSION
+        assert_equal Traject::Marc4JReader, @indexer.reader_class
+      else
+        assert_equal Traject::MarcReader, @indexer.reader_class
+      end
+    end
+  end
 
 end
```
data/test/solr_json_writer_test.rb
ADDED

```diff
@@ -0,0 +1,248 @@
+require 'test_helper'
+require 'httpclient'
+require 'traject/solr_json_writer'
+require 'thread'
+require 'json'
+require 'stringio'
+require 'logger'
+
+
+# Some basic tests, using a mocked HTTPClient so we can see what it did --
+# these tests do not run against a real solr server at present.
+describe "Traject::SolrJsonWriter" do
+
+
+  #######
+  # A bunch of utilities to help testing
+  #######
+
+  class FakeHTTPClient
+    # Always reply with this status, normally 200, can
+    # be reset for testing error conditions.
+    attr_accessor :response_status
+    attr_accessor :allow_update_json_path
+
+    def initialize(*args)
+      @post_args = []
+      @get_args = []
+      @response_status = 200
+      @allow_update_json_path = true
+      @mutex = Monitor.new
+    end
+
+    def post(*args)
+      @mutex.synchronize do
+        @post_args << args
+      end
+
+      resp = HTTP::Message.new_response("")
+      resp.status = self.response_status
+
+      return resp
+    end
+
+    def get (*args)
+      @mutex.synchronize do
+        @get_args << args
+      end
+
+      resp = HTTP::Message.new_response("")
+      resp.status = self.response_status
+
+      if args.first.end_with?("/update/json") && ! self.allow_update_json_path
+        # Need to test auto-detection of /update/json being available
+        resp.status = 404
+      end
+
+      return resp
+    end
+
+    def post_args
+      @mutex.synchronize do
+        @post_args.dup
+      end
+    end
+
+    def get_args
+      @mutex.synchronize do
+        @get_args.dup
+      end
+    end
+
+    # Everything else, just return nil please
+    def method_missing(*args)
+    end
+  end
+
+
+  def context_with(hash)
+    Traject::Indexer::Context.new(:output_hash => hash)
+  end
+
+  def create_writer(settings = {})
+    settings = {
+      "solr.url" => "http://example.com/solr",
+      "solr_json_writer.http_client" => FakeHTTPClient.new
+    }.merge!(settings)
+    @fake_http_client = settings["solr_json_writer.http_client"]
+
+    writer = Traject::SolrJsonWriter.new(settings)
+
+    return writer
+  end
+
+  # strio = StringIO.new
+  # logger_to_strio(strio)
+  #
+  # Later check for strio.string for contents
+  def logger_to_strio(strio)
+    # Yell makes this hard, let's do it with an ordinary logger, think
+    # it's okay.
+    Logger.new(strio)
+  end
+
+  #########
+  # Actual tests
+  #########
+
+  before do
+    @writer = create_writer
+  end
+
+  it "defaults to 1 bg thread" do
+    assert_equal 1, @writer.thread_pool_size
+  end
+
+  it "adds a document" do
+    @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
+    @writer.close
+
+    post_args = @fake_http_client.post_args.first
+
+    refute_nil post_args
+
+    assert_equal "http://example.com/solr/update/json", post_args[0]
+
+    refute_nil post_args[1]
+    posted_json = JSON.parse(post_args[1])
+
+    assert_equal [{"id" => "one", "key" => ["value1", "value2"]}], posted_json
+  end
+
+  it "adds more than a batch in batches" do
+    (Traject::SolrJsonWriter::DEFAULT_BATCH_SIZE + 1).times do |i|
+      doc = {"id" => "doc_#{i}", "key" => "value"}
+      @writer.put context_with(doc)
+    end
+    @writer.close
+
+    post_args = @fake_http_client.post_args
+
+    assert_length 2, post_args, "Makes two posts to Solr for two batches"
+
+    assert_length Traject::SolrJsonWriter::DEFAULT_BATCH_SIZE, JSON.parse(post_args[0][1]), "first batch posted with batch size docs"
+    assert_length 1, JSON.parse(post_args[1][1]), "second batch posted with last remaining doc"
+  end
+
+  it "commits on close when set" do
+    @writer = create_writer("solr.url" => "http://example.com", "solr_writer.commit_on_close" => "true")
+    @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
+    @writer.close
+
+    last_solr_get = @fake_http_client.get_args.last
+
+    assert_equal "http://example.com/update/json", last_solr_get[0]
+    assert_equal( {"commit" => "true"}, last_solr_get[1] )
+  end
+
+  describe "skipped records" do
+    it "skips and reports under max_skipped" do
+      strio = StringIO.new
+      @writer = create_writer("solr_writer.max_skipped" => 10, "logger" => logger_to_strio(strio))
+      @fake_http_client.response_status = 500
+
+      10.times do |i|
+        @writer.put context_with("id" => "doc_#{i}", "key" => "value")
+      end
+      @writer.close
+
+      assert_equal 10, @writer.skipped_record_count
+
+      logged = strio.string
+
+      10.times do |i|
+        assert_match /ERROR.*Could not add record doc_#{i} at source file position : Solr error response: 500/, logged
+      end
+    end
+
+    it "raises when skipped more than max_skipped" do
+      @writer = create_writer("solr_writer.max_skipped" => 5)
+      @fake_http_client.response_status = 500
+
+      e = assert_raises(RuntimeError) do
+        6.times do |i|
+          @writer.put context_with("id" => "doc_#{i}", "key" => "value")
+        end
+        @writer.close
+      end
+
+      assert_includes e.message, "Exceeded maximum number of skipped records"
+    end
+
+    it "raises on one skipped record when max_skipped is 0" do
+      @writer = create_writer("solr_writer.max_skipped" => 0)
+      @fake_http_client.response_status = 500
+
+      e = assert_raises(RuntimeError) do
+        @writer.put context_with("id" => "doc_1", "key" => "value")
+        @writer.close
+      end
+    end
+  end
+
+  describe "auto-discovers proper update path" do
+    it "finds /update/json" do
+      assert_equal "http://example.com/solr/update/json", @writer.determine_solr_update_url
+    end
+
+    it "resorts to plain /update" do
+      @fake_http_client = FakeHTTPClient.new
+      @fake_http_client.allow_update_json_path = false
+
+      @writer = create_writer("solr.url" => "http://example.com/solr",
+                              "solr_json_writer.http_client" => @fake_http_client)
+
+      assert_equal "http://example.com/solr/update", @writer.determine_solr_update_url
+    end
+  end
+
+  describe "Record id from context" do
+    before do
+      @record = MARC::Reader.new(support_file_path('test_data.utf8.mrc')).first
+      @context = Traject::Indexer::Context.new
+      @writer = create_writer
+      @record_001 = " 00282214 " # from the mrc file
+    end
+
+    it "gets it from 001" do
+      @context.source_record = @record
+      assert_equal @record_001, @writer.record_id_from_context(@context)
+    end
+
+    it "gets it from the id" do
+      @context.output_hash['id'] = 'the_record_id'
+      assert_equal 'the_record_id', @writer.record_id_from_context(@context)
+    end
+
+    it "gets it from both 001 and id" do
+      @context.output_hash['id'] = 'the_record_id'
+      @context.source_record = @record
+      assert_equal [@record_001, 'the_record_id'].join('/'), @writer.record_id_from_context(@context)
+    end
+
+
+
+  end
+
+
+end
```