traject 1.1.0 → 2.0.0.rc.1
- checksums.yaml +4 -4
- data/.travis.yml +20 -0
- data/README.md +85 -73
- data/doc/batch_execution.md +2 -6
- data/doc/other_commands.md +3 -5
- data/doc/settings.md +27 -38
- data/lib/traject/command_line.rb +1 -1
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +29 -11
- data/lib/traject/indexer/settings.rb +39 -13
- data/lib/traject/line_writer.rb +10 -6
- data/lib/traject/marc_reader.rb +2 -1
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +38 -48
- data/lib/traject/translation_map.rb +3 -0
- data/lib/traject/util.rb +13 -51
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/marc_geographic.yaml +2 -2
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/read_write_test.rb +0 -22
- data/test/indexer/settings_test.rb +24 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +5 -3
- data/test/test_support/demo_config.rb +0 -5
- data/test/translation_map_test.rb +9 -0
- data/traject.gemspec +18 -5
- metadata +77 -87
- data/lib/traject/marc4j_reader.rb +0 -153
- data/lib/traject/solrj_writer.rb +0 -351
- data/test/marc4j_reader_test.rb +0 -136
- data/test/solrj_writer_test.rb +0 -209
- data/vendor/solrj/README +0 -8
- data/vendor/solrj/build.xml +0 -39
- data/vendor/solrj/ivy.xml +0 -16
- data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
- data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
- data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
- data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
- data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
- data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
- data/vendor/solrj/lib/noggit-0.5.jar +0 -0
- data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
- data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
- data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
- data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
data/lib/traject/thread_pool.rb
CHANGED
@@ -1,28 +1,33 @@
+require 'concurrent'
+require 'thread' # for Queue
+
 module Traject
-  # An abstraction wrapping a
-  # and other apparatus.
+  # An abstraction wrapping a Concurrent::ThreadPool in some configuration choices
+  # and other apparatus. Concurrent::ThreadPool is a Java ThreadPool executor on
+  # jruby for performance, and is ruby-concurrent's own ruby implementation otherwise.
   #
   # 1) Initialize with chosen pool size -- we create fixed size pools, where
   # core and max sizes are the same.
   #
-  # 2) If initialized with nil for threadcount, no thread pool will actually
-  # be created, and
-  # the
-  #
-  #
+  # 2) If initialized with nil or 0 for threadcount, no thread pool will actually
+  # be created, and work sent to the Traject::ThreadPool will just be executed
+  # in the caller thread. We call this a nil threadpool. One situation it can be useful
+  # is if you are running under MRI, where multi-core parallelism isn't available, so
+  # an actual threadpool may not be useful. (Although in some cases a thread pool,
+  # especially one with size 1, can be useful in MRI for I/O blocking operations)
   #
   # 3) Use the #maybe_in_threadpool method to send blocks to thread pool for
-  # execution -- if
+  # execution -- if configurred with a nil threadcount, your block will just be
   # executed in calling thread. Be careful to not refer to any non-local
   # variables in the block, unless the variable has an object you can
   # use thread-safely!
   #
-  # 4)
-  # with a work queue that will buffer up to (pool_size*3) tasks. If queue is full,
-  # the
+  # 4) We configure our underlying Concurrent::ThreadPool
+  # with a work queue that will buffer up to (pool_size*3) tasks. If the queue is full,
+  # the underlying Concurrent::ThreadPool is set up to use the :caller_runs policy
   # meaning the block will end up executing in caller's own thread. With the kind
   # of work we're doing, where each unit of work is small and there are many of them--
-  # the
+  # the :caller_runs policy serves as an effective 'back pressure' mechanism to keep
   # the work queue from getting too large and exhausting memory, when producers are
   # faster than consumers.
   #
@@ -34,8 +39,8 @@ module Traject
   # #shutdown_and_wait, which will wait for all current queued work
   # to complete, then return. You can not give any more work to the pool
   # after you do this. By default it'll wait pretty much forever, which should
-  # be fine. If you never call shutdown,
-  #
+  # be fine. If you never call shutdown, then queued or in-progress work
+  # may be abandoned when the program ends, which would be bad.
   #
   # 7) We will keep track of total times a block is run in thread pool, and
   # total elapsed (wall) time of running all blocks, so an average_execution_ms
@@ -43,33 +48,27 @@ module Traject
   # threads are still executing, as it's not entirely thread safe (may get
   # an off by one as to total iterations)
   class ThreadPool
-    attr_reader :pool_size, :
+    attr_reader :pool_size, :queue_capacity

-    # First arg is pool size, 0 or nil and we'll be a null/no-op pool
+    # First arg is pool size, 0 or nil and we'll be a null/no-op pool which executes
+    # work in caller thread.
     def initialize(pool_size)
       unless pool_size.nil? || pool_size == 0
-
-
-        @label = label
-
-        @pool_size = pool_size.to_i # just for reflection, we don't really need it again
+        @pool_size = pool_size.to_i
         @queue_capacity = pool_size * 3

-
-
-
-
-
-
-        @thread_pool = java.util.concurrent.ThreadPoolExecutor.new(
-          @pool_size, @pool_size, 0, java.util.concurrent.TimeUnit::MILLISECONDS,
-          blockingQueue, rejectedExecutionHandler)
+        @thread_pool = Concurrent::ThreadPoolExecutor.new(
+          :min_threads => @pool_size,
+          :max_threads => @pool_size,
+          :max_queue => @queue_capacity,
+          :fallback_policy => :caller_runs
+        )

         # A thread-safe queue to collect exceptions cross-threads.
-        # We
-        #
-        #
-        @
+        # We really only need to save the first exception, but a queue
+        # is a convenient way to store a value concurrency-safely, and
+        # might as well store all of them.
+        @exceptions_caught_queue = Queue.new
       end
     end

@@ -106,7 +105,7 @@ module Traject
       start_t = Time.now

       if @thread_pool
-        @thread_pool.
+        @thread_pool.post do
           begin
             yield(*args)
           rescue Exception => e
@@ -119,21 +118,13 @@ module Traject

     end

-    # Just for monitoring/debugging purposes, we'll return the work queue
-    # used by the threadpool. Don't recommend you do anything with it, as
-    # the original java.util.concurrent docs make the same recommendation.
-    def queue
-      @thread_pool && @thread_pool.queue
-    end

     # thread-safe way of storing an exception, to raise
     # later in a different thread. We don't guarantee
     # that we can store more than one at a time, only
     # the first one recorded may be stored.
     def collect_exception(e)
-
-      # with us.
-      @async_exception_queue.offer(e)
+      @exceptions_caught_queue.push(e)
     end

     # If there's a stored collected exception, raise it
@@ -144,7 +135,8 @@ module Traject
     # as a non-functioning threadpool -- then this method is just
     # a no-op.
     def raise_collected_exception!
-      if @
+      if @exceptions_caught_queue && (! @exceptions_caught_queue.empty?)
+        e = @exceptions_caught_queue.pop
         raise e
       end
     end
@@ -159,9 +151,7 @@ module Traject

       if @thread_pool
         @thread_pool.shutdown
-
-        # a timeout. Okay, one day!
-        @thread_pool.awaitTermination(1, java.util.concurrent.TimeUnit::DAYS)
+        @thread_pool.wait_for_termination
       end

       return (Time.now - start_t)
data/lib/traject/translation_map.rb
CHANGED
@@ -171,6 +171,9 @@ module Traject
     def initialize(defn, options = {})
       if defn.kind_of? Hash
         @hash = defn
+      elsif defn.kind_of? self.class
+        @hash = defn.to_hash
+        @default = defn.default
       else
         @hash = self.class.cache.lookup(defn)
         raise NotFound.new(defn) if @hash.nil?
data/lib/traject/util.rb
CHANGED
@@ -27,63 +27,25 @@ module Traject
     end


-
-    #
-    #
-    # Have to pass in a settings arg, so we can check it for specified jar dir.
-    #
-    # Tries not to do the dirglob and require if solrj has already been loaded.
-    # Will define global constants with classes HttpSolrServer and SolrInputDocument
-    # if not already defined.
+
+    # Ruby stdlib queue lacks a 'drain' function, we write one.
     #
-    #
-    #
-    #
-
-
-    def self.require_solrj_jars(settings)
-      jruby_ensure_init!
+    # Removes everything currently in the ruby stdlib queue, and returns
+    # it an array. Should be concurrent-safe, but queue may still have
+    # some things in it after drain, if there are concurrent writers.
+    def self.drain_queue(queue)
+      result = []

-
+      queue_size = queue.size
       begin
-
-
-        org.apache.solr
-        org.apache.solr.client.solrj
-
-        # java_import which we'd normally use weirdly doesn't work
-        # from a class method. https://github.com/jruby/jruby/issues/975
-        Object.const_set("HttpSolrServer", org.apache.solr.client.solrj.impl.HttpSolrServer) unless defined? ::HttpSolrServer
-        Object.const_set("SolrInputDocument", org.apache.solr.common.SolrInputDocument) unless defined? ::SolrInputDocument
-      rescue NameError => e
-        included_jar_dir = File.expand_path("../../vendor/solrj/lib", File.dirname(__FILE__))
-
-        jardir = settings["solrj.jar_dir"] || included_jar_dir
-        Dir.glob("#{jardir}/*.jar") do |x|
-          require x
-        end
-        if tries > 1
-          raise LoadError.new("Can not find SolrJ java classes")
-        else
-          retry
+        queue_size.times do
+          result << queue.deq(:raise_if_empty)
         end
+      rescue ThreadError
+        # Need do nothing, queue was concurrently popped, no biggie
       end
-    end

-
-    # aren't jruby, and raises a better error message.
-    # Pass in a developer-presentable name of a feature to include in the error
-    # message if you want.
-    def self.jruby_ensure_init!(feature = nil)
-      begin
-        require 'java'
-      rescue LoadError => e
-        feature ||= "A traject feature is in use that"
-        msg = if feature
-          "#{feature} requires jruby, but you do not appear to be running under jruby. We recommend `chruby` for managing multiple ruby installs."
-        end
-        raise LoadError.new(msg)
-      end
+      return result
     end

   end
data/lib/translation_maps/marc_geographic.yaml
CHANGED
@@ -1,5 +1,5 @@
 # Translation map for marc geographic codes constructed by `rake load_maps:marc_geographic` task
-# Scraped from http://www.loc.gov/marc/geoareas/gacs_code.html at
+# Scraped from http://www.loc.gov/marc/geoareas/gacs_code.html at 2015-01-27 23:00:08 -0500
 # Intentionally includes discontinued codes.

 'a': 'Asia'
@@ -327,7 +327,7 @@
 'lnaz': 'Azores'
 'lnbm': 'Bermuda Islands'
 'lnca': 'Canary Islands'
-'lncv': '
+'lncv': 'Cabo Verde'
 'lnfa': 'Faroe Islands'
 'lnjn': 'Jan Mayen Island'
 'lnma': 'Madeira Islands'
data/test/delimited_writer_test.rb
ADDED
@@ -0,0 +1,104 @@
+# Encoding: UTF-8
+
+require 'test_helper'
+require 'stringio'
+require 'traject/delimited_writer'
+require 'traject/csv_writer'
+
+require 'csv'
+
+describe "Delimited/CSV Writers" do
+
+  before do
+    @out = StringIO.new
+    @settings = {'output_stream' => @out, 'delimited_writer.fields' => 'four,one,two'}
+    @context = Struct.new(:output_hash).new
+    @context.output_hash = {'one' => 'one', 'two' => %w[two1 two2], 'three' => 'three', 'four' => 'four'}
+  end
+
+  after do
+    @out.close
+  end
+
+  describe "Traject::DelimitedWriter" do
+
+    it "creates a dw with defaults" do
+      dw = Traject::DelimitedWriter.new(@settings)
+      dw.delimiter.must_equal "\t"
+      dw.internal_delimiter.must_equal '|'
+      dw.edelim.must_equal ' '
+      dw.eidelim.must_equal '\\|'
+    end
+
+    it "respects different delimiter" do
+      @settings['delimited_writer.delimiter'] = '^'
+      dw = Traject::DelimitedWriter.new(@settings)
+      dw.delimiter.must_equal '^'
+      dw.edelim.must_equal '\\^'
+      dw.internal_delimiter.must_equal '|'
+    end
+
+    it "outputs a header if asked to" do
+      dw = Traject::DelimitedWriter.new(@settings)
+      @out.string.chomp.must_equal %w[four one two].join("\t")
+    end
+
+    it "doesn't output a header if asked not to" do
+      @settings['delimited_writer.header'] = 'false'
+      dw = Traject::DelimitedWriter.new(@settings)
+      @out.string.must_be_empty
+    end
+
+    it "deals with multiple values" do
+      dw = Traject::DelimitedWriter.new(@settings)
+      dw.put @context
+      @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(dw.delimiter)
+    end
+
+    it "bails if delimited_writer.fields isn't set" do
+      @settings.delete 'delimited_writer.fields'
+      proc { Traject::DelimitedWriter.new(@settings) }.must_raise(ArgumentError)
+    end
+
+  end
+
+  describe "Traject::CSVWriter" do
+    it "unsets the delimiter" do
+      cw = Traject::CSVWriter.new(@settings)
+      cw.delimiter.must_be_nil
+    end
+
+    it "writes the header" do
+      cw = Traject::CSVWriter.new(@settings)
+      @out.string.chomp.must_equal 'four,one,two'
+    end
+
+    it "uses the internal delimiter" do
+      cw = Traject::CSVWriter.new(@settings)
+      cw.put @context
+      @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(',')
+    end
+
+    it "produces complex output" do
+      @context.output_hash = {
+        'four' => ['Bill Clinton, Jr.', 'Jesse "the Body" Ventura'],
+        'one'  => 'Willard "Mitt" Romney',
+        'two'  => 'Dueber, Bill'
+      }
+      canonical = StringIO.new
+      csv = CSV.new(canonical)
+
+      csv_vals = [@context.output_hash['four'].join('|'), @context.output_hash['one'], @context.output_hash['two']]
+      csv << csv_vals
+      csv_output = canonical.string.chomp
+
+      cw = Traject::CSVWriter.new(@settings)
+      cw.put @context
+      traject_csvwriter_output = @out.string.split("\n").last.chomp
+
+      assert_equal(csv_output, traject_csvwriter_output)
+
+    end
+
+  end
+end
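
The tests above double as documentation for the new writers' settings (delimited_writer.fields, delimited_writer.delimiter, delimited_writer.header). A hedged sketch of a traject config file driving the CSV writer; the file name and the output_file setting are illustrative assumptions, not taken from this diff:

# csv_config.rb -- hypothetical traject config file
settings do
  provide "writer_class_name", "Traject::CSVWriter"
  provide "output_file", "records.csv"               # assumed line-writer setting
  provide "delimited_writer.fields", "four,one,two"  # columns, in order
end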
data/test/indexer/read_write_test.rb
CHANGED
@@ -62,29 +62,9 @@ describe "Traject::Indexer#process" do
     assert writer_settings["memory_writer.closed"]
   end

-  it "returns false if skipped records" do
-    @indexer = Traject::Indexer.new(
-      "solrj_writer.server_class_name" => "MockSolrServer",
-      "solr.url" => "http://example.org",
-      "writer_class_name" => "Traject::SolrJWriter"
-    )
-    @file = File.open(support_file_path "manufacturing_consent.marc")
-
-
-    @indexer.to_field("id") do |record, accumulator|
-      # intentionally make error
-      accumulator.concat ["one_id", "two_id"]
-    end
-    return_value = @indexer.process(@file)
-
-    assert ! return_value, "returns false on skipped record errors"
-  end
-
   require 'traject/null_writer'
   it "calls after_processing after processing" do
     @indexer = Traject::Indexer.new(
-      "solrj_writer.server_class_name" => "MockSolrServer",
-      "solr.url" => "http://example.org",
       "writer_class_name" => "Traject::NullWriter"
     )
     @file = File.open(support_file_path "test_data.utf8.mrc")
@@ -106,8 +86,6 @@ describe "Traject::Indexer#process" do
   describe "demo_config.rb" do
     before do
       @indexer = Traject::Indexer.new(
-        "solrj_writer.server_class_name" => "MockSolrServer",
-        "solr.url" => "http://example.org",
         "writer_class_name" => "Traject::NullWriter"
       )
     end
data/test/indexer/settings_test.rb
CHANGED
@@ -124,5 +124,29 @@ describe "Traject::Indexer#settings" do
     assert_equal( {"a" => "a", "password" => "[hidden]", "some_password" => "[hidden]", "some.password" => "[hidden]"}, parsed)
   end
 end
+
+describe "JRuby / MRI" do
+  before do
+    @indexer = Traject::Indexer.new
+  end
+
+  it "has the right indexer name" do
+    if defined? JRUBY_VERSION
+      assert_equal "Traject::Marc4JReader", @indexer.settings['reader_class_name']
+    else
+      assert_equal "Traject::MarcReader", @indexer.settings['reader_class_name']
+    end
+  end
+
+  # This next one has the added effect of making sure the correct class
+  # has actually been loaded -- otherwise the constant wouldn't be available
+  it "has the correct default indexer class based on platform" do
+    if defined? JRUBY_VERSION
+      assert_equal Traject::Marc4JReader, @indexer.reader_class
+    else
+      assert_equal Traject::MarcReader, @indexer.reader_class
+    end
+  end
+end

 end
data/test/solr_json_writer_test.rb
ADDED
@@ -0,0 +1,248 @@
+require 'test_helper'
+require 'httpclient'
+require 'traject/solr_json_writer'
+require 'thread'
+require 'json'
+require 'stringio'
+require 'logger'
+
+
+# Some basic tests, using a mocked HTTPClient so we can see what it did --
+# these tests do not run against a real solr server at present.
+describe "Traject::SolrJsonWriter" do
+
+
+  #######
+  # A bunch of utilities to help testing
+  #######
+
+  class FakeHTTPClient
+    # Always reply with this status, normally 200, can
+    # be reset for testing error conditions.
+    attr_accessor :response_status
+    attr_accessor :allow_update_json_path
+
+    def initialize(*args)
+      @post_args = []
+      @get_args = []
+      @response_status = 200
+      @allow_update_json_path = true
+      @mutex = Monitor.new
+    end
+
+    def post(*args)
+      @mutex.synchronize do
+        @post_args << args
+      end
+
+      resp = HTTP::Message.new_response("")
+      resp.status = self.response_status
+
+      return resp
+    end
+
+    def get(*args)
+      @mutex.synchronize do
+        @get_args << args
+      end
+
+      resp = HTTP::Message.new_response("")
+      resp.status = self.response_status
+
+      if args.first.end_with?("/update/json") && ! self.allow_update_json_path
+        # Need to test auto-detection of /update/json being available
+        resp.status = 404
+      end
+
+      return resp
+    end
+
+    def post_args
+      @mutex.synchronize do
+        @post_args.dup
+      end
+    end
+
+    def get_args
+      @mutex.synchronize do
+        @get_args.dup
+      end
+    end
+
+    # Everything else, just return nil please
+    def method_missing(*args)
+    end
+  end
+
+
+  def context_with(hash)
+    Traject::Indexer::Context.new(:output_hash => hash)
+  end
+
+  def create_writer(settings = {})
+    settings = {
+      "solr.url" => "http://example.com/solr",
+      "solr_json_writer.http_client" => FakeHTTPClient.new
+    }.merge!(settings)
+    @fake_http_client = settings["solr_json_writer.http_client"]
+
+    writer = Traject::SolrJsonWriter.new(settings)
+
+    return writer
+  end
+
+  # strio = StringIO.new
+  # logger_to_strio(strio)
+  #
+  # Later check for strio.string for contents
+  def logger_to_strio(strio)
+    # Yell makes this hard, let's do it with an ordinary logger, think
+    # it's okay.
+    Logger.new(strio)
+  end
+
+  #########
+  # Actual tests
+  #########
+
+  before do
+    @writer = create_writer
+  end
+
+  it "defaults to 1 bg thread" do
+    assert_equal 1, @writer.thread_pool_size
+  end
+
+  it "adds a document" do
+    @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
+    @writer.close
+
+    post_args = @fake_http_client.post_args.first
+
+    refute_nil post_args
+
+    assert_equal "http://example.com/solr/update/json", post_args[0]
+
+    refute_nil post_args[1]
+    posted_json = JSON.parse(post_args[1])
+
+    assert_equal [{"id" => "one", "key" => ["value1", "value2"]}], posted_json
+  end
+
+  it "adds more than a batch in batches" do
+    (Traject::SolrJsonWriter::DEFAULT_BATCH_SIZE + 1).times do |i|
+      doc = {"id" => "doc_#{i}", "key" => "value"}
+      @writer.put context_with(doc)
+    end
+    @writer.close
+
+    post_args = @fake_http_client.post_args
+
+    assert_length 2, post_args, "Makes two posts to Solr for two batches"
+
+    assert_length Traject::SolrJsonWriter::DEFAULT_BATCH_SIZE, JSON.parse(post_args[0][1]), "first batch posted with batch size docs"
+    assert_length 1, JSON.parse(post_args[1][1]), "second batch posted with last remaining doc"
+  end
+
+  it "commits on close when set" do
+    @writer = create_writer("solr.url" => "http://example.com", "solr_writer.commit_on_close" => "true")
+    @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
+    @writer.close
+
+    last_solr_get = @fake_http_client.get_args.last
+
+    assert_equal "http://example.com/update/json", last_solr_get[0]
+    assert_equal( {"commit" => "true"}, last_solr_get[1] )
+  end
+
+  describe "skipped records" do
+    it "skips and reports under max_skipped" do
+      strio = StringIO.new
+      @writer = create_writer("solr_writer.max_skipped" => 10, "logger" => logger_to_strio(strio))
+      @fake_http_client.response_status = 500
+
+      10.times do |i|
+        @writer.put context_with("id" => "doc_#{i}", "key" => "value")
+      end
+      @writer.close
+
+      assert_equal 10, @writer.skipped_record_count
+
+      logged = strio.string
+
+      10.times do |i|
+        assert_match /ERROR.*Could not add record doc_#{i} at source file position : Solr error response: 500/, logged
+      end
+    end
+
+    it "raises when skipped more than max_skipped" do
+      @writer = create_writer("solr_writer.max_skipped" => 5)
+      @fake_http_client.response_status = 500
+
+      e = assert_raises(RuntimeError) do
+        6.times do |i|
+          @writer.put context_with("id" => "doc_#{i}", "key" => "value")
+        end
+        @writer.close
+      end
+
+      assert_includes e.message, "Exceeded maximum number of skipped records"
+    end
+
+    it "raises on one skipped record when max_skipped is 0" do
+      @writer = create_writer("solr_writer.max_skipped" => 0)
+      @fake_http_client.response_status = 500
+
+      e = assert_raises(RuntimeError) do
+        @writer.put context_with("id" => "doc_1", "key" => "value")
+        @writer.close
+      end
+    end
+  end
+
+  describe "auto-discovers proper update path" do
+    it "finds /update/json" do
+      assert_equal "http://example.com/solr/update/json", @writer.determine_solr_update_url
+    end
+
+    it "resorts to plain /update" do
+      @fake_http_client = FakeHTTPClient.new
+      @fake_http_client.allow_update_json_path = false
+
+      @writer = create_writer("solr.url" => "http://example.com/solr",
+                              "solr_json_writer.http_client" => @fake_http_client)
+
+      assert_equal "http://example.com/solr/update", @writer.determine_solr_update_url
+    end
+  end
+
+  describe "Record id from context" do
+    before do
+      @record = MARC::Reader.new(support_file_path('test_data.utf8.mrc')).first
+      @context = Traject::Indexer::Context.new
+      @writer = create_writer
+      @record_001 = " 00282214 " # from the mrc file
+    end
+
+    it "gets it from 001" do
+      @context.source_record = @record
+      assert_equal @record_001, @writer.record_id_from_context(@context)
+    end
+
+    it "gets it from the id" do
+      @context.output_hash['id'] = 'the_record_id'
+      assert_equal 'the_record_id', @writer.record_id_from_context(@context)
+    end
+
+    it "gets it from both 001 and id" do
+      @context.output_hash['id'] = 'the_record_id'
+      @context.source_record = @record
+      assert_equal [@record_001, 'the_record_id'].join('/'), @writer.record_id_from_context(@context)
+    end
+
+
+  end
+
+
+end
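
The fake-client tests above exercise the new writer's public settings (solr.url, solr_writer.commit_on_close, solr_writer.max_skipped, solr_json_writer.http_client). A hedged sketch of a real-world traject config pointing at the writer that replaces the removed SolrJWriter; the Solr URL is illustrative:

settings do
  provide "writer_class_name", "Traject::SolrJsonWriter"
  provide "solr.url", "http://localhost:8983/solr/collection1"  # illustrative URL
  provide "solr_writer.commit_on_close", "true"  # issue a commit when done
  provide "solr_writer.max_skipped", 100         # abort after this many bad records
end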