traject 1.1.0 → 2.0.0.rc.1

Files changed (51)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +20 -0
  3. data/README.md +85 -73
  4. data/doc/batch_execution.md +2 -6
  5. data/doc/other_commands.md +3 -5
  6. data/doc/settings.md +27 -38
  7. data/lib/traject/command_line.rb +1 -1
  8. data/lib/traject/csv_writer.rb +34 -0
  9. data/lib/traject/delimited_writer.rb +110 -0
  10. data/lib/traject/indexer.rb +29 -11
  11. data/lib/traject/indexer/settings.rb +39 -13
  12. data/lib/traject/line_writer.rb +10 -6
  13. data/lib/traject/marc_reader.rb +2 -1
  14. data/lib/traject/solr_json_writer.rb +277 -0
  15. data/lib/traject/thread_pool.rb +38 -48
  16. data/lib/traject/translation_map.rb +3 -0
  17. data/lib/traject/util.rb +13 -51
  18. data/lib/traject/version.rb +1 -1
  19. data/lib/translation_maps/marc_geographic.yaml +2 -2
  20. data/test/delimited_writer_test.rb +104 -0
  21. data/test/indexer/read_write_test.rb +0 -22
  22. data/test/indexer/settings_test.rb +24 -0
  23. data/test/solr_json_writer_test.rb +248 -0
  24. data/test/test_helper.rb +5 -3
  25. data/test/test_support/demo_config.rb +0 -5
  26. data/test/translation_map_test.rb +9 -0
  27. data/traject.gemspec +18 -5
  28. metadata +77 -87
  29. data/lib/traject/marc4j_reader.rb +0 -153
  30. data/lib/traject/solrj_writer.rb +0 -351
  31. data/test/marc4j_reader_test.rb +0 -136
  32. data/test/solrj_writer_test.rb +0 -209
  33. data/vendor/solrj/README +0 -8
  34. data/vendor/solrj/build.xml +0 -39
  35. data/vendor/solrj/ivy.xml +0 -16
  36. data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
  37. data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
  38. data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
  39. data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
  40. data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
  41. data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
  42. data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
  43. data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
  44. data/vendor/solrj/lib/noggit-0.5.jar +0 -0
  45. data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
  46. data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
  47. data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
  48. data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
  49. data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
  50. data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
  51. data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0

data/lib/traject/thread_pool.rb
@@ -1,28 +1,33 @@
+ require 'concurrent'
+ require 'thread' # for Queue
+
  module Traject
-   # An abstraction wrapping a threadpool executor in some configuration choices
-   # and other apparatus.
+   # An abstraction wrapping a Concurrent::ThreadPool in some configuration choices
+   # and other apparatus. Concurrent::ThreadPool is a Java ThreadPool executor on
+   # jruby for performance, and is ruby-concurrent's own ruby implementation otherwise.
    #
    # 1) Initialize with chosen pool size -- we create fixed size pools, where
    #    core and max sizes are the same.
    #
-   # 2) If initialized with nil for threadcount, no thread pool will actually
-   #    be created, and all threadpool-related methods become no-ops. We call this
-   #    the nil/null threadpool. A non-nil threadpool requires jruby, but you can
-   #    create a null Traject::ThreadPool.new(nil) under MRI without anything
-   #    complaining.
+   # 2) If initialized with nil or 0 for threadcount, no thread pool will actually
+   #    be created, and work sent to the Traject::ThreadPool will just be executed
+   #    in the caller thread. We call this a nil threadpool. One situation where it
+   #    can be useful is running under MRI, where multi-core parallelism isn't available,
+   #    so an actual threadpool may not be useful. (Although in some cases a thread pool,
+   #    especially one with size 1, can be useful in MRI for I/O blocking operations.)
    #
    # 3) Use the #maybe_in_threadpool method to send blocks to thread pool for
-   #    execution -- if no threadpool configured your block will just be
+   #    execution -- if configured with a nil threadcount, your block will just be
    #    executed in calling thread. Be careful to not refer to any non-local
    #    variables in the block, unless the variable has an object you can
    #    use thread-safely!
    #
-   # 4) Thread pools are java.util.concurrent.ThreadPoolExecutor, manually created
-   #    with a work queue that will buffer up to (pool_size*3) tasks. If queue is full,
-   #    the ThreadPoolExecutor is set up to use the ThreadPoolExecutor.CallerRunsPolicy,
+   # 4) We configure our underlying Concurrent::ThreadPool
+   #    with a work queue that will buffer up to (pool_size*3) tasks. If the queue is full,
+   #    the underlying Concurrent::ThreadPool is set up to use the :caller_runs policy,
    #    meaning the block will end up executing in caller's own thread. With the kind
    #    of work we're doing, where each unit of work is small and there are many of them--
-   #    the CallerRunsPolicy serves as an effective 'back pressure' mechanism to keep
+   #    the :caller_runs policy serves as an effective 'back pressure' mechanism to keep
    #    the work queue from getting too large and exhausting memory, when producers are
    #    faster than consumers.
    #
@@ -34,8 +39,8 @@ module Traject
   #    #shutdown_and_wait, which will wait for all current queued work
   #    to complete, then return. You can not give any more work to the pool
   #    after you do this. By default it'll wait pretty much forever, which should
-  #    be fine. If you never call shutdown, the pool will keep running forever
-  #    and not allow your program to exit!
+  #    be fine. If you never call shutdown, then queued or in-progress work
+  #    may be abandoned when the program ends, which would be bad.
   #
   # 7) We will keep track of total times a block is run in thread pool, and
   #    total elapsed (wall) time of running all blocks, so an average_execution_ms
@@ -43,33 +48,27 @@ module Traject
   #    threads are still executing, as it's not entirely thread safe (may get
   #    an off by one as to total iterations)
   class ThreadPool
-    attr_reader :pool_size, :label, :queue_capacity
+    attr_reader :pool_size, :queue_capacity

-    # First arg is pool size, 0 or nil and we'll be a null/no-op pool
+    # First arg is pool size, 0 or nil and we'll be a null/no-op pool which executes
+    # work in caller thread.
     def initialize(pool_size)
       unless pool_size.nil? || pool_size == 0
-        require 'java' # trigger an exception now if we're not jruby
-
-        @label = label
-
-        @pool_size = pool_size.to_i # just for reflection, we don't really need it again
+        @pool_size = pool_size.to_i
         @queue_capacity = pool_size * 3

-
-        blockingQueue = java.util.concurrent.ArrayBlockingQueue.new(@queue_capacity)
-        rejectedExecutionHandler = java.util.concurrent.ThreadPoolExecutor::CallerRunsPolicy.new
-
-        # keepalive times don't matter, we are setting core and max pool to
-        # same thing, fixed size pool.
-        @thread_pool = java.util.concurrent.ThreadPoolExecutor.new(
-          @pool_size, @pool_size, 0, java.util.concurrent.TimeUnit::MILLISECONDS,
-          blockingQueue, rejectedExecutionHandler)
+        @thread_pool = Concurrent::ThreadPoolExecutor.new(
+          :min_threads => @pool_size,
+          :max_threads => @pool_size,
+          :max_queue => @queue_capacity,
+          :fallback_policy => :caller_runs
+        )

         # A thread-safe queue to collect exceptions cross-threads.
-        # We make it small, we really only need to store the first
-        # exception, we don't care too much about others. But we'll
-        # keep the first 20, why not.
-        @async_exception_queue = java.util.concurrent.ArrayBlockingQueue.new(20)
+        # We really only need to save the first exception, but a queue
+        # is a convenient way to store a value concurrency-safely, and
+        # might as well store all of them.
+        @exceptions_caught_queue = Queue.new
       end
     end

@@ -106,7 +105,7 @@ module Traject
       start_t = Time.now

       if @thread_pool
-        @thread_pool.execute do
+        @thread_pool.post do
           begin
             yield(*args)
           rescue Exception => e
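As the comments at the top of the class warn, blocks handed to the pool should not reference shared non-local state. A hedged usage sketch of the pattern (method name per the class comments above; `records` and `index_record` are hypothetical):

```ruby
pool = Traject::ThreadPool.new(4)  # Traject::ThreadPool.new(nil) would run inline

records.each do |record|
  # Pass the record in as a block argument rather than closing over
  # the loop variable, so each task sees a stable value.
  pool.maybe_in_threadpool(record) do |rec|
    index_record(rec)  # hypothetical per-record work
  end
end

pool.raise_collected_exception!  # surface the first worker error, if any
pool.shutdown_and_wait
```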
@@ -119,21 +118,13 @@ module Traject

     end

-    # Just for monitoring/debugging purposes, we'll return the work queue
-    # used by the threadpool. Don't recommend you do anything with it, as
-    # the original java.util.concurrent docs make the same recommendation.
-    def queue
-      @thread_pool && @thread_pool.queue
-    end

     # thread-safe way of storing an exception, to raise
     # later in a different thread. We don't guarantee
     # that we can store more than one at a time, only
     # the first one recorded may be stored.
     def collect_exception(e)
-      # offer will silently do nothing if the queue is full, that's fine
-      # with us.
-      @async_exception_queue.offer(e)
+      @exceptions_caught_queue.push(e)
     end

     # If there's a stored collected exception, raise it
@@ -144,7 +135,8 @@ module Traject
     # as a non-functioning threadpool -- then this method is just
     # a no-op.
     def raise_collected_exception!
-      if @async_exception_queue && e = @async_exception_queue.poll
+      if @exceptions_caught_queue && (! @exceptions_caught_queue.empty?)
+        e = @exceptions_caught_queue.pop
         raise e
       end
     end
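The two methods above form a small cross-thread error-reporting pattern: workers push into a thread-safe Queue, and the coordinator re-raises later. The same pattern in isolation, using only the stdlib (the worker body is illustrative):

```ruby
require 'thread'

exceptions = Queue.new  # Queue#push/#pop are thread-safe without explicit locks

workers = 4.times.map do
  Thread.new do
    begin
      risky_work  # hypothetical; any raise here is captured rather than lost
    rescue Exception => e
      exceptions.push(e)  # like collect_exception above
    end
  end
end
workers.each(&:join)

# Like raise_collected_exception!: re-raise the first stored error in the
# coordinating thread. The empty?-then-pop pair is safe here because only
# this one thread ever pops.
raise exceptions.pop unless exceptions.empty?
```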
@@ -159,9 +151,7 @@ module Traject

       if @thread_pool
         @thread_pool.shutdown
-        # We pretty much want to wait forever, although we need to give
-        # a timeout. Okay, one day!
-        @thread_pool.awaitTermination(1, java.util.concurrent.TimeUnit::DAYS)
+        @thread_pool.wait_for_termination
       end

       return (Time.now - start_t)
data/lib/traject/translation_map.rb
@@ -171,6 +171,9 @@ module Traject
     def initialize(defn, options = {})
       if defn.kind_of? Hash
         @hash = defn
+      elsif defn.kind_of? self.class
+        @hash = defn.to_hash
+        @default = defn.default
       else
         @hash = self.class.cache.lookup(defn)
         raise NotFound.new(defn) if @hash.nil?
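The new elsif branch lets a Traject::TranslationMap be constructed from another TranslationMap, copying its lookup hash and default, where 1.x accepted only a Hash or a named definition. A brief sketch of what this enables (the map name is one shipped with traject, per the marc_geographic.yaml hunk below):

```ruby
require 'traject/translation_map'

base = Traject::TranslationMap.new("marc_geographic")

# New in 2.0: copy-construction from an existing map. The copy reuses
# base's hash and default rather than re-doing a definition lookup.
copy = Traject::TranslationMap.new(base)

copy["n-us"] == base["n-us"]  # => true
```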
data/lib/traject/util.rb
@@ -27,63 +27,25 @@ module Traject
     end


-    # Requires solrj jar(s) from settings['solrj.jar_dir'] if given, otherwise
-    # uses jars bundled with traject gem in ./vendor
-    #
-    # Have to pass in a settings arg, so we can check it for specified jar dir.
-    #
-    # Tries not to do the dirglob and require if solrj has already been loaded.
-    # Will define global constants with classes HttpSolrServer and SolrInputDocument
-    # if not already defined.
+
+    # Ruby stdlib queue lacks a 'drain' function, we write one.
     #
-    # This is all a bit janky, maybe there's a better way to do this? We do want
-    # a 'require' method defined somewhere utility, so multiple classes can
-    # use it, including extra gems. This method may be used by extra gems, so should
-    # be considered part of the API -- after it's called, those top-level
-    # globals should be available, and solrj should be loaded.
-    def self.require_solrj_jars(settings)
-      jruby_ensure_init!
+    # Removes everything currently in the ruby stdlib queue, and returns
+    # it in an array. Should be concurrent-safe, but queue may still have
+    # some things in it after drain, if there are concurrent writers.
+    def self.drain_queue(queue)
+      result = []

-      tries = 0
+      queue_size = queue.size
       begin
-        tries += 1
-
-        org.apache.solr
-        org.apache.solr.client.solrj
-
-        # java_import which we'd normally use weirdly doesn't work
-        # from a class method. https://github.com/jruby/jruby/issues/975
-        Object.const_set("HttpSolrServer", org.apache.solr.client.solrj.impl.HttpSolrServer) unless defined? ::HttpSolrServer
-        Object.const_set("SolrInputDocument", org.apache.solr.common.SolrInputDocument) unless defined? ::SolrInputDocument
-      rescue NameError => e
-        included_jar_dir = File.expand_path("../../vendor/solrj/lib", File.dirname(__FILE__))
-
-        jardir = settings["solrj.jar_dir"] || included_jar_dir
-        Dir.glob("#{jardir}/*.jar") do |x|
-          require x
-        end
-        if tries > 1
-          raise LoadError.new("Can not find SolrJ java classes")
-        else
-          retry
+        queue_size.times do
+          result << queue.deq(:raise_if_empty)
        end
+      rescue ThreadError
+        # Need do nothing, queue was concurrently popped, no biggie
      end
-    end

-    # just does a `require 'java'` but rescues the exception if we
-    # aren't jruby, and raises a better error message.
-    # Pass in a developer-presentable name of a feature to include in the error
-    # message if you want.
-    def self.jruby_ensure_init!(feature = nil)
-      begin
-        require 'java'
-      rescue LoadError => e
-        feature ||= "A traject feature is in use that"
-        msg = if feature
-          "#{feature} requires jruby, but you do not appear to be running under jruby. We recommend `chruby` for managing multiple ruby installs."
-        end
-        raise LoadError.new(msg)
-      end
+      return result
    end

  end
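A usage sketch for the new helper, matching the semantics its comment describes (the queue contents are illustrative):

```ruby
require 'thread'
require 'traject/util'

queue = Queue.new
5.times { |i| queue << "item-#{i}" }

batch = Traject::Util.drain_queue(queue)
# batch => ["item-0", "item-1", "item-2", "item-3", "item-4"]
# If another thread were popping concurrently, some items would simply
# be absent from the array: the ThreadError rescue makes that a no-op.
```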
data/lib/traject/version.rb
@@ -1,3 +1,3 @@
  module Traject
-   VERSION = "1.1.0"
+   VERSION = "2.0.0.rc.1"
  end
data/lib/translation_maps/marc_geographic.yaml
@@ -1,5 +1,5 @@
  # Translation map for marc geographic codes constructed by `rake load_maps:marc_geographic` task
- # Scraped from http://www.loc.gov/marc/geoareas/gacs_code.html at 2013-07-31 12:05:20 -0400
+ # Scraped from http://www.loc.gov/marc/geoareas/gacs_code.html at 2015-01-27 23:00:08 -0500
  # Intentionally includes discontinued codes.

  'a': 'Asia'
@@ -327,7 +327,7 @@
  'lnaz': 'Azores'
  'lnbm': 'Bermuda Islands'
  'lnca': 'Canary Islands'
- 'lncv': 'Cape Verde'
+ 'lncv': 'Cabo Verde'
  'lnfa': 'Faroe Islands'
  'lnjn': 'Jan Mayen Island'
  'lnma': 'Madeira Islands'
data/test/delimited_writer_test.rb (new file)
@@ -0,0 +1,104 @@
+ # Encoding: UTF-8
+
+ require 'test_helper'
+ require 'stringio'
+ require 'traject/delimited_writer'
+ require 'traject/csv_writer'
+
+ require 'csv'
+
+ describe "Delimited/CSV Writers" do
+
+   before do
+     @out = StringIO.new
+     @settings = {'output_stream' => @out, 'delimited_writer.fields' => 'four,one,two'}
+     @context = Struct.new(:output_hash).new
+     @context.output_hash = {'one' => 'one', 'two' => %w[two1 two2], 'three' => 'three', 'four' => 'four'}
+   end
+
+   after do
+     @out.close
+   end
+
+   describe "Traject::DelimitedWriter" do
+
+     it "creates a dw with defaults" do
+       dw = Traject::DelimitedWriter.new(@settings)
+       dw.delimiter.must_equal "\t"
+       dw.internal_delimiter.must_equal '|'
+       dw.edelim.must_equal ' '
+       dw.eidelim.must_equal '\\|'
+     end
+
+     it "respects different delimiter" do
+       @settings['delimited_writer.delimiter'] = '^'
+       dw = Traject::DelimitedWriter.new(@settings)
+       dw.delimiter.must_equal '^'
+       dw.edelim.must_equal '\\^'
+       dw.internal_delimiter.must_equal '|'
+     end
+
+     it "outputs a header if asked to" do
+       dw = Traject::DelimitedWriter.new(@settings)
+       @out.string.chomp.must_equal %w[four one two].join("\t")
+     end
+
+     it "doesn't output a header if asked not to" do
+       @settings['delimited_writer.header'] = 'false'
+       dw = Traject::DelimitedWriter.new(@settings)
+       @out.string.must_be_empty
+     end
+
+     it "deals with multiple values" do
+       dw = Traject::DelimitedWriter.new(@settings)
+       dw.put @context
+       @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(dw.delimiter)
+     end
+
+     it "bails if delimited_writer.fields isn't set" do
+       @settings.delete 'delimited_writer.fields'
+       proc { Traject::DelimitedWriter.new(@settings) }.must_raise(ArgumentError)
+     end
+
+   end
+
+   describe "Traject::CSVWriter" do
+     it "unsets the delimiter" do
+       cw = Traject::CSVWriter.new(@settings)
+       cw.delimiter.must_be_nil
+     end
+
+     it "writes the header" do
+       cw = Traject::CSVWriter.new(@settings)
+       @out.string.chomp.must_equal 'four,one,two'
+     end
+
+     it "uses the internal delimiter" do
+       cw = Traject::CSVWriter.new(@settings)
+       cw.put @context
+       @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(',')
+     end
+
+     it "produces complex output" do
+       @context.output_hash = {
+         'four' => ['Bill Clinton, Jr.', 'Jesse "the Body" Ventura'],
+         'one' => 'Willard "Mitt" Romney',
+         'two' => 'Dueber, Bill'
+       }
+       canonical = StringIO.new
+       csv = CSV.new(canonical)
+
+       csv_vals = [@context.output_hash['four'].join('|'), @context.output_hash['one'], @context.output_hash['two']]
+       csv << csv_vals
+       csv_output = canonical.string.chomp
+
+       cw = Traject::CSVWriter.new(@settings)
+       cw.put @context
+       traject_csvwriter_output = @out.string.split("\n").last.chomp
+
+       assert_equal(csv_output, traject_csvwriter_output)
+     end
+
+   end
+ end
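The settings these tests exercise map directly onto a traject configuration. A hedged sketch of driving the new writer from a config file (field names and output path are illustrative):

```ruby
# config.rb -- write one tab-separated line per record
settings do
  provide "writer_class_name", "Traject::DelimitedWriter"
  provide "output_file", "records.tsv"
  provide "delimited_writer.fields", "id,title,author" # required; ArgumentError if missing
  # Defaults per the tests above; uncomment to override:
  # provide "delimited_writer.delimiter", "\t"
  # provide "delimited_writer.internal_delimiter", "|"
  # provide "delimited_writer.header", "true"
end
```

Swapping writer_class_name to Traject::CSVWriter yields comma-separated output with CSV quoting instead, ignoring the delimiter setting, as the CSVWriter tests show.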
data/test/indexer/read_write_test.rb
@@ -62,29 +62,9 @@ describe "Traject::Indexer#process" do
     assert writer_settings["memory_writer.closed"]
   end

-  it "returns false if skipped records" do
-    @indexer = Traject::Indexer.new(
-      "solrj_writer.server_class_name" => "MockSolrServer",
-      "solr.url" => "http://example.org",
-      "writer_class_name" => "Traject::SolrJWriter"
-    )
-    @file = File.open(support_file_path "manufacturing_consent.marc")
-
-
-    @indexer.to_field("id") do |record, accumulator|
-      # intentionally make error
-      accumulator.concat ["one_id", "two_id"]
-    end
-    return_value = @indexer.process(@file)
-
-    assert ! return_value, "returns false on skipped record errors"
-  end
-
   require 'traject/null_writer'
   it "calls after_processing after processing" do
     @indexer = Traject::Indexer.new(
-      "solrj_writer.server_class_name" => "MockSolrServer",
-      "solr.url" => "http://example.org",
       "writer_class_name" => "Traject::NullWriter"
     )
     @file = File.open(support_file_path "test_data.utf8.mrc")
@@ -106,8 +86,6 @@ describe "Traject::Indexer#process" do
   describe "demo_config.rb" do
     before do
       @indexer = Traject::Indexer.new(
-        "solrj_writer.server_class_name" => "MockSolrServer",
-        "solr.url" => "http://example.org",
         "writer_class_name" => "Traject::NullWriter"
       )
     end
data/test/indexer/settings_test.rb
@@ -124,5 +124,29 @@ describe "Traject::Indexer#settings" do
      assert_equal( {"a" => "a", "password" => "[hidden]", "some_password" => "[hidden]", "some.password" => "[hidden]"}, parsed)
    end
  end
+
+  describe "JRuby / MRI" do
+    before do
+      @indexer = Traject::Indexer.new
+    end
+
+    it "has the right indexer name" do
+      if defined? JRUBY_VERSION
+        assert_equal "Traject::Marc4JReader", @indexer.settings['reader_class_name']
+      else
+        assert_equal "Traject::MarcReader", @indexer.settings['reader_class_name']
+      end
+    end
+
+    # This next one has the added effect of making sure the correct class
+    # has actually been loaded -- otherwise the constant wouldn't be available
+    it "has the correct default indexer class based on platform" do
+      if defined? JRUBY_VERSION
+        assert_equal Traject::Marc4JReader, @indexer.reader_class
+      else
+        assert_equal Traject::MarcReader, @indexer.reader_class
+      end
+    end
+  end

 end
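These tests pin down the 2.0 split: Traject::Marc4JReader is the default reader on JRuby, and the pure-ruby Traject::MarcReader everywhere else. A config that wants to be explicit can use the same platform check the tests do; a minimal sketch:

```ruby
# config.rb -- mirror the platform check used in the tests above
settings do
  if defined? JRUBY_VERSION
    provide "reader_class_name", "Traject::Marc4JReader"
  else
    provide "reader_class_name", "Traject::MarcReader"
  end
end
```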
data/test/solr_json_writer_test.rb (new file)
@@ -0,0 +1,248 @@
+ require 'test_helper'
+ require 'httpclient'
+ require 'traject/solr_json_writer'
+ require 'thread'
+ require 'json'
+ require 'stringio'
+ require 'logger'
+
+
+ # Some basic tests, using a mocked HTTPClient so we can see what it did --
+ # these tests do not run against a real solr server at present.
+ describe "Traject::SolrJsonWriter" do
+
+   #######
+   # A bunch of utilities to help testing
+   #######
+
+   class FakeHTTPClient
+     # Always reply with this status, normally 200, can
+     # be reset for testing error conditions.
+     attr_accessor :response_status
+     attr_accessor :allow_update_json_path
+
+     def initialize(*args)
+       @post_args = []
+       @get_args = []
+       @response_status = 200
+       @allow_update_json_path = true
+       @mutex = Monitor.new
+     end
+
+     def post(*args)
+       @mutex.synchronize do
+         @post_args << args
+       end
+
+       resp = HTTP::Message.new_response("")
+       resp.status = self.response_status
+
+       return resp
+     end
+
+     def get(*args)
+       @mutex.synchronize do
+         @get_args << args
+       end
+
+       resp = HTTP::Message.new_response("")
+       resp.status = self.response_status
+
+       if args.first.end_with?("/update/json") && ! self.allow_update_json_path
+         # Need to test auto-detection of /update/json being available
+         resp.status = 404
+       end
+
+       return resp
+     end
+
+     def post_args
+       @mutex.synchronize do
+         @post_args.dup
+       end
+     end
+
+     def get_args
+       @mutex.synchronize do
+         @get_args.dup
+       end
+     end
+
+     # Everything else, just return nil please
+     def method_missing(*args)
+     end
+   end
+
+
+   def context_with(hash)
+     Traject::Indexer::Context.new(:output_hash => hash)
+   end
+
+   def create_writer(settings = {})
+     settings = {
+       "solr.url" => "http://example.com/solr",
+       "solr_json_writer.http_client" => FakeHTTPClient.new
+     }.merge!(settings)
+     @fake_http_client = settings["solr_json_writer.http_client"]
+
+     writer = Traject::SolrJsonWriter.new(settings)
+
+     return writer
+   end
+
+   # strio = StringIO.new
+   # logger_to_strio(strio)
+   #
+   # Later check strio.string for contents
+   def logger_to_strio(strio)
+     # Yell makes this hard, let's do it with an ordinary logger, think
+     # it's okay.
+     Logger.new(strio)
+   end
+
+   #########
+   # Actual tests
+   #########
+
+   before do
+     @writer = create_writer
+   end
+
+   it "defaults to 1 bg thread" do
+     assert_equal 1, @writer.thread_pool_size
+   end
+
+   it "adds a document" do
+     @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
+     @writer.close
+
+     post_args = @fake_http_client.post_args.first
+
+     refute_nil post_args
+
+     assert_equal "http://example.com/solr/update/json", post_args[0]
+
+     refute_nil post_args[1]
+     posted_json = JSON.parse(post_args[1])
+
+     assert_equal [{"id" => "one", "key" => ["value1", "value2"]}], posted_json
+   end
+
+   it "adds more than a batch in batches" do
+     (Traject::SolrJsonWriter::DEFAULT_BATCH_SIZE + 1).times do |i|
+       doc = {"id" => "doc_#{i}", "key" => "value"}
+       @writer.put context_with(doc)
+     end
+     @writer.close
+
+     post_args = @fake_http_client.post_args
+
+     assert_length 2, post_args, "Makes two posts to Solr for two batches"
+
+     assert_length Traject::SolrJsonWriter::DEFAULT_BATCH_SIZE, JSON.parse(post_args[0][1]), "first batch posted with batch size docs"
+     assert_length 1, JSON.parse(post_args[1][1]), "second batch posted with last remaining doc"
+   end
+
+   it "commits on close when set" do
+     @writer = create_writer("solr.url" => "http://example.com", "solr_writer.commit_on_close" => "true")
+     @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
+     @writer.close
+
+     last_solr_get = @fake_http_client.get_args.last
+
+     assert_equal "http://example.com/update/json", last_solr_get[0]
+     assert_equal( {"commit" => "true"}, last_solr_get[1] )
+   end
+
+   describe "skipped records" do
+     it "skips and reports under max_skipped" do
+       strio = StringIO.new
+       @writer = create_writer("solr_writer.max_skipped" => 10, "logger" => logger_to_strio(strio))
+       @fake_http_client.response_status = 500
+
+       10.times do |i|
+         @writer.put context_with("id" => "doc_#{i}", "key" => "value")
+       end
+       @writer.close
+
+       assert_equal 10, @writer.skipped_record_count
+
+       logged = strio.string
+
+       10.times do |i|
+         assert_match /ERROR.*Could not add record doc_#{i} at source file position : Solr error response: 500/, logged
+       end
+     end
+
+     it "raises when skipped more than max_skipped" do
+       @writer = create_writer("solr_writer.max_skipped" => 5)
+       @fake_http_client.response_status = 500
+
+       e = assert_raises(RuntimeError) do
+         6.times do |i|
+           @writer.put context_with("id" => "doc_#{i}", "key" => "value")
+         end
+         @writer.close
+       end
+
+       assert_includes e.message, "Exceeded maximum number of skipped records"
+     end
+
+     it "raises on one skipped record when max_skipped is 0" do
+       @writer = create_writer("solr_writer.max_skipped" => 0)
+       @fake_http_client.response_status = 500
+
+       e = assert_raises(RuntimeError) do
+         @writer.put context_with("id" => "doc_1", "key" => "value")
+         @writer.close
+       end
+     end
+   end
+
+   describe "auto-discovers proper update path" do
+     it "finds /update/json" do
+       assert_equal "http://example.com/solr/update/json", @writer.determine_solr_update_url
+     end
+
+     it "resorts to plain /update" do
+       @fake_http_client = FakeHTTPClient.new
+       @fake_http_client.allow_update_json_path = false
+
+       @writer = create_writer("solr.url" => "http://example.com/solr",
+                               "solr_json_writer.http_client" => @fake_http_client)
+
+       assert_equal "http://example.com/solr/update", @writer.determine_solr_update_url
+     end
+   end
+
+   describe "Record id from context" do
+     before do
+       @record = MARC::Reader.new(support_file_path('test_data.utf8.mrc')).first
+       @context = Traject::Indexer::Context.new
+       @writer = create_writer
+       @record_001 = " 00282214 " # from the mrc file
+     end
+
+     it "gets it from 001" do
+       @context.source_record = @record
+       assert_equal @record_001, @writer.record_id_from_context(@context)
+     end
+
+     it "gets it from the id" do
+       @context.output_hash['id'] = 'the_record_id'
+       assert_equal 'the_record_id', @writer.record_id_from_context(@context)
+     end
+
+     it "gets it from both 001 and id" do
+       @context.output_hash['id'] = 'the_record_id'
+       @context.source_record = @record
+       assert_equal [@record_001, 'the_record_id'].join('/'), @writer.record_id_from_context(@context)
+     end
+   end
+ end
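Taken together, these tests outline how the new pure-HTTP writer is configured: SolrJsonWriter replaces the removed jruby-only SolrJWriter (see the deleted solrj_writer.rb and vendor/solrj jars above). A hedged sketch of a typical 2.0 configuration built only from settings the tests exercise (the Solr URL is illustrative):

```ruby
# config.rb -- Traject::SolrJsonWriter, successor to SolrJWriter in 2.0
settings do
  provide "writer_class_name", "Traject::SolrJsonWriter"
  provide "solr.url", "http://localhost:8983/solr/my_core"
  provide "solr_writer.commit_on_close", "true"  # GET ...?commit=true at close
  provide "solr_writer.max_skipped", 10          # raise after this many failed adds
  # An HTTPClient-compatible object can be injected, as the tests do
  # with FakeHTTPClient:
  # provide "solr_json_writer.http_client", HTTPClient.new
end
```

Because the writer speaks Solr's JSON update API over plain HTTP, it runs on MRI as well as JRuby, which is what lets 2.0 drop the vendored solrj jars entirely.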