traject 1.1.0 → 2.0.0.rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +20 -0
  3. data/README.md +85 -73
  4. data/doc/batch_execution.md +2 -6
  5. data/doc/other_commands.md +3 -5
  6. data/doc/settings.md +27 -38
  7. data/lib/traject/command_line.rb +1 -1
  8. data/lib/traject/csv_writer.rb +34 -0
  9. data/lib/traject/delimited_writer.rb +110 -0
  10. data/lib/traject/indexer.rb +29 -11
  11. data/lib/traject/indexer/settings.rb +39 -13
  12. data/lib/traject/line_writer.rb +10 -6
  13. data/lib/traject/marc_reader.rb +2 -1
  14. data/lib/traject/solr_json_writer.rb +277 -0
  15. data/lib/traject/thread_pool.rb +38 -48
  16. data/lib/traject/translation_map.rb +3 -0
  17. data/lib/traject/util.rb +13 -51
  18. data/lib/traject/version.rb +1 -1
  19. data/lib/translation_maps/marc_geographic.yaml +2 -2
  20. data/test/delimited_writer_test.rb +104 -0
  21. data/test/indexer/read_write_test.rb +0 -22
  22. data/test/indexer/settings_test.rb +24 -0
  23. data/test/solr_json_writer_test.rb +248 -0
  24. data/test/test_helper.rb +5 -3
  25. data/test/test_support/demo_config.rb +0 -5
  26. data/test/translation_map_test.rb +9 -0
  27. data/traject.gemspec +18 -5
  28. metadata +77 -87
  29. data/lib/traject/marc4j_reader.rb +0 -153
  30. data/lib/traject/solrj_writer.rb +0 -351
  31. data/test/marc4j_reader_test.rb +0 -136
  32. data/test/solrj_writer_test.rb +0 -209
  33. data/vendor/solrj/README +0 -8
  34. data/vendor/solrj/build.xml +0 -39
  35. data/vendor/solrj/ivy.xml +0 -16
  36. data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
  37. data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
  38. data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
  39. data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
  40. data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
  41. data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
  42. data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
  43. data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
  44. data/vendor/solrj/lib/noggit-0.5.jar +0 -0
  45. data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
  46. data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
  47. data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
  48. data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
  49. data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
  50. data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
  51. data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
data/lib/traject/thread_pool.rb

@@ -1,28 +1,33 @@
+ require 'concurrent'
+ require 'thread' # for Queue
+
  module Traject
- # An abstraction wrapping a threadpool executor in some configuration choices
- # and other apparatus.
+ # An abstraction wrapping a Concurrent::ThreadPool in some configuration choices
+ # and other apparatus. Concurrent::ThreadPool is a Java ThreadPool executor on
+ # jruby for performance, and is ruby-concurrent's own ruby implementation otherwise.
  #
  # 1) Initialize with chosen pool size -- we create fixed size pools, where
  # core and max sizes are the same.
  #
- # 2) If initialized with nil for threadcount, no thread pool will actually
- # be created, and all threadpool-related methods become no-ops. We call this
- # the nil/null threadpool. A non-nil threadpool requires jruby, but you can
- # create a null Traject::ThreadPool.new(nil) under MRI without anything
- # complaining.
+ # 2) If initialized with nil or 0 for threadcount, no thread pool will actually
+ # be created, and work sent to the Traject::ThreadPool will just be executed
+ # in the caller thread. We call this a nil threadpool. One situation it can be useful
+ # is if you are running under MRI, where multi-core parallelism isn't available, so
+ # an actual threadpool may not be useful. (Although in some cases a thread pool,
+ # especially one with size 1, can be useful in MRI for I/O blocking operations)
  #
  # 3) Use the #maybe_in_threadpool method to send blocks to thread pool for
- # execution -- if no threadpool configured your block will just be
+ # execution -- if configurred with a nil threadcount, your block will just be
  # executed in calling thread. Be careful to not refer to any non-local
  # variables in the block, unless the variable has an object you can
  # use thread-safely!
  #
- # 4) Thread pools are java.util.concurrent.ThreadPoolExecutor, manually created
- # with a work queue that will buffer up to (pool_size*3) tasks. If queue is full,
- # the ThreadPoolExecutor is set up to use the ThreadPoolExecutor.CallerRunsPolicy,
+ # 4) We configure our underlying Concurrent::ThreadPool
+ # with a work queue that will buffer up to (pool_size*3) tasks. If the queue is full,
+ # the underlying Concurrent::ThreadPool is set up to use the :caller_runs policy
  # meaning the block will end up executing in caller's own thread. With the kind
  # of work we're doing, where each unit of work is small and there are many of them--
- # the CallerRunsPolicy serves as an effective 'back pressure' mechanism to keep
+ # the :caller_runs policy serves as an effective 'back pressure' mechanism to keep
  # the work queue from getting too large and exhausting memory, when producers are
  # faster than consumers.
  #
@@ -34,8 +39,8 @@ module Traject
  # #shutdown_and_wait, which will wait for all current queued work
  # to complete, then return. You can not give any more work to the pool
  # after you do this. By default it'll wait pretty much forever, which should
- # be fine. If you never call shutdown, the pool will keep running forever
- # and not allow your program to exit!
+ # be fine. If you never call shutdown, then queued or in-progress work
+ # may be abandoned when the program ends, which would be bad.
  #
  # 7) We will keep track of total times a block is run in thread pool, and
  # total elapsed (wall) time of running all blocks, so an average_execution_ms
@@ -43,33 +48,27 @@ module Traject
  # threads are still executing, as it's not entirely thread safe (may get
  # an off by one as to total iterations)
  class ThreadPool
- attr_reader :pool_size, :label, :queue_capacity
+ attr_reader :pool_size, :queue_capacity

- # First arg is pool size, 0 or nil and we'll be a null/no-op pool
+ # First arg is pool size, 0 or nil and we'll be a null/no-op pool which executes
+ # work in caller thread.
  def initialize(pool_size)
  unless pool_size.nil? || pool_size == 0
- require 'java' # trigger an exception now if we're not jruby
-
- @label = label
-
- @pool_size = pool_size.to_i # just for reflection, we don't really need it again
+ @pool_size = pool_size.to_i
  @queue_capacity = pool_size * 3

-
- blockingQueue = java.util.concurrent.ArrayBlockingQueue.new(@queue_capacity)
- rejectedExecutionHandler = java.util.concurrent.ThreadPoolExecutor::CallerRunsPolicy.new
-
- # keepalive times don't matter, we are setting core and max pool to
- # same thing, fixed size pool.
- @thread_pool = java.util.concurrent.ThreadPoolExecutor.new(
- @pool_size, @pool_size, 0, java.util.concurrent.TimeUnit::MILLISECONDS,
- blockingQueue, rejectedExecutionHandler)
+ @thread_pool = Concurrent::ThreadPoolExecutor.new(
+ :min_threads => @pool_size,
+ :max_threads => @pool_size,
+ :max_queue => @queue_capacity,
+ :fallback_policy => :caller_runs
+ )

  # A thread-safe queue to collect exceptions cross-threads.
- # We make it small, we really only need to store the first
- # exception, we don't care too much about others. But we'll
- # keep the first 20, why not.
- @async_exception_queue = java.util.concurrent.ArrayBlockingQueue.new(20)
+ # We really only need to save the first exception, but a queue
+ # is a convenient way to store a value concurrency-safely, and
+ # might as well store all of them.
+ @exceptions_caught_queue = Queue.new
  end
  end

@@ -106,7 +105,7 @@ module Traject
  start_t = Time.now

  if @thread_pool
- @thread_pool.execute do
+ @thread_pool.post do
  begin
  yield(*args)
  rescue Exception => e
@@ -119,21 +118,13 @@ module Traject

  end

- # Just for monitoring/debugging purposes, we'll return the work queue
- # used by the threadpool. Don't recommend you do anything with it, as
- # the original java.util.concurrent docs make the same recommendation.
- def queue
- @thread_pool && @thread_pool.queue
- end

  # thread-safe way of storing an exception, to raise
  # later in a different thread. We don't guarantee
  # that we can store more than one at a time, only
  # the first one recorded may be stored.
  def collect_exception(e)
- # offer will silently do nothing if the queue is full, that's fine
- # with us.
- @async_exception_queue.offer(e)
+ @exceptions_caught_queue.push(e)
  end

  # If there's a stored collected exception, raise it
@@ -144,7 +135,8 @@ module Traject
  # as a non-functioning threadpool -- then this method is just
  # a no-op.
  def raise_collected_exception!
- if @async_exception_queue && e = @async_exception_queue.poll
+ if @exceptions_caught_queue && (! @exceptions_caught_queue.empty?)
+ e = @exceptions_caught_queue.pop
  raise e
  end
  end
@@ -159,9 +151,7 @@ module Traject

  if @thread_pool
  @thread_pool.shutdown
- # We pretty much want to wait forever, although we need to give
- # a timeout. Okay, one day!
- @thread_pool.awaitTermination(1, java.util.concurrent.TimeUnit::DAYS)
+ @thread_pool.wait_for_termination
  end

  return (Time.now - start_t)
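
Taken together, the hunks above swap the JRuby-only java.util.concurrent executor for concurrent-ruby's Concurrent::ThreadPoolExecutor, so the pool now works under MRI as well. A minimal usage sketch of the public API visible in this diff -- `records` and `do_some_work` are hypothetical stand-ins:

    require 'traject/thread_pool'

    pool = Traject::ThreadPool.new(4) # nil or 0 gives a no-op pool that runs work inline

    records.each do |record|                  # hypothetical record source
      pool.maybe_in_threadpool(record) do |rec|
        do_some_work(rec)                     # hypothetical; touch only block args or thread-safe objects
      end
    end

    pool.shutdown_and_wait          # blocks until all queued work completes
    pool.raise_collected_exception! # re-raises the first exception any worker raised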
data/lib/traject/translation_map.rb

@@ -171,6 +171,9 @@ module Traject
  def initialize(defn, options = {})
  if defn.kind_of? Hash
  @hash = defn
+ elsif defn.kind_of? self.class
+ @hash = defn.to_hash
+ @default = defn.default
  else
  @hash = self.class.cache.lookup(defn)
  raise NotFound.new(defn) if @hash.nil?
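
The new `elsif` branch means a TranslationMap can now be initialized from another TranslationMap, carrying over its hash and its default. A small sketch of the resulting behavior, with map contents invented for illustration:

    require 'traject/translation_map'

    base = Traject::TranslationMap.new({'a' => 'Asia'}, :default => 'Unknown')
    copy = Traject::TranslationMap.new(base)  # new in 2.0: build one map from another

    copy['a']    # => 'Asia'
    copy['zzz']  # => 'Unknown', the default carried over from base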
data/lib/traject/util.rb

@@ -27,63 +27,25 @@ module Traject
  end


- # Requires solrj jar(s) from settings['solrj.jar_dir'] if given, otherwise
- # uses jars bundled with traject gem in ./vendor
- #
- # Have to pass in a settings arg, so we can check it for specified jar dir.
- #
- # Tries not to do the dirglob and require if solrj has already been loaded.
- # Will define global constants with classes HttpSolrServer and SolrInputDocument
- # if not already defined.
+
+ # Ruby stdlib queue lacks a 'drain' function, we write one.
  #
- # This is all a bit janky, maybe there's a better way to do this? We do want
- # a 'require' method defined somewhere utility, so multiple classes can
- # use it, including extra gems. This method may be used by extra gems, so should
- # be considered part of the API -- after it's called, those top-level
- # globals should be available, and solrj should be loaded.
- def self.require_solrj_jars(settings)
- jruby_ensure_init!
+ # Removes everything currently in the ruby stdlib queue, and returns
+ # it an array. Should be concurrent-safe, but queue may still have
+ # some things in it after drain, if there are concurrent writers.
+ def self.drain_queue(queue)
+ result = []

- tries = 0
+ queue_size = queue.size
  begin
- tries += 1
-
- org.apache.solr
- org.apache.solr.client.solrj
-
- # java_import which we'd normally use weirdly doesn't work
- # from a class method. https://github.com/jruby/jruby/issues/975
- Object.const_set("HttpSolrServer", org.apache.solr.client.solrj.impl.HttpSolrServer) unless defined? ::HttpSolrServer
- Object.const_set("SolrInputDocument", org.apache.solr.common.SolrInputDocument) unless defined? ::SolrInputDocument
- rescue NameError => e
- included_jar_dir = File.expand_path("../../vendor/solrj/lib", File.dirname(__FILE__))
-
- jardir = settings["solrj.jar_dir"] || included_jar_dir
- Dir.glob("#{jardir}/*.jar") do |x|
- require x
- end
- if tries > 1
- raise LoadError.new("Can not find SolrJ java classes")
- else
- retry
+ queue_size.times do
+ result << queue.deq(:raise_if_empty)
  end
+ rescue ThreadError
+ # Need do nothing, queue was concurrently popped, no biggie
  end
- end

- # just does a `require 'java'` but rescues the exception if we
- # aren't jruby, and raises a better error message.
- # Pass in a developer-presentable name of a feature to include in the error
- # message if you want.
- def self.jruby_ensure_init!(feature = nil)
- begin
- require 'java'
- rescue LoadError => e
- feature ||= "A traject feature is in use that"
- msg = if feature
- "#{feature} requires jruby, but you do not appear to be running under jruby. We recommend `chruby` for managing multiple ruby installs."
- end
- raise LoadError.new(msg)
- end
+ return result
  end

  end
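
Traject::Util thus loses its SolrJ/JRuby loaders and gains `drain_queue`. A quick sketch of its contract, as described in the comments above:

    require 'thread'
    require 'traject/util'

    q = Queue.new
    q.push 'a'
    q.push 'b'

    Traject::Util.drain_queue(q)  # => ['a', 'b']
    q.size                        # => 0, unless another thread pushed concurrently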
data/lib/traject/version.rb

@@ -1,3 +1,3 @@
  module Traject
- VERSION = "1.1.0"
+ VERSION = "2.0.0.rc.1"
  end
data/lib/translation_maps/marc_geographic.yaml

@@ -1,5 +1,5 @@
  # Translation map for marc geographic codes constructed by `rake load_maps:marc_geographic` task
- # Scraped from http://www.loc.gov/marc/geoareas/gacs_code.html at 2013-07-31 12:05:20 -0400
+ # Scraped from http://www.loc.gov/marc/geoareas/gacs_code.html at 2015-01-27 23:00:08 -0500
  # Intentionally includes discontinued codes.

  'a': 'Asia'
@@ -327,7 +327,7 @@
  'lnaz': 'Azores'
  'lnbm': 'Bermuda Islands'
  'lnca': 'Canary Islands'
- 'lncv': 'Cape Verde'
+ 'lncv': 'Cabo Verde'
  'lnfa': 'Faroe Islands'
  'lnjn': 'Jan Mayen Island'
  'lnma': 'Madeira Islands'
data/test/delimited_writer_test.rb

@@ -0,0 +1,104 @@
+ # Encoding: UTF-8
+
+ require 'test_helper'
+ require 'stringio'
+ require 'traject/delimited_writer'
+ require 'traject/csv_writer'
+
+ require 'csv'
+
+ describe "Delimited/CSV Writers" do
+
+ before do
+ @out = StringIO.new
+ @settings = {'output_stream' => @out, 'delimited_writer.fields' => 'four,one,two'}
+ @context = Struct.new(:output_hash).new
+ @context.output_hash = {'one' => 'one', 'two' => %w[two1 two2], 'three' => 'three', 'four' => 'four'}
+ end
+
+ after do
+ @out.close
+ end
+
+ describe "Traject::DelimitedWriter" do
+
+ it "creates a dw with defaults" do
+ dw = Traject::DelimitedWriter.new(@settings)
+ dw.delimiter.must_equal "\t"
+ dw.internal_delimiter.must_equal '|'
+ dw.edelim.must_equal ' '
+ dw.eidelim.must_equal '\\|'
+ end
+
+ it "respects different delimiter" do
+ @settings['delimited_writer.delimiter'] = '^'
+ dw = Traject::DelimitedWriter.new(@settings)
+ dw.delimiter.must_equal '^'
+ dw.edelim.must_equal '\\^'
+ dw.internal_delimiter.must_equal '|'
+ end
+
+ it "outputs a header if asked to" do
+ dw = Traject::DelimitedWriter.new(@settings)
+ @out.string.chomp.must_equal %w[four one two].join("\t")
+ end
+
+ it "doesn't output a header if asked not to" do
+ @settings['delimited_writer.header'] = 'false'
+ dw = Traject::DelimitedWriter.new(@settings)
+ @out.string.must_be_empty
+ end
+
+ it "deals with multiple values" do
+ dw = Traject::DelimitedWriter.new(@settings)
+ dw.put @context
+ @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(dw.delimiter)
+ end
+
+ it "bails if delimited_writer.fields isn't set" do
+ @settings.delete 'delimited_writer.fields'
+ proc { Traject::DelimitedWriter.new(@settings) }.must_raise(ArgumentError)
+ end
+
+ end
+
+ describe "Traject::CSVWriter" do
+ it "unsets the delimiter" do
+ cw = Traject::CSVWriter.new(@settings)
+ cw.delimiter.must_be_nil
+ end
+
+ it "writes the header" do
+ cw = Traject::CSVWriter.new(@settings)
+ @out.string.chomp.must_equal 'four,one,two'
+ end
+
+ it "uses the internal delimiter" do
+ cw = Traject::CSVWriter.new(@settings)
+ cw.put @context
+ @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(',')
+ end
+
+ it "produces complex output" do
+ @context.output_hash = {
+ 'four' => ['Bill Clinton, Jr.', 'Jesse "the Body" Ventura'],
+ 'one' => 'Willard "Mitt" Romney',
+ 'two' => 'Dueber, Bill'
+ }
+ canonical = StringIO.new
+ csv = CSV.new(canonical)
+
+ csv_vals = [@context.output_hash['four'].join('|'), @context.output_hash['one'], @context.output_hash['two']]
+ csv << csv_vals
+ csv_output = canonical.string.chomp
+
+ cw = Traject::CSVWriter.new(@settings)
+ cw.put @context
+ traject_csvwriter_output = @out.string.split("\n").last.chomp
+
+ assert_equal(csv_output, traject_csvwriter_output)
+
+ end
+
+ end
+ end
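
These tests double as documentation for the new writers' settings. A plausible traject configuration using them -- the `delimited_writer.*` keys are the ones exercised above, while `output_file` is assumed to be the usual LineWriter-family output setting:

    # config.rb -- a sketch; assumes to_field definitions for 'id' and 'title' elsewhere
    settings do
      provide 'writer_class_name', 'Traject::CSVWriter'  # or 'Traject::DelimitedWriter' for tab-separated
      provide 'delimited_writer.fields', 'id,title'      # required; the writer raises ArgumentError without it
      provide 'delimited_writer.header', 'true'          # emit a header row first
      provide 'output_file', 'out.csv'                   # assumed LineWriter-style output setting
    end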
data/test/indexer/read_write_test.rb

@@ -62,29 +62,9 @@ describe "Traject::Indexer#process" do
  assert writer_settings["memory_writer.closed"]
  end

- it "returns false if skipped records" do
- @indexer = Traject::Indexer.new(
- "solrj_writer.server_class_name" => "MockSolrServer",
- "solr.url" => "http://example.org",
- "writer_class_name" => "Traject::SolrJWriter"
- )
- @file = File.open(support_file_path "manufacturing_consent.marc")
-
-
- @indexer.to_field("id") do |record, accumulator|
- # intentionally make error
- accumulator.concat ["one_id", "two_id"]
- end
- return_value = @indexer.process(@file)
-
- assert ! return_value, "returns false on skipped record errors"
- end
-
  require 'traject/null_writer'
  it "calls after_processing after processing" do
  @indexer = Traject::Indexer.new(
- "solrj_writer.server_class_name" => "MockSolrServer",
- "solr.url" => "http://example.org",
  "writer_class_name" => "Traject::NullWriter"
  )
  @file = File.open(support_file_path "test_data.utf8.mrc")
@@ -106,8 +86,6 @@ describe "Traject::Indexer#process" do
  describe "demo_config.rb" do
  before do
  @indexer = Traject::Indexer.new(
- "solrj_writer.server_class_name" => "MockSolrServer",
- "solr.url" => "http://example.org",
  "writer_class_name" => "Traject::NullWriter"
  )
  end
data/test/indexer/settings_test.rb

@@ -124,5 +124,29 @@ describe "Traject::Indexer#settings" do
  assert_equal( {"a" => "a", "password" => "[hidden]", "some_password" => "[hidden]", "some.password" => "[hidden]"}, parsed)
  end
  end
+
+ describe "JRuby / MRI" do
+ before do
+ @indexer = Traject::Indexer.new
+ end
+
+ it "has the right indexer name" do
+ if defined? JRUBY_VERSION
+ assert_equal "Traject::Marc4JReader", @indexer.settings['reader_class_name']
+ else
+ assert_equal "Traject::MarcReader", @indexer.settings['reader_class_name']
+ end
+ end
+
+ # This next one has the added effect of making sure the correct class
+ # has actually been loaded -- otherwise the constant wouldn't be available
+ it "has the correct default indexer class based on platform" do
+ if defined? JRUBY_VERSION
+ assert_equal Traject::Marc4JReader, @indexer.reader_class
+ else
+ assert_equal Traject::MarcReader, @indexer.reader_class
+ end
+ end
+ end

  end
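
The platform default these tests pin down can of course be overridden explicitly; a one-line sketch:

    settings do
      provide 'reader_class_name', 'Traject::MarcReader'  # force the pure-ruby reader even under JRuby
    end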
data/test/solr_json_writer_test.rb

@@ -0,0 +1,248 @@
+ require 'test_helper'
+ require 'httpclient'
+ require 'traject/solr_json_writer'
+ require 'thread'
+ require 'json'
+ require 'stringio'
+ require 'logger'
+
+
+ # Some basic tests, using a mocked HTTPClient so we can see what it did --
+ # these tests do not run against a real solr server at present.
+ describe "Traject::SolrJsonWriter" do
+
+
+ #######
+ # A bunch of utilities to help testing
+ #######
+
+ class FakeHTTPClient
+ # Always reply with this status, normally 200, can
+ # be reset for testing error conditions.
+ attr_accessor :response_status
+ attr_accessor :allow_update_json_path
+
+ def initialize(*args)
+ @post_args = []
+ @get_args = []
+ @response_status = 200
+ @allow_update_json_path = true
+ @mutex = Monitor.new
+ end
+
+ def post(*args)
+ @mutex.synchronize do
+ @post_args << args
+ end
+
+ resp = HTTP::Message.new_response("")
+ resp.status = self.response_status
+
+ return resp
+ end
+
+ def get (*args)
+ @mutex.synchronize do
+ @get_args << args
+ end
+
+ resp = HTTP::Message.new_response("")
+ resp.status = self.response_status
+
+ if args.first.end_with?("/update/json") && ! self.allow_update_json_path
+ # Need to test auto-detection of /update/json being available
+ resp.status = 404
+ end
+
+ return resp
+ end
+
+ def post_args
+ @mutex.synchronize do
+ @post_args.dup
+ end
+ end
+
+ def get_args
+ @mutex.synchronize do
+ @get_args.dup
+ end
+ end
+
+ # Everything else, just return nil please
+ def method_missing(*args)
+ end
+ end
+
+
+ def context_with(hash)
+ Traject::Indexer::Context.new(:output_hash => hash)
+ end
+
+ def create_writer(settings = {})
+ settings = {
+ "solr.url" => "http://example.com/solr",
+ "solr_json_writer.http_client" => FakeHTTPClient.new
+ }.merge!(settings)
+ @fake_http_client = settings["solr_json_writer.http_client"]
+
+ writer = Traject::SolrJsonWriter.new(settings)
+
+ return writer
+ end
+
+ # strio = StringIO.new
+ # logger_to_strio(strio)
+ #
+ # Later check for strio.string for contents
+ def logger_to_strio(strio)
+ # Yell makes this hard, let's do it with an ordinary logger, think
+ # it's okay.
+ Logger.new(strio)
+ end
+
+ #########
+ # Actual tests
+ #########
+
+ before do
+ @writer = create_writer
+ end
+
+ it "defaults to 1 bg thread" do
+ assert_equal 1, @writer.thread_pool_size
+ end
+
+ it "adds a document" do
+ @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
+ @writer.close
+
+ post_args = @fake_http_client.post_args.first
+
+ refute_nil post_args
+
+ assert_equal "http://example.com/solr/update/json", post_args[0]
+
+ refute_nil post_args[1]
+ posted_json = JSON.parse(post_args[1])
+
+ assert_equal [{"id" => "one", "key" => ["value1", "value2"]}], posted_json
+ end
+
+ it "adds more than a batch in batches" do
+ (Traject::SolrJsonWriter::DEFAULT_BATCH_SIZE + 1).times do |i|
+ doc = {"id" => "doc_#{i}", "key" => "value"}
+ @writer.put context_with(doc)
+ end
+ @writer.close
+
+ post_args = @fake_http_client.post_args
+
+ assert_length 2, post_args, "Makes two posts to Solr for two batches"
+
+ assert_length Traject::SolrJsonWriter::DEFAULT_BATCH_SIZE, JSON.parse(post_args[0][1]), "first batch posted with batch size docs"
+ assert_length 1, JSON.parse(post_args[1][1]), "second batch posted with last remaining doc"
+ end
+
+ it "commits on close when set" do
+ @writer = create_writer("solr.url" => "http://example.com", "solr_writer.commit_on_close" => "true")
+ @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
+ @writer.close
+
+ last_solr_get = @fake_http_client.get_args.last
+
+ assert_equal "http://example.com/update/json", last_solr_get[0]
+ assert_equal( {"commit" => "true"}, last_solr_get[1] )
+ end
+
+ describe "skipped records" do
+ it "skips and reports under max_skipped" do
+ strio = StringIO.new
+ @writer = create_writer("solr_writer.max_skipped" => 10, "logger" => logger_to_strio(strio))
+ @fake_http_client.response_status = 500
+
+ 10.times do |i|
+ @writer.put context_with("id" => "doc_#{i}", "key" => "value")
+ end
+ @writer.close
+
+ assert_equal 10, @writer.skipped_record_count
+
+ logged = strio.string
+
+ 10.times do |i|
+ assert_match /ERROR.*Could not add record doc_#{i} at source file position : Solr error response: 500/, logged
+ end
+ end
+
+ it "raises when skipped more than max_skipped" do
+ @writer = create_writer("solr_writer.max_skipped" => 5)
+ @fake_http_client.response_status = 500
+
+ e = assert_raises(RuntimeError) do
+ 6.times do |i|
+ @writer.put context_with("id" => "doc_#{i}", "key" => "value")
+ end
+ @writer.close
+ end
+
+ assert_includes e.message, "Exceeded maximum number of skipped records"
+ end
+
+ it "raises on one skipped record when max_skipped is 0" do
+ @writer = create_writer("solr_writer.max_skipped" => 0)
+ @fake_http_client.response_status = 500
+
+ e = assert_raises(RuntimeError) do
+ @writer.put context_with("id" => "doc_1", "key" => "value")
+ @writer.close
+ end
+ end
+ end
+
+ describe "auto-discovers proper update path" do
+ it "finds /update/json" do
+ assert_equal "http://example.com/solr/update/json", @writer.determine_solr_update_url
+ end
+
+ it "resorts to plain /update" do
+ @fake_http_client = FakeHTTPClient.new
+ @fake_http_client.allow_update_json_path = false
+
+ @writer = create_writer("solr.url" => "http://example.com/solr",
+ "solr_json_writer.http_client" => @fake_http_client)
+
+ assert_equal "http://example.com/solr/update", @writer.determine_solr_update_url
+ end
+ end
+
+ describe "Record id from context" do
+ before do
+ @record = MARC::Reader.new(support_file_path('test_data.utf8.mrc')).first
+ @context = Traject::Indexer::Context.new
+ @writer = create_writer
+ @record_001 = " 00282214 " # from the mrc file
+ end
+
+ it "gets it from 001" do
+ @context.source_record = @record
+ assert_equal @record_001, @writer.record_id_from_context(@context)
+ end
+
+ it "gets it from the id" do
+ @context.output_hash['id'] = 'the_record_id'
+ assert_equal 'the_record_id', @writer.record_id_from_context(@context)
+ end
+
+ it "gets it from both 001 and id" do
+ @context.output_hash['id'] = 'the_record_id'
+ @context.source_record = @record
+ assert_equal [@record_001, 'the_record_id'].join('/'), @writer.record_id_from_context(@context)
+ end
+
+
+
+ end
+
+
+ end
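
For context, a sketch of how the writer under test would be configured in an actual traject run, using only settings exercised above (the Solr URL is hypothetical):

    settings do
      provide 'writer_class_name', 'Traject::SolrJsonWriter'
      provide 'solr.url', 'http://localhost:8983/solr/collection1'  # hypothetical core URL
      provide 'solr_writer.commit_on_close', 'true'  # issues .../update/json?commit=true on close
      provide 'solr_writer.max_skipped', 10          # raise after this many failed adds; 0 = fail on first
    end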