traject 1.1.0 → 2.0.0.rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +20 -0
- data/README.md +85 -73
- data/doc/batch_execution.md +2 -6
- data/doc/other_commands.md +3 -5
- data/doc/settings.md +27 -38
- data/lib/traject/command_line.rb +1 -1
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +29 -11
- data/lib/traject/indexer/settings.rb +39 -13
- data/lib/traject/line_writer.rb +10 -6
- data/lib/traject/marc_reader.rb +2 -1
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +38 -48
- data/lib/traject/translation_map.rb +3 -0
- data/lib/traject/util.rb +13 -51
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/marc_geographic.yaml +2 -2
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/read_write_test.rb +0 -22
- data/test/indexer/settings_test.rb +24 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +5 -3
- data/test/test_support/demo_config.rb +0 -5
- data/test/translation_map_test.rb +9 -0
- data/traject.gemspec +18 -5
- metadata +77 -87
- data/lib/traject/marc4j_reader.rb +0 -153
- data/lib/traject/solrj_writer.rb +0 -351
- data/test/marc4j_reader_test.rb +0 -136
- data/test/solrj_writer_test.rb +0 -209
- data/vendor/solrj/README +0 -8
- data/vendor/solrj/build.xml +0 -39
- data/vendor/solrj/ivy.xml +0 -16
- data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
- data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
- data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
- data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
- data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
- data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
- data/vendor/solrj/lib/noggit-0.5.jar +0 -0
- data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
- data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
- data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
- data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
data/lib/traject/thread_pool.rb
CHANGED

```diff
@@ -1,28 +1,33 @@
+require 'concurrent'
+require 'thread' # for Queue
+
 module Traject
-  # An abstraction wrapping a
-  # and other apparatus.
+  # An abstraction wrapping a Concurrent::ThreadPool in some configuration choices
+  # and other apparatus. Concurrent::ThreadPool is a Java ThreadPool executor on
+  # jruby for performance, and is ruby-concurrent's own ruby implementation otherwise.
   #
   # 1) Initialize with chosen pool size -- we create fixed size pools, where
   # core and max sizes are the same.
   #
-  # 2) If initialized with nil for threadcount, no thread pool will actually
-  # be created, and
-  # the
-  #
-  #
+  # 2) If initialized with nil or 0 for threadcount, no thread pool will actually
+  # be created, and work sent to the Traject::ThreadPool will just be executed
+  # in the caller thread. We call this a nil threadpool. One situation it can be useful
+  # is if you are running under MRI, where multi-core parallelism isn't available, so
+  # an actual threadpool may not be useful. (Although in some cases a thread pool,
+  # especially one with size 1, can be useful in MRI for I/O blocking operations)
   #
   # 3) Use the #maybe_in_threadpool method to send blocks to thread pool for
-  # execution -- if
+  # execution -- if configurred with a nil threadcount, your block will just be
   # executed in calling thread. Be careful to not refer to any non-local
   # variables in the block, unless the variable has an object you can
   # use thread-safely!
   #
-  # 4)
-  # with a work queue that will buffer up to (pool_size*3) tasks. If queue is full,
-  # the
+  # 4) We configure our underlying Concurrent::ThreadPool
+  # with a work queue that will buffer up to (pool_size*3) tasks. If the queue is full,
+  # the underlying Concurrent::ThreadPool is set up to use the :caller_runs policy
   # meaning the block will end up executing in caller's own thread. With the kind
   # of work we're doing, where each unit of work is small and there are many of them--
-  # the
+  # the :caller_runs policy serves as an effective 'back pressure' mechanism to keep
   # the work queue from getting too large and exhausting memory, when producers are
   # faster than consumers.
   #
@@ -34,8 +39,8 @@ module Traject
   # #shutdown_and_wait, which will wait for all current queued work
   # to complete, then return. You can not give any more work to the pool
   # after you do this. By default it'll wait pretty much forever, which should
-  # be fine. If you never call shutdown,
-  #
+  # be fine. If you never call shutdown, then queued or in-progress work
+  # may be abandoned when the program ends, which would be bad.
   #
   # 7) We will keep track of total times a block is run in thread pool, and
   # total elapsed (wall) time of running all blocks, so an average_execution_ms
@@ -43,33 +48,27 @@ module Traject
   # threads are still executing, as it's not entirely thread safe (may get
   # an off by one as to total iterations)
   class ThreadPool
-    attr_reader :pool_size, :
+    attr_reader :pool_size, :queue_capacity
 
-    # First arg is pool size, 0 or nil and we'll be a null/no-op pool
+    # First arg is pool size, 0 or nil and we'll be a null/no-op pool which executes
+    # work in caller thread.
     def initialize(pool_size)
       unless pool_size.nil? || pool_size == 0
-
-
-        @label = label
-
-        @pool_size = pool_size.to_i # just for reflection, we don't really need it again
+        @pool_size = pool_size.to_i
         @queue_capacity = pool_size * 3
 
-
-
-
-
-
-
-        @thread_pool = java.util.concurrent.ThreadPoolExecutor.new(
-          @pool_size, @pool_size, 0, java.util.concurrent.TimeUnit::MILLISECONDS,
-          blockingQueue, rejectedExecutionHandler)
+        @thread_pool = Concurrent::ThreadPoolExecutor.new(
+          :min_threads => @pool_size,
+          :max_threads => @pool_size,
+          :max_queue => @queue_capacity,
+          :fallback_policy => :caller_runs
+        )
 
         # A thread-safe queue to collect exceptions cross-threads.
-        # We
-        #
-        #
-        @
+        # We really only need to save the first exception, but a queue
+        # is a convenient way to store a value concurrency-safely, and
+        # might as well store all of them.
+        @exceptions_caught_queue = Queue.new
       end
     end
 
@@ -106,7 +105,7 @@ module Traject
       start_t = Time.now
 
      if @thread_pool
-        @thread_pool.
+        @thread_pool.post do
          begin
            yield(*args)
          rescue Exception => e
@@ -119,21 +118,13 @@ module Traject
 
    end
 
-    # Just for monitoring/debugging purposes, we'll return the work queue
-    # used by the threadpool. Don't recommend you do anything with it, as
-    # the original java.util.concurrent docs make the same recommendation.
-    def queue
-      @thread_pool && @thread_pool.queue
-    end
 
    # thread-safe way of storing an exception, to raise
    # later in a different thread. We don't guarantee
    # that we can store more than one at a time, only
    # the first one recorded may be stored.
    def collect_exception(e)
-
-      # with us.
-      @async_exception_queue.offer(e)
+      @exceptions_caught_queue.push(e)
    end
 
    # If there's a stored collected exception, raise it
@@ -144,7 +135,8 @@ module Traject
    # as a non-functioning threadpool -- then this method is just
    # a no-op.
    def raise_collected_exception!
-      if @
+      if @exceptions_caught_queue && (! @exceptions_caught_queue.empty?)
+        e = @exceptions_caught_queue.pop
        raise e
      end
    end
@@ -159,9 +151,7 @@ module Traject
 
      if @thread_pool
        @thread_pool.shutdown
-
-        # a timeout. Okay, one day!
-        @thread_pool.awaitTermination(1, java.util.concurrent.TimeUnit::DAYS)
+        @thread_pool.wait_for_termination
      end
 
      return (Time.now - start_t)
```
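Taken together, the comments in this file describe the pool's whole lifecycle. A minimal usage sketch based on them -- the pool size and the `do_work` helper are hypothetical, while `maybe_in_threadpool`, `raise_collected_exception!`, and `shutdown_and_wait` are the methods documented above:

```ruby
require 'traject/thread_pool'

# Hypothetical size; nil or 0 would make a no-op pool that runs work inline.
pool = Traject::ThreadPool.new(4)

100.times do |i|
  # Pass data in as block arguments rather than closing over shared
  # variables, per the thread-safety warning in the comments above.
  pool.maybe_in_threadpool(i) do |n|
    do_work(n) # hypothetical unit of work
  end
end

pool.raise_collected_exception! # re-raise the first exception a worker caught, if any
pool.shutdown_and_wait          # block until all queued work has finished
```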
data/lib/traject/translation_map.rb
CHANGED

```diff
@@ -171,6 +171,9 @@ module Traject
     def initialize(defn, options = {})
       if defn.kind_of? Hash
         @hash = defn
+      elsif defn.kind_of? self.class
+        @hash = defn.to_hash
+        @default = defn.default
       else
         @hash = self.class.cache.lookup(defn)
         raise NotFound.new(defn) if @hash.nil?
```
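The three lines added above let `TranslationMap#initialize` accept another `TranslationMap`, copying its hash and default. A small sketch of what that enables, assuming the `marc_geographic` map bundled with the gem (its entries appear later in this diff):

```ruby
require 'traject/translation_map'

original = Traject::TranslationMap.new("marc_geographic")
copy     = Traject::TranslationMap.new(original) # new in 2.0: build a map from another map
copy["a"] # => "Asia"
```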
data/lib/traject/util.rb
CHANGED

```diff
@@ -27,63 +27,25 @@ module Traject
   end
 
 
-
-  #
-  #
-  # Have to pass in a settings arg, so we can check it for specified jar dir.
-  #
-  # Tries not to do the dirglob and require if solrj has already been loaded.
-  # Will define global constants with classes HttpSolrServer and SolrInputDocument
-  # if not already defined.
+
+  # Ruby stdlib queue lacks a 'drain' function, we write one.
   #
-  #
-  #
-  #
-
-
-  def self.require_solrj_jars(settings)
-    jruby_ensure_init!
+  # Removes everything currently in the ruby stdlib queue, and returns
+  # it an array. Should be concurrent-safe, but queue may still have
+  # some things in it after drain, if there are concurrent writers.
+  def self.drain_queue(queue)
+    result = []
 
-
+    queue_size = queue.size
    begin
-
-
-      org.apache.solr
-      org.apache.solr.client.solrj
-
-      # java_import which we'd normally use weirdly doesn't work
-      # from a class method. https://github.com/jruby/jruby/issues/975
-      Object.const_set("HttpSolrServer", org.apache.solr.client.solrj.impl.HttpSolrServer) unless defined? ::HttpSolrServer
-      Object.const_set("SolrInputDocument", org.apache.solr.common.SolrInputDocument) unless defined? ::SolrInputDocument
-    rescue NameError => e
-      included_jar_dir = File.expand_path("../../vendor/solrj/lib", File.dirname(__FILE__))
-
-      jardir = settings["solrj.jar_dir"] || included_jar_dir
-      Dir.glob("#{jardir}/*.jar") do |x|
-        require x
-      end
-      if tries > 1
-        raise LoadError.new("Can not find SolrJ java classes")
-      else
-        retry
+      queue_size.times do
+        result << queue.deq(:raise_if_empty)
      end
+    rescue ThreadError
+      # Need do nothing, queue was concurrently popped, no biggie
    end
-  end
 
-
-  # aren't jruby, and raises a better error message.
-  # Pass in a developer-presentable name of a feature to include in the error
-  # message if you want.
-  def self.jruby_ensure_init!(feature = nil)
-    begin
-      require 'java'
-    rescue LoadError => e
-      feature ||= "A traject feature is in use that"
-      msg = if feature
-        "#{feature} requires jruby, but you do not appear to be running under jruby. We recommend `chruby` for managing multiple ruby installs."
-      end
-      raise LoadError.new(msg)
-    end
+    return result
  end
 
 end
```
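With the SolrJ jar loading removed, the one helper this file gains is `drain_queue`. A quick sketch of its contract as documented in the comments above, using only the stdlib `Queue` (assuming the method is reachable as `Traject::Util.drain_queue`):

```ruby
require 'thread'
require 'traject/util'

queue = Queue.new
5.times { |i| queue << i }

drained = Traject::Util.drain_queue(queue)
drained    # => [0, 1, 2, 3, 4]
queue.size # => 0, unless concurrent writers pushed more in the meantime
```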
data/lib/translation_maps/marc_geographic.yaml
CHANGED

```diff
@@ -1,5 +1,5 @@
 # Translation map for marc geographic codes constructed by `rake load_maps:marc_geographic` task
-# Scraped from http://www.loc.gov/marc/geoareas/gacs_code.html at
+# Scraped from http://www.loc.gov/marc/geoareas/gacs_code.html at 2015-01-27 23:00:08 -0500
 # Intentionally includes discontinued codes.
 
 'a': 'Asia'
@@ -327,7 +327,7 @@
 'lnaz': 'Azores'
 'lnbm': 'Bermuda Islands'
 'lnca': 'Canary Islands'
-'lncv': '
+'lncv': 'Cabo Verde'
 'lnfa': 'Faroe Islands'
 'lnjn': 'Jan Mayen Island'
 'lnma': 'Madeira Islands'
```
data/test/delimited_writer_test.rb
ADDED

```diff
@@ -0,0 +1,104 @@
+# Encoding: UTF-8
+
+require 'test_helper'
+require 'stringio'
+require 'traject/delimited_writer'
+require 'traject/csv_writer'
+
+require 'csv'
+
+describe "Delimited/CSV Writers" do
+
+  before do
+    @out = StringIO.new
+    @settings = {'output_stream' => @out, 'delimited_writer.fields' => 'four,one,two'}
+    @context = Struct.new(:output_hash).new
+    @context.output_hash = {'one' => 'one', 'two' => %w[two1 two2], 'three' => 'three', 'four' => 'four'}
+  end
+
+  after do
+    @out.close
+  end
+
+  describe "Traject::DelimitedWriter" do
+
+    it "creates a dw with defaults" do
+      dw = Traject::DelimitedWriter.new(@settings)
+      dw.delimiter.must_equal "\t"
+      dw.internal_delimiter.must_equal '|'
+      dw.edelim.must_equal ' '
+      dw.eidelim.must_equal '\\|'
+    end
+
+    it "respects different delimiter" do
+      @settings['delimited_writer.delimiter'] = '^'
+      dw = Traject::DelimitedWriter.new(@settings)
+      dw.delimiter.must_equal '^'
+      dw.edelim.must_equal '\\^'
+      dw.internal_delimiter.must_equal '|'
+    end
+
+    it "outputs a header if asked to" do
+      dw = Traject::DelimitedWriter.new(@settings)
+      @out.string.chomp.must_equal %w[four one two].join("\t")
+    end
+
+    it "doesn't output a header if asked not to" do
+      @settings['delimited_writer.header'] = 'false'
+      dw = Traject::DelimitedWriter.new(@settings)
+      @out.string.must_be_empty
+    end
+
+    it "deals with multiple values" do
+      dw = Traject::DelimitedWriter.new(@settings)
+      dw.put @context
+      @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(dw.delimiter)
+    end
+
+    it "bails if delimited_writer.fields isn't set" do
+      @settings.delete 'delimited_writer.fields'
+      proc { Traject::DelimitedWriter.new(@settings) }.must_raise(ArgumentError)
+    end
+
+  end
+
+  describe "Traject::CSVWriter" do
+    it "unsets the delimiter" do
+      cw = Traject::CSVWriter.new(@settings)
+      cw.delimiter.must_be_nil
+    end
+
+    it "writes the header" do
+      cw = Traject::CSVWriter.new(@settings)
+      @out.string.chomp.must_equal 'four,one,two'
+    end
+
+    it "uses the internal delimiter" do
+      cw = Traject::CSVWriter.new(@settings)
+      cw.put @context
+      @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(',')
+    end
+
+    it "produces complex output" do
+      @context.output_hash = {
+        'four' => ['Bill Clinton, Jr.', 'Jesse "the Body" Ventura'],
+        'one' => 'Willard "Mitt" Romney',
+        'two' => 'Dueber, Bill'
+      }
+      canonical = StringIO.new
+      csv = CSV.new(canonical)
+
+      csv_vals = [@context.output_hash['four'].join('|'), @context.output_hash['one'], @context.output_hash['two']]
+      csv << csv_vals
+      csv_output = canonical.string.chomp
+
+      cw = Traject::CSVWriter.new(@settings)
+      cw.put @context
+      traject_csvwriter_output = @out.string.split("\n").last.chomp
+
+      assert_equal(csv_output, traject_csvwriter_output)
+
+    end
+
+  end
+end
```
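These tests double as documentation for the new writer's settings keys. A hedged sketch of driving `Traject::DelimitedWriter` directly with the same keys the tests exercise; the field names and output hash here are invented:

```ruby
require 'stringio'
require 'traject/delimited_writer'

out    = StringIO.new
writer = Traject::DelimitedWriter.new(
  'output_stream'           => out,
  'delimited_writer.fields' => 'id,title' # required; omitting it raises ArgumentError per the tests
)

# A context only needs to respond to #output_hash for this sketch.
context = Struct.new(:output_hash).new('id' => '123', 'title' => %w[one two])
writer.put(context)

out.string # tab-separated header row, then "123\tone|two" -- multiple values joined by '|'
```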
data/test/indexer/read_write_test.rb
CHANGED

```diff
@@ -62,29 +62,9 @@ describe "Traject::Indexer#process" do
     assert writer_settings["memory_writer.closed"]
   end
 
-  it "returns false if skipped records" do
-    @indexer = Traject::Indexer.new(
-      "solrj_writer.server_class_name" => "MockSolrServer",
-      "solr.url" => "http://example.org",
-      "writer_class_name" => "Traject::SolrJWriter"
-    )
-    @file = File.open(support_file_path "manufacturing_consent.marc")
-
-
-    @indexer.to_field("id") do |record, accumulator|
-      # intentionally make error
-      accumulator.concat ["one_id", "two_id"]
-    end
-    return_value = @indexer.process(@file)
-
-    assert ! return_value, "returns false on skipped record errors"
-  end
-
   require 'traject/null_writer'
   it "calls after_processing after processing" do
     @indexer = Traject::Indexer.new(
-      "solrj_writer.server_class_name" => "MockSolrServer",
-      "solr.url" => "http://example.org",
       "writer_class_name" => "Traject::NullWriter"
     )
     @file = File.open(support_file_path "test_data.utf8.mrc")
@@ -106,8 +86,6 @@ describe "Traject::Indexer#process" do
   describe "demo_config.rb" do
     before do
       @indexer = Traject::Indexer.new(
-        "solrj_writer.server_class_name" => "MockSolrServer",
-        "solr.url" => "http://example.org",
         "writer_class_name" => "Traject::NullWriter"
       )
     end
```

data/test/indexer/settings_test.rb
CHANGED

```diff
@@ -124,5 +124,29 @@ describe "Traject::Indexer#settings" do
       assert_equal( {"a" => "a", "password" => "[hidden]", "some_password" => "[hidden]", "some.password" => "[hidden]"}, parsed)
     end
   end
+
+  describe "JRuby / MRI" do
+    before do
+      @indexer = Traject::Indexer.new
+    end
+
+    it "has the right indexer name" do
+      if defined? JRUBY_VERSION
+        assert_equal "Traject::Marc4JReader", @indexer.settings['reader_class_name']
+      else
+        assert_equal "Traject::MarcReader", @indexer.settings['reader_class_name']
+      end
+    end
+
+    # This next one has the added effect of making sure the correct class
+    # has actually been loaded -- otherwise the constant wouldn't be available
+    it "has the correct default indexer class based on platform" do
+      if defined? JRUBY_VERSION
+        assert_equal Traject::Marc4JReader, @indexer.reader_class
+      else
+        assert_equal Traject::MarcReader, @indexer.reader_class
+      end
+    end
+  end
 
 end
```
data/test/solr_json_writer_test.rb
ADDED

```diff
@@ -0,0 +1,248 @@
+require 'test_helper'
+require 'httpclient'
+require 'traject/solr_json_writer'
+require 'thread'
+require 'json'
+require 'stringio'
+require 'logger'
+
+
+# Some basic tests, using a mocked HTTPClient so we can see what it did --
+# these tests do not run against a real solr server at present.
+describe "Traject::SolrJsonWriter" do
+
+
+  #######
+  # A bunch of utilities to help testing
+  #######
+
+  class FakeHTTPClient
+    # Always reply with this status, normally 200, can
+    # be reset for testing error conditions.
+    attr_accessor :response_status
+    attr_accessor :allow_update_json_path
+
+    def initialize(*args)
+      @post_args = []
+      @get_args = []
+      @response_status = 200
+      @allow_update_json_path = true
+      @mutex = Monitor.new
+    end
+
+    def post(*args)
+      @mutex.synchronize do
+        @post_args << args
+      end
+
+      resp = HTTP::Message.new_response("")
+      resp.status = self.response_status
+
+      return resp
+    end
+
+    def get (*args)
+      @mutex.synchronize do
+        @get_args << args
+      end
+
+      resp = HTTP::Message.new_response("")
+      resp.status = self.response_status
+
+      if args.first.end_with?("/update/json") && ! self.allow_update_json_path
+        # Need to test auto-detection of /update/json being available
+        resp.status = 404
+      end
+
+      return resp
+    end
+
+    def post_args
+      @mutex.synchronize do
+        @post_args.dup
+      end
+    end
+
+    def get_args
+      @mutex.synchronize do
+        @get_args.dup
+      end
+    end
+
+    # Everything else, just return nil please
+    def method_missing(*args)
+    end
+  end
+
+
+  def context_with(hash)
+    Traject::Indexer::Context.new(:output_hash => hash)
+  end
+
+  def create_writer(settings = {})
+    settings = {
+      "solr.url" => "http://example.com/solr",
+      "solr_json_writer.http_client" => FakeHTTPClient.new
+    }.merge!(settings)
+    @fake_http_client = settings["solr_json_writer.http_client"]
+
+    writer = Traject::SolrJsonWriter.new(settings)
+
+    return writer
+  end
+
+  # strio = StringIO.new
+  # logger_to_strio(strio)
+  #
+  # Later check for strio.string for contents
+  def logger_to_strio(strio)
+    # Yell makes this hard, let's do it with an ordinary logger, think
+    # it's okay.
+    Logger.new(strio)
+  end
+
+  #########
+  # Actual tests
+  #########
+
+  before do
+    @writer = create_writer
+  end
+
+  it "defaults to 1 bg thread" do
+    assert_equal 1, @writer.thread_pool_size
+  end
+
+  it "adds a document" do
+    @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
+    @writer.close
+
+    post_args = @fake_http_client.post_args.first
+
+    refute_nil post_args
+
+    assert_equal "http://example.com/solr/update/json", post_args[0]
+
+    refute_nil post_args[1]
+    posted_json = JSON.parse(post_args[1])
+
+    assert_equal [{"id" => "one", "key" => ["value1", "value2"]}], posted_json
+  end
+
+  it "adds more than a batch in batches" do
+    (Traject::SolrJsonWriter::DEFAULT_BATCH_SIZE + 1).times do |i|
+      doc = {"id" => "doc_#{i}", "key" => "value"}
+      @writer.put context_with(doc)
+    end
+    @writer.close
+
+    post_args = @fake_http_client.post_args
+
+    assert_length 2, post_args, "Makes two posts to Solr for two batches"
+
+    assert_length Traject::SolrJsonWriter::DEFAULT_BATCH_SIZE, JSON.parse(post_args[0][1]), "first batch posted with batch size docs"
+    assert_length 1, JSON.parse(post_args[1][1]), "second batch posted with last remaining doc"
+  end
+
+  it "commits on close when set" do
+    @writer = create_writer("solr.url" => "http://example.com", "solr_writer.commit_on_close" => "true")
+    @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
+    @writer.close
+
+    last_solr_get = @fake_http_client.get_args.last
+
+    assert_equal "http://example.com/update/json", last_solr_get[0]
+    assert_equal( {"commit" => "true"}, last_solr_get[1] )
+  end
+
+  describe "skipped records" do
+    it "skips and reports under max_skipped" do
+      strio = StringIO.new
+      @writer = create_writer("solr_writer.max_skipped" => 10, "logger" => logger_to_strio(strio))
+      @fake_http_client.response_status = 500
+
+      10.times do |i|
+        @writer.put context_with("id" => "doc_#{i}", "key" => "value")
+      end
+      @writer.close
+
+      assert_equal 10, @writer.skipped_record_count
+
+      logged = strio.string
+
+      10.times do |i|
+        assert_match /ERROR.*Could not add record doc_#{i} at source file position : Solr error response: 500/, logged
+      end
+    end
+
+    it "raises when skipped more than max_skipped" do
+      @writer = create_writer("solr_writer.max_skipped" => 5)
+      @fake_http_client.response_status = 500
+
+      e = assert_raises(RuntimeError) do
+        6.times do |i|
+          @writer.put context_with("id" => "doc_#{i}", "key" => "value")
+        end
+        @writer.close
+      end
+
+      assert_includes e.message, "Exceeded maximum number of skipped records"
+    end
+
+    it "raises on one skipped record when max_skipped is 0" do
+      @writer = create_writer("solr_writer.max_skipped" => 0)
+      @fake_http_client.response_status = 500
+
+      e = assert_raises(RuntimeError) do
+        @writer.put context_with("id" => "doc_1", "key" => "value")
+        @writer.close
+      end
+    end
+  end
+
+  describe "auto-discovers proper update path" do
+    it "finds /update/json" do
+      assert_equal "http://example.com/solr/update/json", @writer.determine_solr_update_url
+    end
+
+    it "resorts to plain /update" do
+      @fake_http_client = FakeHTTPClient.new
+      @fake_http_client.allow_update_json_path = false
+
+      @writer = create_writer("solr.url" => "http://example.com/solr",
+                              "solr_json_writer.http_client" => @fake_http_client)
+
+      assert_equal "http://example.com/solr/update", @writer.determine_solr_update_url
+    end
+  end
+
+  describe "Record id from context" do
+    before do
+      @record = MARC::Reader.new(support_file_path('test_data.utf8.mrc')).first
+      @context = Traject::Indexer::Context.new
+      @writer = create_writer
+      @record_001 = " 00282214 " # from the mrc file
+    end
+
+    it "gets it from 001" do
+      @context.source_record = @record
+      assert_equal @record_001, @writer.record_id_from_context(@context)
+    end
+
+    it "gets it from the id" do
+      @context.output_hash['id'] = 'the_record_id'
+      assert_equal 'the_record_id', @writer.record_id_from_context(@context)
+    end
+
+    it "gets it from both 001 and id" do
+      @context.output_hash['id'] = 'the_record_id'
+      @context.source_record = @record
+      assert_equal [@record_001, 'the_record_id'].join('/'), @writer.record_id_from_context(@context)
+    end
+
+
+
+  end
+
+
+end
```