traject 0.0.2 → 0.9.1
This diff shows the content of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- data/Gemfile +4 -0
- data/README.md +85 -61
- data/Rakefile +5 -0
- data/bin/traject +31 -3
- data/doc/settings.md +74 -13
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject/indexer/settings.rb +75 -0
- data/lib/traject/indexer.rb +255 -45
- data/lib/traject/json_writer.rb +4 -2
- data/lib/traject/macros/marc21.rb +18 -6
- data/lib/traject/macros/marc21_semantics.rb +405 -0
- data/lib/traject/macros/marc_format_classifier.rb +180 -0
- data/lib/traject/marc4j_reader.rb +160 -0
- data/lib/traject/marc_extractor.rb +33 -17
- data/lib/traject/marc_reader.rb +14 -11
- data/lib/traject/solrj_writer.rb +247 -9
- data/lib/traject/thread_pool.rb +154 -0
- data/lib/traject/translation_map.rb +46 -4
- data/lib/traject/util.rb +30 -0
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/indexer/each_record_test.rb +34 -0
- data/test/indexer/macros_marc21_semantics_test.rb +206 -0
- data/test/indexer/macros_marc21_test.rb +10 -1
- data/test/indexer/map_record_test.rb +78 -8
- data/test/indexer/read_write_test.rb +43 -10
- data/test/indexer/settings_test.rb +60 -4
- data/test/indexer/to_field_test.rb +39 -0
- data/test/marc4j_reader_test.rb +75 -0
- data/test/marc_extractor_test.rb +62 -0
- data/test/marc_format_classifier_test.rb +91 -0
- data/test/marc_reader_test.rb +12 -0
- data/test/solrj_writer_test.rb +146 -43
- data/test/test_helper.rb +50 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +153 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +8 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/traject.gemspec +1 -1
- data/vendor/marc4j/README.md +17 -0
- data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
- metadata +81 -2
data/lib/traject/marc4j_reader.rb ADDED
@@ -0,0 +1,160 @@
+require 'traject'
+require 'marc'
+
+# Uses Marc4J to read the marc records, but then translates them to
+# ruby-marc before delivering them still, Marc4J is just inside the black
+# box.
+#
+# But one way to get ability to transcode from Marc8. Records it delivers
+# are ALWAYS in UTF8, will be transcoded if needed.
+#
+# Also hope it gives us some performance benefit.
+#
+# Uses the Marc4J MarcPermissiveStreamReader for binary, but sometimes
+# in non-permissive mode, according to settings. Uses the Marc4j MarcXmlReader
+# for xml.
+#
+# NOTE: If you aren't reading in binary records encoded in MARC8, you may
+# find the pure-ruby Traject::MarcReader faster; the extra step to read
+# Marc4J but translate to ruby MARC::Record adds some overhead.
+#
+# Settings:
+#
+# * marc_source.type: serialization type. default 'binary', also 'xml' (TODO: json/marc-in-json)
+#
+# * marc4j_reader.permissive: default true, false to turn off permissive reading. Used as
+#   value to 'permissive' arg of MarcPermissiveStreamReader constructor.
+#   Only used for 'binary'
+#
+# * marc4j_reader.source_encoding: Only used for 'binary', otherwise always UTF-8.
+#   String of the values MarcPermissiveStreamReader accepts:
+#   * BESTGUESS (tries to use MARC leader and believe it, I think)
+#   * ISO8859_1
+#   * UTF-8
+#   * MARC8
+#   Default 'BESTGUESS', but marc records in the wild are so wrong here, recommend setting.
+#   (will ALWAYS be transcoded to UTF-8 on the way out. We insist.)
+#
+# * marc4j_reader.jar_dir: Path to a directory containing Marc4J jar file to use. All .jar's in dir will
+#   be loaded. If unset, uses marc4j.jar bundled with traject.
+class Traject::Marc4JReader
+  include Enumerable
+
+  attr_reader :settings, :input_stream
+
+  def initialize(input_stream, settings)
+    @settings = Traject::Indexer::Settings.new settings
+    @input_stream = input_stream
+
+    ensure_marc4j_loaded!
+  end
+
+  # Loads Marc4J if not already loaded, by loading all jars found
+  # in settings["marc4j_reader.jar_dir"]
+  def ensure_marc4j_loaded!
+    unless defined?(MarcPermissiveStreamReader)
+      require 'java'
+
+      tries = 0
+      begin
+        tries += 1
+        java_import org.marc4j.MarcPermissiveStreamReader
+        java_import org.marc4j.MarcXmlReader
+      rescue NameError => e
+        # /Users/jrochkind/code/solrj-gem/lib"
+
+        include_jar_dir = File.expand_path("../../vendor/marc4j/lib", File.dirname(__FILE__))
+
+        jardir = settings["marc4j_reader.jar_dir"] || include_jar_dir
+        Dir.glob("#{jardir}/*.jar") do |x|
+          require x
+        end
+
+        if tries > 1
+          raise LoadError.new("Can not find Marc4J java classes")
+        else
+          retry
+        end
+      end
+    end
+  end
+
+  def internal_reader
+    @internal_reader ||= create_marc_reader!
+  end
+
+  def input_type
+    # maybe later add some guessing somehow
+    settings["marc_source.type"]
+  end
+
+  def create_marc_reader!
+    case input_type
+    when "binary"
+      permissive = settings["marc4j_reader.permissive"].to_s == "true"
+
+      # #to_inputstream turns our ruby IO into a Java InputStream
+      # third arg means 'convert to UTF-8, yes'
+      MarcPermissiveStreamReader.new(input_stream.to_inputstream, permissive, true, settings["marc4j_reader.source_encoding"])
+    when "xml"
+      MarcXmlReader.new(input_stream.to_inputstream)
+    else
+      raise ArgumentError.new("Unrecognized marc_source.type: #{input_type}")
+    end
+  end
+
+  def each
+    while (internal_reader.hasNext)
+      begin
+        marc4j = internal_reader.next
+        rubymarc = convert_marc4j_to_rubymarc(marc4j)
+      rescue Exception => e
+        msg = "MARC4JReader: Error reading MARC, fatal, re-raising"
+        if marc4j
+          msg += "\n    001 id: #{marc4j.getControlNumber}"
+        end
+        msg += "\n    #{Traject::Util.exception_to_log_message(e)}"
+        logger.fatal msg
+        raise e
+      end
+
+      yield rubymarc
+    end
+  end
+
+  def logger
+    @logger ||= (settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger
+  end
+
+  def convert_marc4j_to_rubymarc(marc4j)
+    rmarc = MARC::Record.new
+    rmarc.leader = marc4j.getLeader.marshal
+
+    marc4j.getControlFields.each do |marc4j_control|
+      rmarc.append( MARC::ControlField.new(marc4j_control.getTag(), marc4j_control.getData ) )
+    end
+
+    marc4j.getDataFields.each do |marc4j_data|
+      rdata = MARC::DataField.new( marc4j_data.getTag, marc4j_data.getIndicator1.chr, marc4j_data.getIndicator2.chr )
+
+      marc4j_data.getSubfields.each do |subfield|
+
+        # We assume Marc21; skip corrupted data:
+        # if subfield.getCode is more than 255, subsequent .chr
+        # would raise.
+        if subfield.getCode > 255
+          logger.warn("Marc4JReader: Corrupted MARC data, record id #{marc4j.getControlNumber}, field #{marc4j_data.tag}, corrupt subfield code byte #{subfield.getCode}. Skipping subfield, but continuing with record.")
+          next
+        end
+
+        rsubfield = MARC::Subfield.new(subfield.getCode.chr, subfield.getData)
+        rdata.append rsubfield
+      end
+
+      rmarc.append rdata
+    end
+
+    return rmarc
+  end
+
+end
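The new reader's whole surface is visible above: an `(input_stream, settings)` constructor and `#each` yielding ruby `MARC::Record`s. A minimal usage sketch, grounded only in what this diff shows; the input file path is a hypothetical placeholder, and it runs only under JRuby since Marc4J is a Java library:

```ruby
# Sketch only: "records.mrc" is a hypothetical input file; requires JRuby
# so the bundled marc4j jar can be loaded.
require 'traject'
require 'traject/marc4j_reader'

settings = {
  "marc_source.type"              => "binary",
  "marc4j_reader.permissive"      => "true",
  # Per the header comment, don't trust BESTGUESS on records in the wild:
  "marc4j_reader.source_encoding" => "MARC8"
}

reader = Traject::Marc4JReader.new(File.open("records.mrc", "rb"), settings)
reader.each do |record|
  # Records arrive as ruby MARC::Record, already transcoded to UTF-8.
  puts record['001'] && record['001'].value
end
```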
data/lib/traject/marc_extractor.rb CHANGED
@@ -26,11 +26,7 @@ module Traject
   # Third arg is an optional options hash that will be passed as
   # third arg of MarcExtractor constructor.
   def self.extract_by_spec(marc_record, specification, options = {})
-    (raise
-
-    unless specification.kind_of? Hash
-      specification = self.parse_string_spec(specification)
-    end
+    (raise ArgumentError, "first argument must not be nil") if marc_record.nil?
 
     Traject::MarcExtractor.new(marc_record, specification, options).extract
   end
@@ -38,6 +34,10 @@ module Traject
   # Take a hash that's the output of #parse_string_spec, return
   # an array of strings extracted from a marc record accordingly
   #
+  # Second arg can either be a string specification that will be passed
+  # to MarcExtractor.parse_string_spec, or a Hash that's
+  # already been created by it.
+  #
   # options:
   #
   # [:seperator] default ' ' (space), what to use to seperate
@@ -47,16 +47,15 @@ module Traject
   #   that match spec. Also:
   #   * false => do not include.
   #   * :only => only include linked 880s, not original
-  def initialize(marc_record,
+  def initialize(marc_record, spec, options = {})
    self.options = {
      :seperator => ' ',
      :alternate_script => :include
    }.merge(options)
 
-    raise IllegalArgumentException("second arg to MarcExtractor.new must be a Hash specification object") unless spec_hash.kind_of? Hash
-
    self.marc_record = marc_record
-
+
+    self.spec_hash = spec.kind_of?(Hash) ? spec : self.class.parse_string_spec(spec)
   end
 
   # Converts from a string marc spec like "245abc:700a" to a nested hash used internally
@@ -129,7 +128,7 @@ module Traject
   end
 
 
-  # Returns array of strings, extracted values
+  # Returns array of strings, extracted values. Maybe empty array.
   def extract
     results = []
 
@@ -145,26 +144,46 @@ module Traject
   end
 
   # Yields a block for every line in source record that matches
-  # spec. First arg to block is MARC::
+  # spec. First arg to block is MARC::DataField or ControlField, second
   # is the hash specification that it matched on. May take account
   # of options such as :alternate_script
+  #
+  # Third (optional) arg to block is self, the MarcExtractor object, useful for custom
+  # implementations.
   def each_matching_line
     self.marc_record.each do |field|
      if (spec = spec_covering_field(field)) && matches_indicators(field, spec)
-        yield(field, spec)
+        yield(field, spec, self)
      end
    end
   end
 
+  # like each_matching_line, takes a block to process each matching line,
+  # but collects results of block into an array -- flattens any subarrays for you!
+  #
+  # Useful for re-use of this class for custom processing
+  def collect_matching_lines
+    results = []
+    self.each_matching_line do |field, spec, extractor|
+      results.concat [yield(field, spec, extractor)].flatten
+    end
+    return results
+  end
+
+
   # Pass in a marc data field and a hash spec, returns
   # an ARRAY of one or more strings, subfields extracted
   # and processed per spec. Takes account of options such
   # as :seperator
+  #
+  # Always returns array, sometimes empty array.
  def collect_subfields(field, spec)
    subfields = field.subfields.collect do |subfield|
      subfield.value if spec[:subfields].nil? || spec[:subfields].include?(subfield.code)
    end.compact
 
+    return subfields if subfields.empty? # empty array, just return it.
+
    return options[:seperator] ? [ subfields.join( options[:seperator]) ] : subfields
  end
 
@@ -175,13 +194,10 @@ module Traject
   # otherwise will always return nil for 880s, you have to handle :alternate_script :include
   # elsewhere, to add in the 880 in the right order
   def spec_covering_field(field)
-
-    #binding.pry if field.tag == "880"
-
-    if field.tag == "880" && options[:alternate_script] != false
+    if field.tag == "880" && field['6'] && options[:alternate_script] != false
      # pull out the spec for corresponding original marc tag this 880 corresponds to
      # Due to bug in jruby https://github.com/jruby/jruby/issues/886 , we need
-      # to do this weird encode gymnastics, which fixes it for mysterious reasons.
+      # to do this weird encode gymnastics, which fixes it for mysterious reasons.
      orig_field = field["6"].encode(field["6"].encoding).byteslice(0,3)
      field["6"] && self.spec_hash[ orig_field ]
    elsif options[:alternate_script] != :only
data/lib/traject/marc_reader.rb CHANGED
@@ -1,31 +1,34 @@
 require 'marc'
 
 # A Reader class that can be used with Traject::Indexer.reader, to read
-# MARC records.
+# MARC records.
 #
-# Includes Enumerable for convenience.
+# Includes Enumerable for convenience.
 #
 # Reads in Marc records using ruby marc. Depends on config variables to
 # determine what serialization type to expect, and other parameters controlling
-# de-serialization.
+# de-serialization.
+#
+# NOTE: MarcReader can not handle Marc8 encoding. If you need to read binary
+# records in MARC8, use Traject::Marc4JReader instead.
 #
 # Settings:
 # ["marc_source.type"]  serialization type. default 'binary'
-#   * "binary". Actual marc.
+#   * "binary". Actual marc.
 #   * "xml", MarcXML
 #   * "json". (NOT YET IMPLEMENTED) The "marc-in-json" format, encoded as newline-seperated
 #     json. A simplistic newline-seperated json, with no comments
 #     allowed, and no unescpaed internal newlines allowed in the json
 #     objects -- we just read line by line, and assume each line is a
 #     marc-in-json. http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/
-# ["
+# ["marc_reader.xml_parser"] For XML type, which XML parser to tell Marc::Reader
 #     to use. Anything recognized by Marc::Reader :parser
 #     argument. By default, asks Marc::Reader to take
 #     it's best guess as to highest performance available
-#     installed option.
+#     installed option.
 #
 #
-# Can NOT yet read Marc8, input is always assumed UTF8.
+# Can NOT yet read Marc8, input is always assumed UTF8.
 class Traject::MarcReader
   include Enumerable
 
@@ -34,18 +37,18 @@ class Traject::MarcReader
   @@best_xml_parser = MARC::XMLReader.best_available
 
   def initialize(input_stream, settings)
-    @settings = settings
+    @settings = Traject::Indexer::Settings.new settings
     @input_stream = input_stream
   end
 
   # Creates proper kind of ruby MARC reader, depending
   # on settings or guesses.
   def internal_reader
-    unless defined? @internal_reader
-      @internal_reader =
+    unless defined? @internal_reader
+      @internal_reader =
       case settings["marc_source.type"]
       when "xml"
-        parser = settings["
+        parser = settings["marc_reader.xml_parser"] || @@best_xml_parser
        MARC::XMLReader.new(self.input_stream, :parser=> parser)
      else
        MARC::Reader.new(self.input_stream)
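As the notes above say, this pure-ruby reader is usually the faster choice when input is already UTF-8. A hedged sketch of reading MarcXML with it; the file path is hypothetical and the parser value is assumed to be one of the options `MARC::XMLReader` recognizes:

```ruby
require 'traject'
require 'traject/marc_reader'

settings = {
  "marc_source.type"       => "xml",
  # Optional; if omitted, the MARC::XMLReader.best_available guess is used:
  "marc_reader.xml_parser" => "nokogiri"
}

reader = Traject::MarcReader.new(File.open("records.xml"), settings)
reader.each { |record| puts record.leader }
```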
data/lib/traject/solrj_writer.rb CHANGED
@@ -1,10 +1,40 @@
+# TODO: THREAD POOL
+#
+# 1) Exception handling in threads, what's the right thing to do
+# 2) General count of failed records in a thread safe way, so we can report
+#    it back from 'close', so process can report it back, and non-zero exit
+#    code can be emitted from command-line.
+# 3) back pressure on thread pool. give it a bounded blocking queue instead,
+#    to make sure thousands of add tasks don't build up, waiting until the end.
+#    or does that even matter? So what if they build up in the queue and only
+#    get taken care of at the end, is that okay? I do emit a warning right now
+#    if it takes more than 60 seconds to process remaining thread pool task queue
+#    at end.
+# 4) No tests yet that actually test thread pool stuff; additionally, may make
+#    some of the batch tests fail in non-deterministic ways, since batch tests
+#    assume order of add (and our Mock solr server is not thread safe yet!)
+
+require 'yell'
+
 require 'traject'
+require 'traject/util'
 require 'traject/qualified_const_get'
+require 'traject/thread_pool'
+
+require 'uri'
+require 'thread' # for Mutex
 
 #
 # Writes to a Solr using SolrJ, and the SolrJ HttpSolrServer.
 # (sub-class later for the ConcurrentUpdate server?)
 #
+# After you call #close, you can check #skipped_record_count if you want
+# for an integer count of skipped records.
+#
+# For fatal errors that raise... async processing with thread_pool means that
+# you may not get a raise immediately after calling #put, you may get it on
+# a FUTURE #put or #close. You should get it eventually though.
+#
 # settings:
 #   [solr.url] Your solr url (required)
 #   [solrj_writer.server_class_name]  Defaults to "HttpSolrServer". You can specify
@@ -27,18 +57,57 @@ require 'traject/qualified_const_get'
 #                                     "XMLResponseParser"
 #   [solrj_writer.commit_on_close]  If true (or string 'true'), send a commit to solr
 #                                   at end of #process.
+#   [solrj_writer.batch_size]       If non-nil and more than 1, send documents to
+#                                   solr in batches of solrj_writer.batch_size. If nil/1,
+#                                   however, an http transaction with solr will be done
+#                                   per doc. DEFAULT to 100, which seems to be a sweet spot.
+#   [solrj_writer.thread_pool]      Defaults to 4. A thread pool is used for submitting docs
+#                                   to solr. Set to 0 or nil to disable threading. Set to 1,
+#                                   there will still be a single bg thread doing the adds.
+#                                   May make sense to set higher than number of cores on your
+#                                   indexing machine, as these threads will mostly be waiting
+#                                   on Solr. Speed/capacity of your solr is more relevant.
 class Traject::SolrJWriter
+  # just a tuple of a SolrInputDocument
+  # and a Traject::Indexer::Context it came from
+  class UpdatePackage
+    attr_accessor :solr_document, :context
+    def initialize(doc, ctx)
+      self.solr_document = doc
+      self.context = ctx
+    end
+  end
+
   include Traject::QualifiedConstGet
 
   attr_reader :settings
 
+  attr_reader :batched_queue
+
   def initialize(argSettings)
-    @settings = argSettings
+    @settings = Traject::Indexer::Settings.new(argSettings)
     settings_check!(settings)
 
     ensure_solrj_loaded!
 
     solr_server # init
+
+    @batched_queue = java.util.concurrent.LinkedBlockingQueue.new
+
+    # when multi-threaded exceptions raised in threads are held here
+    # we need a HIGH performance queue here to try and avoid slowing things down,
+    # since we need to check it frequently.
+    @async_exception_queue = java.util.concurrent.ConcurrentLinkedQueue.new
+
+    # Store error count in an AtomicInteger, so multi threads can increment
+    # it safely, if we're threaded.
+    @skipped_record_incrementer = java.util.concurrent.atomic.AtomicInteger.new(0)
+
+    # if our thread pool settings are 0, it'll just create a null threadpool that
+    # executes in calling context.
+    @thread_pool = Traject::ThreadPool.new( @settings["solrj_writer.thread_pool"].to_i )
+
+    @debug_ascii_progress = (@settings["debug_ascii_progress"].to_s == "true")
   end
 
   # Loads solrj if not already loaded. By loading all jars found
@@ -68,29 +137,193 @@ class Traject::SolrJWriter
         end
       end
     end
+
+    # And for now, SILENCE SolrJ logging
+    org.apache.log4j.Logger.getRootLogger().addAppender(org.apache.log4j.varia.NullAppender.new)
+  end
+
+  # Method IS thread-safe, can be called concurrently by multi-threads.
+  #
+  # Why? If not using batched add, we just use the SolrServer, which is already
+  # thread safe itself.
+  #
+  # If we are using batch add, we surround all access to our shared state batch queue
+  # in a mutex -- just a naive implementation. May be able to improve performance
+  # with more sophisticated java.util.concurrent data structure (blocking queue etc)
+  # I did try a java ArrayBlockingQueue or LinkedBlockingQueue instead of our own
+  # mutex -- I did not see consistently different performance. May want to
+  # change so doesn't use a mutex at all if multiple mapping threads aren't being
+  # used.
+  #
+  # this class does not at present use any threads itself, all work will be done
+  # in the calling thread, including actual http transactions to solr via solrj SolrServer
+  # if using batches, then not every #put is a http transaction, but when it is,
+  # it's in the calling thread, synchronously.
+  def put(context)
+    @thread_pool.raise_collected_exception!
+
+    # package the SolrInputDocument along with the context, so we have
+    # the context for error reporting when we actually add.
+
+    package = UpdatePackage.new(hash_to_solr_document(context.output_hash), context)
+
+    if settings["solrj_writer.batch_size"].to_i > 1
+      ready_batch = []
+
+      # Synchronize access to our shared batched_queue state,
+      # but once we've pulled out what we want in local var
+      # `ready_batch`, don't need to synchronize anymore.
+      batched_queue.add(package)
+      if batched_queue.size >= settings["solrj_writer.batch_size"].to_i
+        batched_queue.drain_to(ready_batch)
+      end
+
+      if ready_batch.length > 0
+        if @debug_ascii_progress
+          $stderr.write("^")
+          if @thread_pool.queue && (@thread_pool.queue.size >= @thread_pool.queue_capacity)
+            $stderr.write "!"
+          end
+        end
+
+        @thread_pool.maybe_in_thread_pool { batch_add_document_packages(ready_batch) }
+      end
+    else # non-batched add, add one at a time.
+      @thread_pool.maybe_in_thread_pool { add_one_document_package(package) }
+    end
   end
 
-  def
+  def hash_to_solr_document(hash)
     doc = SolrInputDocument.new
-
     hash.each_pair do |key, value_array|
       value_array.each do |value|
         doc.addField( key, value )
       end
     end
+    return doc
+  end
 
-
-
-
+  # Takes array and batch adds it to solr -- array of UpdatePackage tuples of
+  # SolrInputDocument and context.
+  #
+  # Catches error in batch add, logs, and re-tries docs individually
+  #
+  # Is thread-safe, because SolrServer is thread-safe, and we aren't
+  # referencing any other shared state. Important that CALLER passes
+  # in a doc array that is not shared state, extracting it from
+  # shared state batched_queue in a mutex.
+  def batch_add_document_packages(current_batch)
+    begin
+      a = current_batch.collect {|package| package.solr_document }
+      solr_server.add( a )
+
+      $stderr.write "%" if @debug_ascii_progress
+    rescue Exception => e
+      # Error in batch, none of the docs got added, let's try to re-add
+      # em all individually, so those that CAN get added get added, and those
+      # that can't get individually logged.
+      logger.warn "Error encountered in batch solr add, will re-try documents individually, at a performance penalty...\n" + Traject::Util.exception_to_log_message(e)
+      current_batch.each do |package|
+        add_one_document_package(package)
+      end
+    end
+  end
+
+
+  # Adds a single SolrInputDocument passed in as an UpdatePackage combo of SolrInputDocument
+  # and context.
+  #
+  # Rescues exceptions thrown by SolrServer.add, logs them, and then raises them
+  # again if deemed fatal and should stop indexing. Only intended to be used on a SINGLE
+  # document add. If we get an exception on a multi-doc batch add, we need to recover
+  # differently.
+  def add_one_document_package(package)
+    begin
+      solr_server.add(package.solr_document)
+    # Honestly not sure what the difference is between those types, but SolrJ raises both
+    rescue org.apache.solr.common.SolrException, org.apache.solr.client.solrj.SolrServerException => e
+      id = package.context.source_record && package.context.source_record['001'] && package.context.source_record['001'].value
+      id_str = id ? "001:#{id}" : ""
+
+      position = package.context.position
+      position_str = position ? "at file position #{position} (starting at 1)" : ""
+
+      logger.error("Could not index record #{id_str} #{position_str}\n" + Traject::Util.exception_to_log_message(e) )
+      logger.debug(package.context.source_record.to_s)
+
+      @skipped_record_incrementer.getAndIncrement() # AtomicInteger, thread-safe increment.
+
+      if fatal_exception? e
+        logger.fatal("SolrJ exception judged fatal, raising...")
+        raise e
+      end
+    end
+  end
+
+  def logger
+    settings["logger"] ||= Yell.new(STDERR, :level => "gt.fatal") # null logger
+  end
+
+  # If an exception is encountered talking to Solr, is it one we should
+  # entirely give up on? SolrJ doesn't use a useful exception class hierarchy,
+  # we have to look into its details and guess.
+  def fatal_exception?(e)
+
+
+    root_cause = e.respond_to?(:getRootCause) && e.getRootCause
+
+    # Various kinds of inability to actually talk to the
+    # server look like this:
+    if root_cause.kind_of? java.io.IOException
+      return true
+    end
+
+    return false
   end
 
   def close
-
+    @thread_pool.raise_collected_exception!
+
+    # Any leftovers in batch buffer? Send em to the threadpool too.
+    if batched_queue.length > 0
+      packages = []
+      batched_queue.drain_to(packages)
+
+      # we do it in the thread pool for consistency, and so
+      # it goes to the end of the queue behind any outstanding
+      # work in the pool.
+      @thread_pool.maybe_in_thread_pool { batch_add_document_packages( packages ) }
+    end
+
+    # Wait for shutdown, and time it.
+    logger.debug "SolrJWriter: Shutting down thread pool, waiting if needed..."
+    elapsed = @thread_pool.shutdown_and_wait
+    if elapsed > 60
+      logger.warn "Waited #{elapsed} seconds for all SolrJWriter threads, you may want to increase solrj_writer.thread_pool (currently #{@settings["solrj_writer.thread_pool"]})"
+    end
+    logger.debug "SolrJWriter: Thread pool shutdown complete"
+    logger.warn "SolrJWriter: #{skipped_record_count} skipped records" if skipped_record_count > 0
+
+    # check again now that we've waited, there could still be some
+    # that didn't show up before.
+    @thread_pool.raise_collected_exception!
+
+    if settings["solrj_writer.commit_on_close"].to_s == "true"
+      logger.info "SolrJWriter: Sending commit to solr..."
+      solr_server.commit
+    end
 
     solr_server.shutdown
     @solr_server = nil
   end
 
+  # Return count of encountered skipped records. Most accurate to call
+  # it after #close, in which case it should include full count, even
+  # under async thread_pool.
+  def skipped_record_count
+    @skipped_record_incrementer.get
+  end
+
 
   def solr_server
     @solr_server ||= instantiate_solr_server!
@@ -104,7 +337,8 @@ class Traject::SolrJWriter
     server = server_class.new( settings["solr.url"].to_s );
 
     if parser_name = settings["solrj_writer.parser_class_name"]
-      parser = org.apache.solr.client.solrj.impl.const_get(parser_name).new
+      #parser = org.apache.solr.client.solrj.impl.const_get(parser_name).new
+      parser = Java::JavaClass.for_name("org.apache.solr.client.solrj.impl.#{parser_name}").ruby_class.new
       server.setParser( parser )
     end
 
@@ -115,6 +349,10 @@ class Traject::SolrJWriter
     unless settings.has_key?("solr.url") && ! settings["solr.url"].nil?
       raise ArgumentError.new("SolrJWriter requires a 'solr.url' solr url in settings")
     end
+
+    unless settings["solr.url"] =~ /^#{URI::regexp}$/
+      raise ArgumentError.new("SolrJWriter requires a 'solr.url' setting that looks like a URL, not: `#{settings['solr.url']}`")
+    end
   end
 
-end
+end
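Pulling the new settings together, here is a hedged standalone sketch (JRuby only; the solr url is a placeholder). Normally `Traject::Indexer#process` drives `#put` and `#close` for you, so the `put` call is left commented:

```ruby
require 'traject'
require 'traject/solrj_writer'

writer = Traject::SolrJWriter.new(
  "solr.url"                     => "http://localhost:8983/solr",  # placeholder
  "solrj_writer.batch_size"      => 100,   # the documented sweet spot
  "solrj_writer.thread_pool"     => 4,     # background add threads
  "solrj_writer.commit_on_close" => "true"
)

# writer.put(context)  # context is a Traject::Indexer::Context per record.
# NOTE: with the thread pool, an exception raised by one put may surface
# on a later #put or on #close, per the comments above.

writer.close
puts "Skipped #{writer.skipped_record_count} records" if writer.skipped_record_count > 0
```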