RubyGems - traject - Versions diffs - 2.3.4 → 3.0.0.alpha.1 - Mend

traject 2.3.4 → 3.0.0.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

checksums.yaml +5 -5
data/.travis.yml +16 -9
data/CHANGES.md +74 -1
data/Gemfile +2 -1
data/README.md +104 -53
data/Rakefile +8 -1
data/doc/indexing_rules.md +79 -63
data/doc/programmatic_use.md +218 -0
data/doc/settings.md +28 -1
data/doc/xml.md +134 -0
data/lib/traject.rb +5 -0
data/lib/traject/array_writer.rb +34 -0
data/lib/traject/command_line.rb +18 -22
data/lib/traject/debug_writer.rb +2 -5
data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
data/lib/traject/indexer.rb +321 -92
data/lib/traject/indexer/context.rb +39 -13
data/lib/traject/indexer/marc_indexer.rb +30 -0
data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
data/lib/traject/indexer/settings.rb +36 -53
data/lib/traject/indexer/step.rb +27 -33
data/lib/traject/macros/marc21.rb +37 -12
data/lib/traject/macros/nokogiri_macros.rb +43 -0
data/lib/traject/macros/transformation.rb +162 -0
data/lib/traject/marc_extractor.rb +2 -0
data/lib/traject/ndj_reader.rb +1 -1
data/lib/traject/nokogiri_reader.rb +179 -0
data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
data/lib/traject/solr_json_writer.rb +19 -12
data/lib/traject/thread_pool.rb +13 -0
data/lib/traject/util.rb +14 -2
data/lib/traject/version.rb +1 -1
data/test/debug_writer_test.rb +3 -3
data/test/delimited_writer_test.rb +3 -3
data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
data/test/indexer/context_test.rb +23 -13
data/test/indexer/error_handler_test.rb +59 -0
data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
data/test/indexer/macros/to_field_test.rb +2 -2
data/test/indexer/macros/transformation_test.rb +177 -0
data/test/indexer/map_record_test.rb +2 -3
data/test/indexer/nokogiri_indexer_test.rb +103 -0
data/test/indexer/process_record_test.rb +55 -0
data/test/indexer/process_with_test.rb +148 -0
data/test/indexer/read_write_test.rb +52 -2
data/test/indexer/settings_test.rb +34 -24
data/test/indexer/to_field_test.rb +27 -2
data/test/marc_extractor_test.rb +7 -7
data/test/marc_reader_test.rb +4 -4
data/test/nokogiri_reader_test.rb +158 -0
data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
data/test/solr_json_writer_test.rb +24 -28
data/test/test_helper.rb +8 -2
data/test/test_support/namespace-test.xml +7 -0
data/test/test_support/nokogiri_demo_config.rb +17 -0
data/test/test_support/oai-pmh-one-record-2.xml +24 -0
data/test/test_support/oai-pmh-one-record-first.xml +24 -0
data/test/test_support/sample-oai-no-namespace.xml +197 -0
data/test/test_support/sample-oai-pmh.xml +197 -0
data/test/thread_pool_test.rb +38 -0
data/test/translation_map_test.rb +3 -3
data/test/translation_maps/ruby_map.rb +2 -1
data/test/translation_maps/yaml_map.yaml +2 -1
data/traject.gemspec +4 -11
metadata +92 -6

data/lib/traject/debug_writer.rb CHANGED

@@ -40,12 +40,9 @@ class Traject::DebugWriter < Traject::LineWriter
     @idfield = settings["debug_writer.idfield"] || DEFAULT_IDFIELD
     @format  = settings['debug_writer.format'] || DEFAULT_FORMAT
-    if @idfield == 'record_position' then
-      @use_position = true
-    end
+    @use_position = (@idfield == 'record_position')
     @already_threw_warning_about_missing_id = false
   end
   def record_number(context)
@@ -54,7 +51,7 @@ class Traject::DebugWriter < Traject::LineWriter
       context.output_hash[@idfield].first
     else
       unless @already_threw_warning_about_missing_id
-        context.logger.warn "At least one record (##{context.position}) doesn't define field '#{@idfield}'.
+        context.logger.warn "At least one record (#{context.record_inspect}) doesn't define field '#{@idfield}'.
 All records are assumed to have a unique id. You can set which field to look in via the setting 'debug_writer.idfield'"
         @already_threw_warning_about_missing_id = true
       end

data/lib/traject/experimental_nokogiri_streaming_reader.rb ADDED

@@ -0,0 +1,276 @@
+module Traject
+  # An EXPERIMENTAL HALF-FINISHED implementation of a streaming/pull reader using Nokogiri.
+  # Not ready for use, not stable API, could go away.
+  #
+  # This was my first try at a NokogiriReader implementation, it didn't work out, at least without
+  # a lot more work. I think we'd need to re-do it to build the Nokogiri::XML::Nodes by hand as the
+  # source is traversed, instead of relying on #outer_xml -- outer_xml returning a string results in a double-parsing,
+  # with the expected 50% performance hit.  Picadillos in Nokogiri JRuby namespace handling don't help.
+  #
+  # All in all, it's possible something could be gotten here with a lot more work, it's also possible
+  # Nokogiri's antipathy to namespaces could keep getting in the way.
+  class ExperimentalNokogiriStreamingReader
+    include Enumerable
+    attr_reader :settings, :input_stream, :clipboard, :path_tracker
+    def initialize(input_stream, settings)
+      @settings = Traject::Indexer::Settings.new settings
+      @input_stream = input_stream
+      @clipboard = Traject::Util.is_jruby? ? Concurrent::Map.new : Concurrent::Hash.new
+      if each_record_xpath
+        @path_tracker = PathTracker.new(each_record_xpath,
+                                          clipboard: self.clipboard,
+                                          namespaces: default_namespaces,
+                                          extra_xpath_hooks: extra_xpath_hooks)
+      end
+      default_namespaces # trigger validation
+      validate_limited_xpath(each_record_xpath, key_name: "each_record_xpath")
+    end
+    def each_record_xpath
+      @each_record_xpath ||= settings["nokogiri.each_record_xpath"]
+    end
+    def extra_xpath_hooks
+      @extra_xpath_hooks ||= begin
+        (settings["nokogiri_reader.extra_xpath_hooks"] || {}).tap do |hash|
+          hash.each_pair do |limited_xpath, callable|
+            validate_limited_xpath(limited_xpath, key_name: "nokogiri_reader.extra_xpath_hooks")
+          end
+        end
+      end
+    end
+    protected def validate_limited_xpath(each_record_xpath, key_name:)
+      return unless each_record_xpath
+      components = each_record_xpath.split('/')
+      components.each do |component|
+        prefix, element = component.split(':')
+        unless element
+          # there was no namespace
+          prefix, element = nil, prefix
+        end
+        # We don't support brackets or any xpath beyond the MOST simple.
+        # Catch a few we can catch.
+        if element =~ /::/ || element =~ /[\[\]]/
+          raise ArgumentError, "#{key_name}: Only very simple xpaths supported. '//some/path' or '/some/path'. Not: #{each_record_xpath.inspect}"
+        end
+        if prefix
+          ns_uri = default_namespaces[prefix]
+          if ns_uri.nil?
+            raise ArgumentError, "each_record_xpath: Can't find namespace prefix '#{prefix}' in '#{each_record_xpath}'. To use a namespace in each_record_xpath, it has to be registered with nokogiri.namespaces: #{default_namespaces.inspect}"
+          end
+        end
+      end
+      each_record_xpath
+    end
+    def default_namespaces
+      @default_namespaces ||= (settings["nokogiri.namespaces"] || {}).tap { |ns|
+        unless ns.kind_of?(Hash)
+          raise ArgumentError, "nokogiri.namespaces must be a hash, not: #{ns.inspect}"
+        end
+      }
+    end
+    def each
+      unless each_record_xpath
+        # forget streaming, just read it and return it once, done.
+        yield Nokogiri::XML.parse(input_stream)
+        return
+      end
+      reader = Nokogiri::XML::Reader(input_stream)
+      reader.each do |reader_node|
+        if reader_node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
+          path_tracker.push(reader_node)
+          if path_tracker.match?
+            yield path_tracker.current_node_doc
+          end
+          path_tracker.run_extra_xpath_hooks
+          if reader_node.self_closing?
+            path_tracker.pop
+          end
+        end
+        if reader_node.node_type == Nokogiri::XML::Reader::TYPE_END_ELEMENT
+          path_tracker.pop
+        end
+      end
+    end
+    private
+    # initialized with the specification (a very small subset of xpath) for
+    # what records to yield-on-each.  Tests to see if a Nokogiri::XML::Reader
+    # node matches spec.
+    #
+    #    '//record'
+    # or anchored to root:
+    #   '/body/head/meta' same thing as './body/head/meta' or 'head/meta'
+    #
+    # Elements can (and must, to match) have XML namespaces, if and only if
+    # they are registered with settings nokogiri.namespaces
+    #
+    # sadly JRuby Nokogiri has an incompatibility with true nokogiri, and
+    # doesn't preserve our namespaces on outer_xml,
+    # so in JRuby we have to track them ourselves, and then also do yet ANOTHER
+    # parse in nokogiri. This may make this in Java even LESS performant, I'm afraid.
+    class PathTracker
+      attr_reader :path_spec, :inverted_namespaces, :current_path, :namespaces_stack, :extra_xpath_hooks, :clipboard
+      def initialize(str_spec, clipboard:, namespaces: {}, extra_xpath_hooks: {})
+        @inverted_namespaces  = namespaces.invert
+        @clipboard = clipboard
+        # We're guessing using a string will be more efficient than an array
+        @current_path         = ""
+        @floating             = false
+        @path_spec, @floating = parse_path(str_spec)
+        @namespaces_stack = []
+        @extra_xpath_hooks = extra_xpath_hooks.collect do |path, callable|
+          bare_path, floating = parse_path(path)
+          {
+            path: bare_path,
+            floating: floating,
+            callable: callable
+          }
+        end
+      end
+      # returns [bare_path, is_floating]
+      protected def parse_path(str_spec)
+        floating = false
+        if str_spec.start_with?('//')
+          str_spec = str_spec.slice(2..-1)
+          floating = true
+        else
+          str_spec = str_spec.slice(1..-1) if str_spec.start_with?(".")
+          str_spec = "/" + str_spec unless str_spec.start_with?("/")
+        end
+        return [str_spec, floating]
+      end
+      def is_jruby?
+        Traject::Util.is_jruby?
+      end
+      # adds a component to slash-separated current_path, with namespace prefix.
+      def push(reader_node)
+        namespace_prefix = reader_node.namespace_uri && inverted_namespaces[reader_node.namespace_uri]
+        # gah, reader_node.name has the namespace prefix in there
+        node_name = reader_node.name.gsub(/[^:]+:/, '')
+        node_str = if namespace_prefix
+          namespace_prefix + ":" + node_name
+        else
+          reader_node.name
+        end
+        current_path << ("/" + node_str)
+        if is_jruby?
+          namespaces_stack << reader_node.namespaces
+        end
+        @current_node = reader_node
+      end
+      def current_node_doc
+        return nil unless @current_node
+        # yeah, sadly we got to have nokogiri parse it again
+        fix_namespaces(Nokogiri::XML.parse(@current_node.outer_xml))
+      end
+      # removes the last slash-separated component from current_path
+      def pop
+        current_path.slice!( current_path.rindex('/')..-1 )
+        @current_node = nil
+        if is_jruby?
+          namespaces_stack.pop
+        end
+      end
+      def floating?
+        !!@floating
+      end
+      def match?
+        match_path?(path_spec, floating: floating?)
+      end
+      def match_path?(path_to_match, floating:)
+        if floating?
+          current_path.end_with?(path_to_match)
+        else
+          current_path == path_to_match
+        end
+      end
+      def run_extra_xpath_hooks
+        return unless @current_node
+        extra_xpath_hooks.each do |hook_spec|
+          if match_path?(hook_spec[:path], floating: hook_spec[:floating])
+            hook_spec[:callable].call(current_node_doc, clipboard)
+          end
+        end
+      end
+      # no-op unless it's jruby, and then we use our namespace stack to
+      # correctly add namespaces to the Nokogiri::XML::Document, cause
+      # in Jruby outer_xml on the Reader doesn't do it for us. :(
+      def fix_namespaces(doc)
+        if is_jruby?
+          # Only needed in jruby, nokogiri's jruby implementation isn't weird
+          # around namespaces in exactly the same way as MRI. We need to keep
+          # track of the namespaces in outer contexts ourselves, and then see
+          # if they are needed ourselves. :(
+          namespaces = namespaces_stack.compact.reduce({}, :merge)
+          default_ns = namespaces.delete("xmlns")
+          namespaces.each_pair do |attrib, uri|
+            ns_prefix = attrib.sub(/\Axmlns:/, '')
+            # gotta make sure it's actually used in the doc to not add it
+            # unecessarily. GAH.
+            if    doc.xpath("//*[starts-with(name(), '#{ns_prefix}:')][1]").empty? &&
+                  doc.xpath("//@*[starts-with(name(), '#{ns_prefix}:')][1]").empty?
+              next
+            end
+            doc.root.add_namespace_definition(ns_prefix, uri)
+          end
+          if default_ns
+            doc.root.default_namespace = default_ns
+            # OMG nokogiri, really?
+            default_ns = doc.root.namespace
+            doc.xpath("//*[namespace-uri()='']").each do |node|
+              node.namespace = default_ns
+            end
+          end
+        end
+        return doc
+      end
+    end
+  end
+end

data/lib/traject/hashie/indifferent_access_fix.rb ADDED

@@ -0,0 +1,25 @@
+require 'hashie'
+module Traject
+  module Hashie
+    # Backporting fix from https://github.com/intridea/hashie/commit/a82c594710e1bc9460d3de4d2989cb700f4c3c7f
+    # into Hashie.
+    #
+    # This makes merge(ordinary_hash) on a Hash that has IndifferentAccess included work, without
+    # raising. Which we needed.
+    #
+    # As of this writing that fix is not available in a Hashie release, if it becomes so
+    # later than this monkey-patch may no longer be required, we can just depend on fixed version.
+    #
+    # See also https://github.com/intridea/hashie/issues/451
+    module IndifferentAccessFix
+      def merge(*args)
+        result = super
+        ::Hashie::Extensions::IndifferentAccess.inject!(result) if hash_lacking_indifference?(result)
+        result.convert!
+      end
+    end
+  end
+end
+Hashie::Extensions::IndifferentAccess.include(Traject::Hashie::IndifferentAccessFix)

data/lib/traject/indexer.rb CHANGED

@@ -11,14 +11,14 @@ require 'traject/marc_reader'
 require 'traject/json_writer'
 require 'traject/solr_json_writer'
 require 'traject/debug_writer'
+require 'traject/array_writer'
 require 'traject/macros/marc21'
 require 'traject/macros/basic'
+require 'traject/macros/transformation'
+require 'traject/indexer/marc_indexer'
-if defined? JRUBY_VERSION
-  require 'traject/marc4j_reader'
-end
 # This class does indexing for traject: Getting input records from a Reader
 # class, mapping the input records to an output hash, and then sending the output
@@ -157,33 +157,39 @@ end
 # inconveient for you, we'd like to know your use case and improve things.
 #
 class Traject::Indexer
-  # Arity error on a passed block
-  class ArityError < ArgumentError;
-  end
-  class NamingError < ArgumentError;
-  end
+  CompletedStateError = Class.new(StandardError)
+  ArityError          = Class.new(ArgumentError)
+  NamingError         = Class.new(ArgumentError)
   include Traject::QualifiedConstGet
+  extend Traject::QualifiedConstGet
   attr_writer :reader_class, :writer_class, :writer
-  # For now we hard-code these basic macro's included
-  # TODO, make these added with extend per-indexer,
-  # added by default but easily turned off (or have other
-  # default macro modules provided)
-  include Traject::Macros::Marc21
   include Traject::Macros::Basic
+  include Traject::Macros::Transformation
   # optional hash or Traject::Indexer::Settings object of settings.
-  def initialize(arg_settings = {})
-    @settings               = Settings.new(arg_settings)
+  # optionally takes a block which is instance_eval'd in the indexer,
+  # intended for configuration similar to what would be in a config file.
+  def initialize(arg_settings = {}, &block)
+    @writer_class           = nil
+    @completed              = false
+    @settings               = Settings.new(arg_settings).with_defaults(self.class.default_settings)
     @index_steps            = []
     @after_processing_steps = []
+    instance_eval(&block) if block
+  end
+  # Right now just does an `instance_eval`, but encouraged in case we change the underlying
+  # implementation later, and to make intent more clear.
+  def configure(&block)
+    instance_eval(&block)
   end
   # Pass a string file path, a Pathname, or a File object, for
   # a config file to load into indexer.
   #
@@ -234,16 +240,81 @@ class Traject::Indexer
   def settings(new_settings = nil, &block)
     @settings.merge!(new_settings) if new_settings
-    @settings.instance_eval &block if block_given?
+    @settings.instance_eval(&block) if block_given?
     return @settings
   end
+  # Hash is frozen to avoid inheritance-mutability confusion.
+  def self.default_settings
+    @default_settings ||= {
+        # Writer defaults
+        "writer_class_name"       => "Traject::SolrJsonWriter",
+        "solr_writer.batch_size"  => 100,
+        "solr_writer.thread_pool" => 1,
+        # Threading and logging
+        "processing_thread_pool"  => Traject::Indexer::Settings.default_processing_thread_pool,
+        "log.batch_size.severity" => "info",
+        # how to post-process the accumulator
+        "allow_nil_values"        => false,
+        "allow_duplicate_values"  => true,
+        "allow_empty_fields"      => false
+    }.freeze
+  end
+  # Not sure if allowing changing of default_settings is a good idea, but we do
+  # use it in test. For now we make it private to require extreme measures to do it,
+  # and advertise that this API could go away or change without a major version release,
+  # it is experimental and internal.
+  private_class_method def self.default_settings=(settings)
+    # Stored as given: unlike the built-in defaults, nothing here freezes the hash.
+    @default_settings = settings
+  end
+  # Sub-classes should override to return a _proc_ object that takes one arg,
+  # a source record, and returns an identifier for it that can be used in
+  # logged messages. This differs depending on input record format, is why we
+  # leave it to sub-classes.
+  def source_record_id_proc
+    if defined?(@@legacy_marc_mode) && @@legacy_marc_mode
+      return @source_record_id_proc ||= lambda do |source_marc_record|
+        if ( source_marc_record &&
+             source_marc_record.kind_of?(MARC::Record) &&
+             source_marc_record['001'] )
+          source_marc_record['001'].value
+        end
+      end
+    end
+    @source_record_id_proc ||= lambda { |source| nil }
+  end
+  def self.legacy_marc_mode!
+    @@legacy_marc_mode = true
+    # include legacy Marc macros
+    include Traject::Macros::Marc21
+    # Reader defaults
+    legacy_settings = {
+      "reader_class_name"       => "Traject::MarcReader",
+      "marc_source.type"        => "binary",
+    }
+    default_settings.merge!(legacy_settings)
+    self
+  end
   # Part of DSL, used to define an indexing mapping. Register logic
   # to be called for each record, and generate values for a particular
-  # output field.
-  def to_field(field_name, aLambda = nil, &block)
-    @index_steps << ToFieldStep.new(field_name, aLambda, block, Traject::Util.extract_caller_location(caller.first))
+  # output field. The first field_name argument can be a single string, or
+  # an array of multiple strings -- in the latter case, the processed values
+  # will be added to each field mentioned.
+  def to_field(field_name, *procs, &block)
+    @index_steps << ToFieldStep.new(field_name, procs, block, Traject::Util.extract_caller_location(caller.first))
   end
   # Part of DSL, register logic to be called for each record
@@ -313,14 +384,33 @@ class Traject::Indexer
   # this indexer. Returns the output hash (a hash whose keys are
   # string fields, and values are arrays of one or more values in that field)
   #
+  # If the record is marked `skip` as part of processing, this will return
+  # nil.
+  #
   # This is a convenience shortcut for #map_to_context! -- use that one
   # if you want to provide addtional context
   # like position, and/or get back the full context.
   def map_record(record)
-    context = Context.new(:source_record => record, :settings => settings)
+    context = Context.new(:source_record => record, :settings => settings, :source_record_id_proc => source_record_id_proc, :logger => logger)
+    map_to_context!(context)
+    return context.output_hash unless context.skip?
+  end
+  # Takes a single record, maps it, and sends it to the instance-configured
+  # writer. No threading, no logging, no error handling. Respects skipped
+  # records by not adding them. Returns the Traject::Indexer::Context.
+  #
+  # Aliased as #<<
+  def process_record(record)
+    check_uncompleted
+    context = Context.new(:source_record => record, :settings => settings, :source_record_id_proc =>  source_record_id_proc, :logger => logger)
     map_to_context!(context)
-    return context.output_hash
+    writer.put( context ) unless context.skip?
+    return context
   end
+  alias_method :<<, :process_record
   # Maps a single record INTO the second argument, a Traject::Indexer::Context.
   #
@@ -342,7 +432,7 @@ class Traject::Indexer
       # Set the index step for error reporting
       context.index_step = index_step
-      log_mapping_errors(context, index_step) do
+      handle_mapping_errors(context) do
         index_step.execute(context) # will always return [] for an each_record step
       end
@@ -353,31 +443,40 @@ class Traject::Indexer
     return context
   end
-  # just a wrapper that captures and records any unexpected
-  # errors raised in mapping, along with contextual information
-  # on record and location in source file of mapping rule.
+  protected def default_mapping_rescue
+    @default_mapping_rescue ||= lambda do |context, exception|
+      msg = "Unexpected error on record #{context.record_inspect}\n"
+      msg += "    while executing #{context.index_step.inspect}\n"
+      msg += begin
+        "\n    Record: #{context.source_record.to_s}\n"
+      rescue StandardError => to_s_exception
+        "\n    (Could not log record, #{to_s_exception})\n"
+      end
+      msg += Traject::Util.exception_to_log_message(exception)
+      context.logger.error(msg) if context.logger
+      raise exception
+    end
+  end
+  # just a wrapper that catches any errors, and handles them. By default, logs
+  # and re-raises. But you can set custom setting `mapping_rescue`
+  # to customize
   #
-  # Re-raises error at the moment.
   #
-  # log_mapping_errors(context, index_step) do
+  # handle_mapping_errors(context) do
   #    all_sorts_of_stuff # that will have errors logged
   # end
-  def log_mapping_errors(context, index_step)
+  protected def handle_mapping_errors(context)
     begin
       yield
-    rescue Exception => e
-      msg = "Unexpected error on record id `#{context.source_record_id}` at file position #{context.position}\n"
-      msg += "    while executing #{index_step.inspect}\n"
-      msg += Traject::Util.exception_to_log_message(e)
-      logger.error msg
-      begin
-        logger.debug "Record: " + context.source_record.to_s
-      rescue Exception => marc_to_s_exception
-        logger.debug "(Could not log record, #{marc_to_s_exception})"
-      end
-      raise e
+    rescue StandardError => e
+      error_handler = settings["mapping_rescue"] || default_mapping_rescue
+      error_handler.call(context, e)
     end
   end
@@ -385,67 +484,80 @@ class Traject::Indexer
   # mapping according to configured mapping rules, and then writing
   # to configured Writer.
   #
+  # You can instead give it an _array_ of streams, as well.
+  #
   # returns 'false' as a signal to command line to return non-zero exit code
   # for some reason (reason found in logs, presumably). This particular mechanism
   # is open to complexification, starting simple. We do need SOME way to return
   # non-zero to command line.
   #
-  def process(io_stream)
+  # @param [#read, Array<#read>]
+  def process(io_stream_or_array)
+    check_uncompleted
     settings.fill_in_defaults!
     count      = 0
     start_time = batch_start_time = Time.now
-    logger.debug "beginning Indexer#process with settings: #{settings.inspect}"
-    reader = self.reader!(io_stream)
+    logger.debug "beginning Traject::Indexer#process with settings: #{settings.inspect}"
     processing_threads = settings["processing_thread_pool"].to_i
     thread_pool        = Traject::ThreadPool.new(processing_threads)
-    logger.info "   Indexer with #{processing_threads} processing threads, reader: #{reader.class.name} and writer: #{writer.class.name}"
+    logger.info "   Traject::Indexer with #{processing_threads} processing threads, reader: #{reader_class.name} and writer: #{writer.class.name}"
-    log_batch_size = settings["log.batch_size"] && settings["log.batch_size"].to_i
+    #io_stream can now be an array of io_streams.
+    (io_stream_or_array.kind_of?(Array) ? io_stream_or_array : [io_stream_or_array]).each do |io_stream|
+      reader = self.reader!(io_stream)
+      input_name = Traject::Util.io_name(io_stream)
+      position_in_input = 0
-    reader.each do |record; position |
-      count    += 1
+      log_batch_size = settings["log.batch_size"] && settings["log.batch_size"].to_i
-      # have to use a block local var, so the changing `count` one
-      # doesn't get caught in the closure. Weird, yeah.
-      position = count
+      reader.each do |record; safe_count, safe_position_in_input |
+        count    += 1
+        position_in_input += 1
-      thread_pool.raise_collected_exception!
+        # have to use a block local var, so the changing `count` one
+        # doesn't get caught in the closure. Don't totally get it, but
+        # I think it's so.
+        safe_count, safe_position_in_input = count, position_in_input
-      if settings["debug_ascii_progress"].to_s == "true"
-        $stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
-      end
+        thread_pool.raise_collected_exception!
-      context = Context.new(
-          :source_record => record,
-          :settings      => settings,
-          :position      => position,
-          :logger        => logger
-      )
-      if log_batch_size && (count % log_batch_size == 0)
-        batch_rps   = log_batch_size / (Time.now - batch_start_time)
-        overall_rps = count / (Time.now - start_time)
-        logger.send(settings["log.batch_size.severity"].downcase.to_sym, "Traject::Indexer#process, read #{count} records at id:#{context.source_record_id}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
-        batch_start_time = Time.now
-      end
+        if settings["debug_ascii_progress"].to_s == "true"
+          $stderr.write "." if count % settings["solr_writer.batch_size"].to_i == 0
+        end
-      # We pass context in a block arg to properly 'capture' it, so
-      # we don't accidentally share the local var under closure between
-      # threads.
-      thread_pool.maybe_in_thread_pool(context) do |context|
-        map_to_context!(context)
-        if context.skip?
-          log_skip(context)
-        else
-          writer.put context
+        context = Context.new(
+            :source_record => record,
+            :source_record_id_proc => source_record_id_proc,
+            :settings      => settings,
+            :position      => safe_count,
+            :input_name    => input_name,
+            :position_in_input => safe_position_in_input,
+            :logger        => logger
+        )
+        if log_batch_size && (count % log_batch_size == 0)
+          batch_rps   = log_batch_size / (Time.now - batch_start_time)
+          overall_rps = count / (Time.now - start_time)
+          logger.send(settings["log.batch_size.severity"].downcase.to_sym, "Traject::Indexer#process, read #{count} records at: #{context.source_inspect}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
+          batch_start_time = Time.now
         end
+        # We pass context in a block arg to properly 'capture' it, so
+        # we don't accidentally share the local var under closure between
+        # threads.
+        thread_pool.maybe_in_thread_pool(context) do |t_context|
+          map_to_context!(t_context)
+          if context.skip?
+            log_skip(t_context)
+          else
+            writer.put t_context
+          end
+        end
       end
     end
     $stderr.write "\n" if settings["debug_ascii_progress"].to_s == "true"
@@ -455,39 +567,156 @@ class Traject::Indexer
     thread_pool.raise_collected_exception!
+    complete
+    elapsed = Time.now - start_time
+    avg_rps = (count / elapsed)
+    logger.info "finished Traject::Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
+    if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
+      logger.error "Traject::Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
+      return false
+    end
+    return true
+  end
+  def completed?
+    @completed
+  end
+  # Instance variable readers and writers are not generally re-usable.
+  # The writer may have been closed. The reader does its thing and doesn't
+  # rewind. If we're completed, as a sanity check don't let someone do
+  # something with the indexer that uses the reader or writer and isn't gonna work.
+  protected def check_uncompleted
+    if completed?
+      raise CompletedStateError.new("This Traject::Indexer has been completed, and it's reader and writer are not in a usable state")
+    end
+  end
+  # Closes the writer (which may flush/save/finalize buffered records),
+  # and calls run_after_processing_steps
+  def complete
     writer.close if writer.respond_to?(:close)
+    run_after_processing_steps
+    # after an indexer has been completed, it is not really usable anymore,
+    # as the writer has been closed.
+    @completed = true
+  end
+  def run_after_processing_steps
     @after_processing_steps.each do |step|
       begin
         step.execute
-      rescue Exception => e
+      rescue StandardError => e
         logger.fatal("Unexpected exception #{e} when executing #{step}")
         raise e
       end
     end
+  end
-    elapsed = Time.now - start_time
-    avg_rps = (count / elapsed)
-    logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
+  # A light-weight process method meant for programmatic use, generally
+  # intended for only a "few" (not millions) of records.
+  #
+  # It does _not_ use instance-configured reader or writer, instead taking
+  # a source/reader and destination/writer as arguments to this call.
+  #
+  # The reader can be anything that has an #each returning source
+  # records. This includes an ordinary array of source records, or any
+  # traject Reader.
+  #
+  # The writer can be anything with a #put method taking a Traject::Indexer::Context.
+  # For convenience, see the Traject::ArrayWriter that just collects output in an array.
+  #
+  # Return value of process_with is the writer passed as second arg, for your convenience.
+  #
+  # This does much less than the full #process method, to be more flexible
+  # and make fewer assumptions:
+  #
+  #  * Will never use any additional threads (unless writer does). Wrap in your own threading if desired.
+  #  * Will not do any standard logging or progress bars, regardless of indexer settings.
+  #    Log yourself if desired.
+  #  * Will _not_ call any `after_processing` steps. Call yourself with `indexer.run_after_processing_steps` as desired.
+  #  * WILL by default call #close on the writer, IF the writer has a #close method.
+  #    pass `:close_writer => false` to not do so.
+  #  * exceptions will just raise out, unless you pass in a `rescue_with:` option, whose value is a proc/lambda
+  #    that will receive two args, context and exception. If the rescue proc doesn't re-raise,
+  #    `process_with` will continue to process subsequent records.
+  #
+  # @example
+  #     array_writer_instance = indexer.process_with([record1, record2], Traject::ArrayWriter.new)
+  #
+  # @example With a block, in addition to or instead of a writer.
+  #
+  #     indexer.process_with([record]) do |context|
+  #       do_something_with(context.output_hash)
+  #     end
+  #
+  # @param source [#each]
+  # @param destination [#put]
+  # @param close_writer whether the destination should have #close called on it, if it responds to #close.
+  # @param rescue_with [Proc] to call on errors, taking two args: A Traject::Indexer::Context and an exception.
+  #   If nil (default), exceptions will be raised out. If set, you can raise or handle otherwise if you like.
+  # @param on_skipped [Proc] will be called for any skipped records, with one arg Traject::Indexer::Context
+  def process_with(source, destination = nil, close_writer: true, rescue_with: nil, on_skipped: nil)
+    unless destination || block_given?
+      raise ArgumentError, "Need either a second arg writer/destination, or a block"
+    end
-    if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
-      logger.error "Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
-      return false
+    settings.fill_in_defaults!
+    position = 0
+    input_name = Traject::Util.io_name(source)
+    source.each do |record |
+      begin
+        position += 1
+        context = Context.new(
+            :source_record          => record,
+            :source_record_id_proc  => source_record_id_proc,
+            :settings               => settings,
+            :position               => position,
+            :position_in_input      => (position if input_name),
+            :logger                 => logger
+        )
+        map_to_context!(context)
+        if context.skip?
+          on_skipped.call(context) if on_skipped
+        else
+          destination.put(context) if destination
+          yield(context) if block_given?
+        end
+      rescue StandardError => e
+        if rescue_with
+          rescue_with.call(context, e)
+        else
+          raise e
+        end
+      end
     end
-    return true
+    if close_writer && destination.respond_to?(:close)
+      destination.close
+    end
+    return destination
   end
   # Log that the current record is being skipped, using
   # data in context.position and context.skipmessage
   def log_skip(context)
-    logger.debug "Skipped record #{context.position}: #{context.skipmessage}"
+    logger.debug "Skipped record #{context.record_inspect}: #{context.skipmessage}"
   end
   def reader_class
     unless defined? @reader_class
-      @reader_class = qualified_const_get(settings["reader_class_name"])
+      reader_class_name = settings["reader_class_name"]
+      @reader_class = qualified_const_get(reader_class_name)
     end
     return @reader_class
   end