traject 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +1 -0
- data/README.md +183 -191
- data/bench/bench.rb +1 -1
- data/doc/batch_execution.md +14 -0
- data/doc/extending.md +14 -12
- data/doc/indexing_rules.md +265 -0
- data/lib/traject/command_line.rb +12 -41
- data/lib/traject/debug_writer.rb +32 -13
- data/lib/traject/indexer.rb +101 -24
- data/lib/traject/indexer/settings.rb +18 -17
- data/lib/traject/json_writer.rb +32 -11
- data/lib/traject/line_writer.rb +6 -6
- data/lib/traject/macros/basic.rb +1 -1
- data/lib/traject/macros/marc21.rb +17 -13
- data/lib/traject/macros/marc21_semantics.rb +27 -25
- data/lib/traject/macros/marc_format_classifier.rb +39 -25
- data/lib/traject/marc4j_reader.rb +36 -22
- data/lib/traject/marc_extractor.rb +79 -75
- data/lib/traject/marc_reader.rb +33 -25
- data/lib/traject/mock_reader.rb +9 -10
- data/lib/traject/ndj_reader.rb +7 -7
- data/lib/traject/null_writer.rb +1 -1
- data/lib/traject/qualified_const_get.rb +12 -2
- data/lib/traject/solrj_writer.rb +61 -52
- data/lib/traject/thread_pool.rb +45 -45
- data/lib/traject/translation_map.rb +59 -27
- data/lib/traject/util.rb +3 -3
- data/lib/traject/version.rb +1 -1
- data/lib/traject/yaml_writer.rb +1 -1
- data/test/debug_writer_test.rb +7 -7
- data/test/indexer/each_record_test.rb +4 -4
- data/test/indexer/macros_marc21_semantics_test.rb +12 -12
- data/test/indexer/macros_marc21_test.rb +10 -10
- data/test/indexer/macros_test.rb +1 -1
- data/test/indexer/map_record_test.rb +6 -6
- data/test/indexer/read_write_test.rb +43 -4
- data/test/indexer/settings_test.rb +2 -2
- data/test/indexer/to_field_test.rb +8 -8
- data/test/marc4j_reader_test.rb +4 -4
- data/test/marc_extractor_test.rb +33 -25
- data/test/marc_format_classifier_test.rb +3 -3
- data/test/marc_reader_test.rb +2 -2
- data/test/test_helper.rb +3 -3
- data/test/test_support/demo_config.rb +52 -48
- data/test/translation_map_test.rb +22 -4
- data/test/translation_maps/bad_ruby.rb +2 -2
- data/test/translation_maps/both_map.rb +1 -1
- data/test/translation_maps/default_literal.rb +1 -1
- data/test/translation_maps/default_passthrough.rb +1 -1
- data/test/translation_maps/ruby_map.rb +1 -1
- metadata +7 -31
- data/doc/macros.md +0 -103
data/lib/traject/marc_reader.rb
CHANGED
@@ -1,35 +1,43 @@
|
|
1
1
|
require 'marc'
|
2
|
-
require 'traject/ndj_reader'
|
2
|
+
require 'traject/ndj_reader'
|
3
3
|
|
4
|
-
#
|
5
|
-
# MARC
|
4
|
+
# `Traject::MarcReader` uses pure ruby marc gem to parse MARC records. It
|
5
|
+
# can read MARC ISO 2709 ('binary'), MARC-XML, and Marc-in-json (newline-delimited-json).
|
6
6
|
#
|
7
|
-
#
|
7
|
+
# MarcReader can not currently read binary MARC in the MARC8 encoding, see
|
8
|
+
# the Traject::Marc4JReader instead.
|
8
9
|
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
# de-serialization.
|
10
|
+
# By default assumes binary MARC encoding, please set marc_source.type setting
|
11
|
+
# for XML or json.
|
12
12
|
#
|
13
|
-
#
|
14
|
-
|
13
|
+
# ## Settings
|
14
|
+
|
15
|
+
# * "marc_source.type": serialization type. default 'binary'
|
16
|
+
# * "binary". standard ISO 2709 "binary" MARC format.
|
17
|
+
# * "xml", MarcXML
|
18
|
+
# * "json" The "marc-in-json" format, encoded as newline-separated
|
19
|
+
# json. (synonym 'ndj'). A simplistic newline-separated json, with no comments
|
20
|
+
# allowed, and no unescaped internal newlines allowed in the json
|
21
|
+
# objects -- we just read line by line, and assume each line is a
|
22
|
+
# marc-in-json. http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/
|
23
|
+
# * "marc_reader.xml_parser": For XML type, which XML parser to tell Marc::Reader
|
24
|
+
# to use. Anything recognized by [Marc::Reader :parser
|
25
|
+
# argument](http://rdoc.info/github/ruby-marc/ruby-marc/MARC/XMLReader).
|
26
|
+
# By default, asks Marc::Reader to take
|
27
|
+
# its best guess as to highest performance available
|
28
|
+
# installed option. Probably best to leave as default.
|
29
|
+
#
|
30
|
+
# ## Example
|
31
|
+
#
|
32
|
+
# In a configuration file:
|
15
33
|
#
|
16
|
-
#
|
17
|
-
# ["marc_source.type"] serialization type. default 'binary'
|
18
|
-
# * "binary". Actual marc.
|
19
|
-
# * "xml", MarcXML
|
20
|
-
# * "json" The "marc-in-json" format, encoded as newline-separated
|
21
|
-
# json. A simplistic newline-separated json, with no comments
|
22
|
-
# allowed, and no unescpaed internal newlines allowed in the json
|
23
|
-
# objects -- we just read line by line, and assume each line is a
|
24
|
-
# marc-in-json. http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/
|
25
|
-
# ["marc_reader.xml_parser"] For XML type, which XML parser to tell Marc::Reader
|
26
|
-
# to use. Anything recognized by Marc::Reader :parser
|
27
|
-
# argument. By default, asks Marc::Reader to take
|
28
|
-
# it's best guess as to highest performance available
|
29
|
-
# installed option.
|
34
|
+
# require 'traject/marc_reader'
|
30
35
|
#
|
36
|
+
# settings do
|
37
|
+
# provide "reader_class_name", "Traject::MarcReader"
|
38
|
+
# provide "marc_source.type", "xml"
|
39
|
+
# end
|
31
40
|
#
|
32
|
-
# Can NOT yet read Marc8, input is always assumed UTF8.
|
33
41
|
class Traject::MarcReader
|
34
42
|
include Enumerable
|
35
43
|
|
@@ -64,4 +72,4 @@ class Traject::MarcReader
|
|
64
72
|
self.internal_reader.each(*args, &block)
|
65
73
|
end
|
66
74
|
|
67
|
-
end
|
75
|
+
end
|
data/lib/traject/mock_reader.rb
CHANGED
@@ -10,15 +10,14 @@ module Traject
|
|
10
10
|
#
|
11
11
|
# Specify in a config files as follows:
|
12
12
|
#
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
13
|
+
# require 'traject/mock_writer'
|
14
|
+
# require 'traject/mock_reader'
|
15
|
+
#
|
16
|
+
# settings do
|
17
|
+
# store "reader_class_name", "Traject::MockReader"
|
18
|
+
# store "writer_class_name", "Traject::MockWriter"
|
19
|
+
# store "mock_reader.limit", 4_000 # default is 10_000
|
20
|
+
# end
|
22
21
|
class MockReader
|
23
22
|
|
24
23
|
attr_accessor :limit
|
@@ -50,7 +49,7 @@ module Traject
|
|
50
49
|
while true
|
51
50
|
json = this_file_iter.next
|
52
51
|
next unless json =~ /\S/
|
53
|
-
records << MARC::Record.new_from_hash(JSON.parse(json))
|
52
|
+
records << MARC::Record.new_from_hash(JSON.parse(json))
|
54
53
|
end
|
55
54
|
rescue StopIteration
|
56
55
|
end
|
data/lib/traject/ndj_reader.rb
CHANGED
@@ -8,7 +8,7 @@ require 'zlib'
|
|
8
8
|
|
9
9
|
class Traject::NDJReader
|
10
10
|
include Enumerable
|
11
|
-
|
11
|
+
|
12
12
|
def initialize(input_stream, settings)
|
13
13
|
@settings = settings
|
14
14
|
@input_stream = input_stream
|
@@ -16,16 +16,16 @@ class Traject::NDJReader
|
|
16
16
|
@input_stream = Zlib::GzipReader.new(@input_stream, :external_encoding => "UTF-8")
|
17
17
|
end
|
18
18
|
end
|
19
|
-
|
19
|
+
|
20
20
|
def logger
|
21
21
|
@logger ||= (settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger)
|
22
|
-
end
|
22
|
+
end
|
23
23
|
|
24
24
|
def each
|
25
25
|
unless block_given?
|
26
26
|
return enum_for(:each)
|
27
27
|
end
|
28
|
-
|
28
|
+
|
29
29
|
@input_stream.each_with_index do |json, i|
|
30
30
|
begin
|
31
31
|
yield MARC::Record.new_from_hash(JSON.parse(json))
|
@@ -34,7 +34,7 @@ class Traject::NDJReader
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
end
|
37
|
-
|
37
|
+
|
38
38
|
end
|
39
|
-
|
40
|
-
|
39
|
+
|
40
|
+
|
data/lib/traject/null_writer.rb
CHANGED
@@ -3,7 +3,17 @@
|
|
3
3
|
#
|
4
4
|
# Method to take a string constant name, including :: qualifications, and
|
5
5
|
# look up the actual constant. Looks up relative to current file.
|
6
|
-
#
|
6
|
+
# Respects leading ::. Etc.
|
7
|
+
#
|
8
|
+
# class Something
|
9
|
+
# include Traject::QualifiedConstGet
|
10
|
+
#
|
11
|
+
# def foo
|
12
|
+
# #...
|
13
|
+
# klass = qualified_const_get("Foo::Bar")
|
14
|
+
# #...
|
15
|
+
# end
|
16
|
+
# end
|
7
17
|
module Traject::QualifiedConstGet
|
8
18
|
|
9
19
|
|
@@ -27,4 +37,4 @@ module Traject::QualifiedConstGet
|
|
27
37
|
path.inject(Object) { |ns,name| ns.const_get(name) }
|
28
38
|
end
|
29
39
|
|
30
|
-
end
|
40
|
+
end
|
data/lib/traject/solrj_writer.rb
CHANGED
@@ -1,19 +1,3 @@
|
|
1
|
-
# TODO: THREAD POOL
|
2
|
-
#
|
3
|
-
# 1) Exception handling in threads, what's the right thing to do
|
4
|
-
# 2) General count of failed records in a thread safe way, so we can report
|
5
|
-
# it back from 'close', so process can report it back, and non-zero exit
|
6
|
-
# code can be emited from command-line.
|
7
|
-
# 3) back pressure on thread pool. give it a bounded blocking queue instead,
|
8
|
-
# to make sure thousands of add tasks don't build up, waiting until the end.
|
9
|
-
# or does that even matter? So what if they build up in the queue and only
|
10
|
-
# get taken care of at the end, is that okay? I do emit a warning right now
|
11
|
-
# if it takes more than 60 seconds to process remaining thread pool task queue
|
12
|
-
# at end.
|
13
|
-
# 4) No tests yet that actually test thread pool stuff; additionally, may make
|
14
|
-
# some of the batch tests fail in non-deterministic ways, since batch tests
|
15
|
-
# assume order of add (and our Mock solr server is not thread safe yet!)
|
16
|
-
|
17
1
|
require 'yell'
|
18
2
|
|
19
3
|
require 'traject'
|
@@ -26,7 +10,6 @@ require 'thread' # for Mutex
|
|
26
10
|
|
27
11
|
#
|
28
12
|
# Writes to a Solr using SolrJ, and the SolrJ HttpSolrServer.
|
29
|
-
# (sub-class later for the ConcurrentUpdate server?)
|
30
13
|
#
|
31
14
|
# After you call #close, you can check #skipped_record_count if you want
|
32
15
|
# for an integer count of skipped records.
|
@@ -35,38 +18,64 @@ require 'thread' # for Mutex
|
|
35
18
|
# you may not get a raise immediately after calling #put, you may get it on
|
36
19
|
# a FUTURE #put or #close. You should get it eventually though.
|
37
20
|
#
|
38
|
-
#
|
39
|
-
#
|
40
|
-
#
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
46
|
-
#
|
47
|
-
#
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
#
|
52
|
-
#
|
53
|
-
#
|
54
|
-
#
|
55
|
-
#
|
56
|
-
#
|
57
|
-
#
|
58
|
-
#
|
59
|
-
#
|
60
|
-
#
|
61
|
-
#
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
65
|
-
#
|
66
|
-
#
|
67
|
-
#
|
68
|
-
#
|
69
|
-
#
|
21
|
+
# ## Settings
|
22
|
+
#
|
23
|
+
# * solr.url: Your solr url (required)
|
24
|
+
#
|
25
|
+
# * solrj_writer.server_class_name: Defaults to "HttpSolrServer". You can specify
|
26
|
+
# another Solr Server sub-class, but it has
|
27
|
+
# to take a one-arg url constructor. Maybe
|
28
|
+
# subclass this writer class and overwrite
|
29
|
+
# instantiate_solr_server! otherwise
|
30
|
+
#
|
31
|
+
# * solrj.jar_dir: Custom directory containing all of the SolrJ jars. All
|
32
|
+
# jars in this dir will be loaded. Otherwise,
|
33
|
+
# we load our own packaged solrj jars. This setting
|
34
|
+
# can't really be used differently in the same app instance,
|
35
|
+
# since jars are loaded globally.
|
36
|
+
#
|
37
|
+
# * solrj_writer.parser_class_name: A String name of a class in package
|
38
|
+
# org.apache.solr.client.solrj.impl,
|
39
|
+
# we'll instantiate one with a zero-arg
|
40
|
+
# constructor, and pass it as an arg to setParser on
|
41
|
+
# the SolrServer instance, if present.
|
42
|
+
# NOTE: For contacting a Solr 1.x server, with the
|
43
|
+
# recent version of SolrJ used by default, set to
|
44
|
+
# "XMLResponseParser"
|
45
|
+
#
|
46
|
+
# * solrj_writer.commit_on_close: If true (or string 'true'), send a commit to solr
|
47
|
+
# at end of #process.
|
48
|
+
#
|
49
|
+
# * solrj_writer.batch_size: If non-nil and more than 1, send documents to
|
50
|
+
# solr in batches of solrj_writer.batch_size. If nil/1,
|
51
|
+
# however, an http transaction with solr will be done
|
52
|
+
# per doc. DEFAULT to 100, which seems to be a sweet spot.
|
53
|
+
#
|
54
|
+
# * solrj_writer.thread_pool: Defaults to 1. A thread pool is used for submitting docs
|
55
|
+
# to solr. Set to 0 or nil to disable threading. Set to 1,
|
56
|
+
# there will still be a single bg thread doing the adds. For
|
57
|
+
# very fast Solr servers and very fast indexing processes, may
|
58
|
+
# make sense to increase this value to throw at Solr as fast as it
|
59
|
+
# can catch.
|
60
|
+
#
|
61
|
+
# ## Example
|
62
|
+
#
|
63
|
+
# settings do
|
64
|
+
# provide "writer_class_name", "Traject::SolrJWriter"
|
65
|
+
#
|
66
|
+
# # This is just regular ruby, so don't be afraid to have conditionals!
|
67
|
+
# # Switch on hostname, for test and production server differences
|
68
|
+
# if Socket.gethostname =~ /devhost/
|
69
|
+
# provide "solr.url", "http://my.dev.machine:9033/catalog"
|
70
|
+
# else
|
71
|
+
# provide "solr.url", "http://my.production.machine:9033/catalog"
|
72
|
+
# end
|
73
|
+
#
|
74
|
+
# provide "solrj_writer.parser_class_name", "BinaryResponseParser" # for Solr 4.x
|
75
|
+
# # provide "solrj_writer.parser_class_name", "XMLResponseParser" # For solr 1.x or 3.x
|
76
|
+
#
|
77
|
+
# provide "solrj_writer.commit_on_close", "true"
|
78
|
+
# end
|
70
79
|
class Traject::SolrJWriter
|
71
80
|
# just a tuple of a SolrInputDocument
|
72
81
|
# and a Traject::Indexer::Context it came from
|
@@ -150,7 +159,7 @@ class Traject::SolrJWriter
|
|
150
159
|
|
151
160
|
if settings["solrj_writer.batch_size"].to_i > 1
|
152
161
|
ready_batch = []
|
153
|
-
|
162
|
+
|
154
163
|
batched_queue.add(package)
|
155
164
|
if batched_queue.size >= settings["solrj_writer.batch_size"].to_i
|
156
165
|
batched_queue.drain_to(ready_batch)
|
@@ -164,7 +173,7 @@ class Traject::SolrJWriter
|
|
164
173
|
end
|
165
174
|
end
|
166
175
|
|
167
|
-
@thread_pool.maybe_in_thread_pool { batch_add_document_packages(ready_batch) }
|
176
|
+
@thread_pool.maybe_in_thread_pool { batch_add_document_packages(ready_batch) }
|
168
177
|
end
|
169
178
|
else # non-batched add, add one at a time.
|
170
179
|
@thread_pool.maybe_in_thread_pool { add_one_document_package(package) }
|
@@ -192,7 +201,7 @@ class Traject::SolrJWriter
|
|
192
201
|
# shared state batched_queue in a mutex.
|
193
202
|
def batch_add_document_packages(current_batch)
|
194
203
|
begin
|
195
|
-
a = current_batch.collect {|package| package.solr_document }
|
204
|
+
a = current_batch.collect {|package| package.solr_document }
|
196
205
|
solr_server.add( a )
|
197
206
|
|
198
207
|
$stderr.write "%" if @debug_ascii_progress
|
data/lib/traject/thread_pool.rb
CHANGED
@@ -1,47 +1,47 @@
|
|
1
1
|
module Traject
|
2
2
|
# An abstraction wrapping a threadpool executor in some configuration choices
|
3
|
-
# and other apparatus.
|
3
|
+
# and other apparatus.
|
4
|
+
#
|
5
|
+
# 1) Initialize with chosen pool size -- we create fixed size pools, where
|
6
|
+
# core and max sizes are the same.
|
4
7
|
#
|
5
|
-
# 1) Initialize with chosen pool size -- we create fixed size pools, where
|
6
|
-
# core and max sizes are the same.
|
7
|
-
|
8
8
|
# 2) If initialized with nil for threadcount, no thread pool will actually
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
9
|
+
# be created, and all threadpool-related methods become no-ops. We call this
|
10
|
+
# the nil/null threadpool. A non-nil threadpool requires jruby, but you can
|
11
|
+
# create a null Traject::ThreadPool.new(nil) under MRI without anything
|
12
|
+
# complaining.
|
13
13
|
#
|
14
14
|
# 3) Use the #maybe_in_threadpool method to send blocks to thread pool for
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
15
|
+
# execution -- if no threadpool configured your block will just be
|
16
|
+
# executed in calling thread. Be careful to not refer to any non-local
|
17
|
+
# variables in the block, unless the variable has an object you can
|
18
|
+
# use thread-safely!
|
19
19
|
#
|
20
20
|
# 4) Thread pools are java.util.concurrent.ThreadPoolExecutor, manually created
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
26
|
-
#
|
27
|
-
#
|
21
|
+
# with a work queue that will buffer up to (pool_size*3) tasks. If queue is full,
|
22
|
+
# the ThreadPoolExecutor is set up to use the ThreadPoolExecutor.CallerRunsPolicy,
|
23
|
+
# meaning the block will end up executing in caller's own thread. With the kind
|
24
|
+
# of work we're doing, where each unit of work is small and there are many of them--
|
25
|
+
# the CallerRunsPolicy serves as an effective 'back pressure' mechanism to keep
|
26
|
+
# the work queue from getting too large and exhausting memory, when producers are
|
27
|
+
# faster than consumers.
|
28
28
|
#
|
29
|
-
#
|
30
|
-
#
|
31
|
-
#
|
29
|
+
# 5) Any exceptions raised by pool-executed work are captured accumulated in a thread-safe
|
30
|
+
# manner, and can be re-raised in the thread of your choice by calling
|
31
|
+
# #raise_collected_exception!
|
32
32
|
#
|
33
|
-
#
|
34
|
-
#
|
35
|
-
#
|
36
|
-
#
|
37
|
-
#
|
38
|
-
#
|
33
|
+
# 6) When you are done with the threadpool, you can and must call
|
34
|
+
# #shutdown_and_wait, which will wait for all current queued work
|
35
|
+
# to complete, then return. You can not give any more work to the pool
|
36
|
+
# after you do this. By default it'll wait pretty much forever, which should
|
37
|
+
# be fine. If you never call shutdown, the pool will keep running forever
|
38
|
+
# and not allow your program to exit!
|
39
39
|
#
|
40
|
-
#
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
40
|
+
# 7) We will keep track of total times a block is run in thread pool, and
|
41
|
+
# total elapsed (wall) time of running all blocks, so an average_execution_ms
|
42
|
+
# time can be given. #average_execution_ms may be inaccurate if called when
|
43
|
+
# threads are still executing, as it's not entirely thread safe (may get
|
44
|
+
# an off by one as to total iterations)
|
45
45
|
class ThreadPool
|
46
46
|
attr_reader :pool_size, :label, :queue_capacity
|
47
47
|
|
@@ -60,15 +60,15 @@ module Traject
|
|
60
60
|
rejectedExecutionHandler = java.util.concurrent.ThreadPoolExecutor::CallerRunsPolicy.new
|
61
61
|
|
62
62
|
# keepalive times don't matter, we are setting core and max pool to
|
63
|
-
# same thing, fixed size pool.
|
63
|
+
# same thing, fixed size pool.
|
64
64
|
@thread_pool = java.util.concurrent.ThreadPoolExecutor.new(
|
65
|
-
@pool_size, @pool_size, 0, java.util.concurrent.TimeUnit::MILLISECONDS,
|
65
|
+
@pool_size, @pool_size, 0, java.util.concurrent.TimeUnit::MILLISECONDS,
|
66
66
|
blockingQueue, rejectedExecutionHandler)
|
67
67
|
|
68
|
-
# A thread-safe queue to collect exceptions cross-threads.
|
68
|
+
# A thread-safe queue to collect exceptions cross-threads.
|
69
69
|
# We make it small, we really only need to store the first
|
70
70
|
# exception, we don't care too much about others. But we'll
|
71
|
-
# keep the first 20, why not.
|
71
|
+
# keep the first 20, why not.
|
72
72
|
@async_exception_queue = java.util.concurrent.ArrayBlockingQueue.new(20)
|
73
73
|
end
|
74
74
|
end
|
@@ -101,7 +101,7 @@ module Traject
|
|
101
101
|
# # and would be pointing to a different string now!
|
102
102
|
#
|
103
103
|
# Note, that just makes block-local variables, it doesn't
|
104
|
-
# help you with whether a data structure itself is thread safe.
|
104
|
+
# help you with whether a data structure itself is thread safe.
|
105
105
|
def maybe_in_thread_pool(*args)
|
106
106
|
start_t = Time.now
|
107
107
|
|
@@ -121,7 +121,7 @@ module Traject
|
|
121
121
|
|
122
122
|
# Just for monitoring/debugging purposes, we'll return the work queue
|
123
123
|
# used by the threadpool. Don't recommend you do anything with it, as
|
124
|
-
# the original java.util.concurrent docs make the same recommendation.
|
124
|
+
# the original java.util.concurrent docs make the same recommendation.
|
125
125
|
def queue
|
126
126
|
@thread_pool && @thread_pool.queue
|
127
127
|
end
|
@@ -129,20 +129,20 @@ module Traject
|
|
129
129
|
# thread-safe way of storing an exception, to raise
|
130
130
|
# later in a different thread. We don't guarantee
|
131
131
|
# that we can store more than one at a time, only
|
132
|
-
# the first one recorded may be stored.
|
132
|
+
# the first one recorded may be stored.
|
133
133
|
def collect_exception(e)
|
134
134
|
# offer will silently do nothing if the queue is full, that's fine
|
135
|
-
# with us.
|
135
|
+
# with us.
|
136
136
|
@async_exception_queue.offer(e)
|
137
137
|
end
|
138
138
|
|
139
139
|
# If there's a stored collected exception, raise it
|
140
140
|
# again now. Call this to re-raise exceptions caught in
|
141
|
-
# other threads in the thread of your choice.
|
141
|
+
# other threads in the thread of your choice.
|
142
142
|
#
|
143
143
|
# If you call this method on a ThreadPool initialized with nil
|
144
144
|
# as a non-functioning threadpool -- then this method is just
|
145
|
-
# a no-op.
|
145
|
+
# a no-op.
|
146
146
|
def raise_collected_exception!
|
147
147
|
if @async_exception_queue && e = @async_exception_queue.poll
|
148
148
|
raise e
|
@@ -151,7 +151,7 @@ module Traject
|
|
151
151
|
|
152
152
|
# shutdown threadpool, and wait for all work to complete.
|
153
153
|
# this one is also a no-op if you have a null ThreadPool that
|
154
|
-
# doesn't really have a threadpool at all.
|
154
|
+
# doesn't really have a threadpool at all.
|
155
155
|
#
|
156
156
|
# returns elapsed time in seconds it took to shutdown
|
157
157
|
def shutdown_and_wait
|
@@ -168,4 +168,4 @@ module Traject
|
|
168
168
|
end
|
169
169
|
|
170
170
|
end
|
171
|
-
end
|
171
|
+
end
|