RubyGems - traject - Versions diffs - 0.16.0 → 0.17.0 - Mend

traject 0.16.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

checksums.yaml +7 -0
data/.yardopts +1 -0
data/README.md +183 -191
data/bench/bench.rb +1 -1
data/doc/batch_execution.md +14 -0
data/doc/extending.md +14 -12
data/doc/indexing_rules.md +265 -0
data/lib/traject/command_line.rb +12 -41
data/lib/traject/debug_writer.rb +32 -13
data/lib/traject/indexer.rb +101 -24
data/lib/traject/indexer/settings.rb +18 -17
data/lib/traject/json_writer.rb +32 -11
data/lib/traject/line_writer.rb +6 -6
data/lib/traject/macros/basic.rb +1 -1
data/lib/traject/macros/marc21.rb +17 -13
data/lib/traject/macros/marc21_semantics.rb +27 -25
data/lib/traject/macros/marc_format_classifier.rb +39 -25
data/lib/traject/marc4j_reader.rb +36 -22
data/lib/traject/marc_extractor.rb +79 -75
data/lib/traject/marc_reader.rb +33 -25
data/lib/traject/mock_reader.rb +9 -10
data/lib/traject/ndj_reader.rb +7 -7
data/lib/traject/null_writer.rb +1 -1
data/lib/traject/qualified_const_get.rb +12 -2
data/lib/traject/solrj_writer.rb +61 -52
data/lib/traject/thread_pool.rb +45 -45
data/lib/traject/translation_map.rb +59 -27
data/lib/traject/util.rb +3 -3
data/lib/traject/version.rb +1 -1
data/lib/traject/yaml_writer.rb +1 -1
data/test/debug_writer_test.rb +7 -7
data/test/indexer/each_record_test.rb +4 -4
data/test/indexer/macros_marc21_semantics_test.rb +12 -12
data/test/indexer/macros_marc21_test.rb +10 -10
data/test/indexer/macros_test.rb +1 -1
data/test/indexer/map_record_test.rb +6 -6
data/test/indexer/read_write_test.rb +43 -4
data/test/indexer/settings_test.rb +2 -2
data/test/indexer/to_field_test.rb +8 -8
data/test/marc4j_reader_test.rb +4 -4
data/test/marc_extractor_test.rb +33 -25
data/test/marc_format_classifier_test.rb +3 -3
data/test/marc_reader_test.rb +2 -2
data/test/test_helper.rb +3 -3
data/test/test_support/demo_config.rb +52 -48
data/test/translation_map_test.rb +22 -4
data/test/translation_maps/bad_ruby.rb +2 -2
data/test/translation_maps/both_map.rb +1 -1
data/test/translation_maps/default_literal.rb +1 -1
data/test/translation_maps/default_passthrough.rb +1 -1
data/test/translation_maps/ruby_map.rb +1 -1
metadata +7 -31
data/doc/macros.md +0 -103

data/lib/traject/debug_writer.rb CHANGED

@@ -1,21 +1,40 @@
 require 'traject/line_writer'
-# A writer for Traject::Indexer that outputs each record as a series of
-# lines, prefixed by the id, one for each field and it's values.
-# Multiple values are separated by pipes
+# The Traject::DebugWriter produces a simple, human-readable output format that's
+# also amenable to simple computer processing (e.g., with a simple grep).
+# It's the output format used when you pass the --debug-mode switch to traject on the command line.
 #
-# Applicable settings:
+# Output format is three columns: id, output field, values (multiple
+# values separated by '|'), and looks something like:
 #
-#  - 'output_file' -- the name of the file to output to
-#  - 'output_stream' -- alternately, the IO stream
-#  - 'debug_writer.idfield' -- the solr field from which to pull the record ID (default: 'id')
-#  - 'debug_writer.format'  -- How to format the id/solr field/values (default: '%-12s %-25s %s')
+#     000001580    edition                   [1st ed.]
+#     000001580    format                    Book | Online | Print
+#     000001580    geo                       Great Britain
+#     000001580    id                        000001580
+#     000001580    isbn                      0631126902
+#
+# ## Settings
+#
+#  * 'output_file' -- the name of the file to output to (command line -o shortcut).
+#  * 'output_stream' -- alternately, the IO stream
+#  * 'debug_writer.idfield' -- the solr field from which to pull the record ID (default: 'id')
+#  * 'debug_writer.format'  -- How to format the id/solr field/values (default: '%-12s %-25s %s')
+#
+# By default, with neither output_file nor output_stream provided, writes to stdout, which
+# can be useful for debugging diagnosis.
+#
+# ## Example configuration file
+#
+#     require 'traject/debug_writer'
+#
+#     settings do
+#       provide "writer_class_name", "Traject::DebugWriter"
+#       provide "output_file", "out.txt"
+#     end
 class Traject::DebugWriter < Traject::LineWriter
   DEFAULT_FORMAT = '%-12s %-25s %s'
   DEFAULT_IDFIELD = 'id'
   def serialize(context)
     idfield = settings["debug_writer.idfield"] || DEFAULT_IDFIELD
     format  = settings['debug_writer.format']  || DEFAULT_FORMAT
@@ -23,6 +42,6 @@ class Traject::DebugWriter < Traject::LineWriter
     lines = h.keys.sort.map {|k| format % [h[idfield].first, k, h[k].join(' | ')] }
     lines.push "\n"
     lines.join("\n")
-  end
+  end
-end
+end

data/lib/traject/indexer.rb CHANGED

@@ -11,8 +11,38 @@ require 'traject/solrj_writer'
 require 'traject/macros/marc21'
 require 'traject/macros/basic'
+# This class does indexing for traject: Getting input records from a Reader
+# class, mapping the input records to an output hash, and then sending the output
+# hash off somewhere (usually Solr) with a Writer class.
+#
+# Traject config files are `instance_eval`d in an Indexer object, so `self` in
+# a config file is an Indexer, and any Indexer methods can be called.
+#
+# However, certain Indexer methods exist almost entirely for the purpose of
+# being called in config files; these methods are part of the expected
+# Domain-Specific Language ("DSL") for config files, and will ordinarily
+# form the bulk or entirety of config files:
+#
+# * #settings
+# * #to_field
+# * #each_record
+# * #after_processing
+# * #logger (rarely used in config files, but in some cases to set up custom logging config)
+#
+# If accessing a Traject::Indexer programmatically (instead of via command line with
+# config files), additional methods of note include:
+#
+#     # to process a stream of input records from configured Reader,
+#     # to configured Writer:
+#     indexer.process(io_stream)
+#
+#     # To map a single input record manually to an output_hash,
+#     # ignoring Readers and Writers
+#     hash = indexer.map_record(record)
 #
-#  == Readers and Writers
+#
+#  ## Readers and Writers
 #
 #  The Indexer has a modularized architecture for readers and writers, for where
 #  source records come from (reader), and where output is sent to (writer).
@@ -73,28 +103,38 @@ class Traject::Indexer
   def initialize(arg_settings = {})
     @settings = Settings.new(arg_settings)
     @index_steps = []
+    @after_processing_steps = []
   end
-  # The Indexer's settings are a hash of key/values -- not
-  # nested, just one level -- of configuration settings. Keys
-  # are strings.
+  # Part of the config file DSL, for writing settings values.
+  #
+  # The Indexer's settings consist of a hash-like Traject::Settings
+  # object. The settings hash is *not*  nested hashes, just one level
+  # of configuration settings. Keys are always strings, and by convention
+  # use "." for namespacing, eg `log.file`
   #
-  # The settings method with no arguments returns that hash.
+  # The settings method with no arguments returns that Settings object.
   #
   # With a hash and/or block argument, can be used to set
   # new key/values. Each call merges onto the existing settings
-  # hash.
+  # hash.  The block is `instance_eval`d in the context
+  # of the Traject::Settings object.
   #
   #    indexer.settings("a" => "a", "b" => "b")
   #
   #    indexer.settings do
-  #      store "b", "new b"
+  #      provide "b", "new b"
   #    end
   #
   #    indexer.settings #=> {"a" => "a", "b" => "new b"}
   #
-  # even with arguments, returns settings hash too, so can
-  # be chained.
+  # Note the #provide method is defined on Traject::Settings to
+  # write to a setting only if previously not set. You can also
+  # use #store to force overwriting even if the setting already exists.
+  #
+  # Even with arguments, Indexer#settings returns the Settings object
+  # too, so that method calls can be chained.
+  #
   def settings(new_settings = nil, &block)
     @settings.merge!(new_settings) if new_settings
@@ -103,6 +143,24 @@ class Traject::Indexer
     return @settings
   end
+  # Part of DSL, used to define an indexing mapping. Register logic
+  # to be called for each record, and generate values for a particular
+  # output field.
+  def to_field(field_name, aLambda = nil, &block)
+    @index_steps << ToFieldStep.new(field_name, aLambda, block, Traject::Util.extract_caller_location(caller.first) )
+  end
+  # Part of DSL, register logic to be called for each record
+  def each_record(aLambda = nil, &block)
+    @index_steps << EachRecordStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first) )
+  end
+  # Part of DSL, register logic to be called once at the end
+  # of processing a stream of records.
+  def after_processing(aLambda = nil, &block)
+    @after_processing_steps << AfterProcessingStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first))
+  end
   def logger
     @logger ||= create_logger
   end
@@ -149,20 +207,6 @@ class Traject::Indexer
     return logger
   end
-  # Used to define an indexing mapping.
-  def to_field(field_name, aLambda = nil, &block)
-    @index_steps << ToFieldStep.new(field_name, aLambda, block, Traject::Util.extract_caller_location(caller.first) )
-  end
-  def each_record(aLambda = nil, &block)
-    @index_steps << EachRecordStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first) )
-  end
   # Processes a single record according to indexing rules set up in
   # this indexer. Returns the output hash (a hash whose keys are
   # string fields, and values are arrays of one or more values in that field)
@@ -293,7 +337,7 @@ class Traject::Indexer
       # of having it be bound to the original variable in a non-threadsafe way.
       # This is confusing, I might not be understanding things properly, but that's where i am.
       #thread_pool.maybe_in_thread_pool &make_lambda(count, record, writer)
-      thread_pool.maybe_in_thread_pool do
+      thread_pool.maybe_in_thread_pool(record, settings, position) do |record, settings, position|
         context = Context.new(:source_record => record, :settings => settings, :position => position)
         context.logger = logger
         map_to_context!(context)
@@ -317,6 +361,15 @@ class Traject::Indexer
     writer.close if writer.respond_to?(:close)
+    @after_processing_steps.each do |step|
+      begin
+        step.execute
+      rescue Exception => e
+        logger.fatal("Unexpected exception #{e} when executing #{step}")
+        raise e
+      end
+    end
     elapsed        = Time.now - start_time
     avg_rps        = (count / elapsed)
     logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
@@ -513,6 +566,30 @@ class Traject::Indexer
   end
+  # A class representing a block of logic called after
+  # processing, registered with #after_processing
+  class AfterProcessingStep
+    attr_accessor :lambda, :block, :source_location
+    def initialize(lambda, block, source_location)
+      self.lambda = lambda
+      self.block = block
+      self.source_location = source_location
+    end
+    # after_processing steps get no args yielded to
+    # their blocks, they just are what they are.
+    def execute
+      [lambda, block].each do |aProc|
+        next unless aProc
+        aProc.call
+      end
+    end
+    def inspect
+      "(after_processing at #{self.source_location}"
+    end
+  end

data/lib/traject/indexer/settings.rb CHANGED

@@ -1,22 +1,23 @@
 require 'hashie'
-# A Hash of settings for a Traject::Indexer, which also ends up passed along
-# to other objects Traject::Indexer interacts with.
-#
-# Enhanced with a few features from Hashie, to make it for
-# instance string/symbol indifferent
-#
-# #provide(key, value) is added, to do like settings[key] ||= value,
-# set only if not already set (but unlike ||=, nil or false can count as already set)
-#
-# Also has an interesting 'defaults' system, meant to play along
-# with configuration file 'provide' statements. There is a built-in hash of
-# defaults, which will be lazily filled in if accessed and not yet
-# set. (nil can count as set, though!).  If they haven't been lazily
-# set yet, then #provide will still fill them in. But you can also call
-# fill_in_defaults! to fill all defaults in, if you know configuration
-# files have all been loaded, and want to fill them in for inspection.
 class Traject::Indexer
+  # A Hash of settings for a Traject::Indexer, which also ends up passed along
+  # to other objects Traject::Indexer interacts with.
+  #
+  # Enhanced with a few features from Hashie, to make it for
+  # instance string/symbol indifferent
+  #
+  # method #provide(key, value) is added, to do like settings[key] ||= value,
+  # set only if not already set (but unlike ||=, nil or false can count as already set)
+  #
+  # Also has an interesting 'defaults' system, meant to play along
+  # with configuration file 'provide' statements. There is a built-in hash of
+  # defaults, which will be lazily filled in if accessed and not yet
+  # set. (nil can count as set, though!).  If they haven't been lazily
+  # set yet, then #provide will still fill them in. But you can also call
+  # fill_in_defaults! to fill all defaults in, if you know configuration
+  # files have all been loaded, and want to fill them in for inspection.
   class Settings < Hash
     include Hashie::Extensions::MergeInitializer # can init with hash
     include Hashie::Extensions::IndifferentAccess
@@ -80,4 +81,4 @@ class Traject::Indexer
       end.inspect
     end
   end
-end
+end

data/lib/traject/json_writer.rb CHANGED

@@ -1,21 +1,42 @@
 require 'json'
 require 'traject/line_writer'
-# A writer for Traject::Indexer, that just writes out
-# all the output as Json. It's newline delimitted json, but
-# right now no checks to make sure there is no internal newlines
-# as whitespace in the json. TODO, add that.
+# The JsonWriter outputs one JSON hash per record, separated by newlines.
+#
+# It's newline delimited json, which should be suitable for being
+# read by simple NDJ readers. (TODO: We have no checks right now to
+# make sure the standard json serializers we're using don't put any
+# internal newlines as whitespace in the json. Which would break NDJ
+# reading. Should we?)
 #
 # Should be thread-safe (ie, multiple worker threads can be calling #put
-# concurrently), by wrapping write to actual output file in a mutex synchronize.
+# concurrently), because output to file is wrapped in a mutex synchronize.
 # This does not seem to affect performance much, as far as I could tell
 # benchmarking.
 #
-# You can force pretty-printing with setting 'json_writer.pretty_print' of boolean
-# true or string 'true'.  Useful mostly for human checking of output.
+# ## Settings
+#
+# * output_file A filename to send output; default will use stdout.
+#
+# * json_writer.pretty_print: [default: false]: Pretty-print (e.g., include newlines, indentation, etc.)
+# each JSON record instead of just mashing it all together on one line. The default, no pretty-printing option
+# produces one record per line, easy to process with another program.
+#
+# ## Example output
+#
+# Without pretty printing, you end up with something like this (just two records shown):
+#
+#     {"id":["000001118"],"oclc":["ocm00085737"],"sdrnum":["sdr-nrlf.b170195454"],"isbn":["0137319924"],"lccn":["73120791"],"mainauthor":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"author2":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"authorSort":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author_top":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Edited by David A. Hamburg.","Hamburg, David A., 1925- ed."],"title":["Psychiatry as a behavioral science."],"title_a":["Psychiatry as a behavioral science."],"title_ab":["Psychiatry as a behavioral science."],"title_c":["Edited by David A. Hamburg."],"titleSort":["Psychiatry as a behavioral science"],"title_top":["Psychiatry as a behavioral science."],"title_rest":["A Spectrum book"],"series2":["A Spectrum book"],"callnumber":["RC327 .B41"],"broad_subject":["Medicine"],"pubdate":[1970],"format":["Book","Online","Print"],"publisher":["Prentice-Hall"],"language":["English"],"language008":["eng"],"editor":["David A. Hamburg."]}
+#     {"id":["000000794"],"oclc":["ocm00067181"],"lccn":["78011026"],"mainauthor":["Clark, Albert Curtis, 1859-1937."],"author":["Clark, Albert Curtis, 1859-1937."],"authorSort":["Clark, Albert Curtis, 1859-1937."],"author_top":["Clark, Albert Curtis, 1859-1937."],"title":["The descent of manuscripts.","descent of manuscripts."],"title_a":["The descent of manuscripts.","descent of manuscripts."],"title_ab":["The descent of manuscripts.","descent of manuscripts."],"titleSort":["descent of manuscripts"],"title_top":["The descent of manuscripts."],"callnumber":["PA47 .C45 1970"],"broad_subject":["Language & Literature"],"pubdate":[1918],"format":["Book","Online","Print"],"publisher":["Clarendon Press"],"language":["English"],"language008":["eng"]}
+#
+# ## Example configuration file
+#
+#     require 'traject/json_writer'
 #
-# Output will be sent to settings["output_file"] string path, or else
-# settings["output_stream"] (ruby IO object), or else stdout.
+#     settings do
+#       provide "writer_class_name", "Traject::JsonWriter"
+#       provide "output_file", "out.json"
+#     end
 class Traject::JsonWriter < Traject::LineWriter
   def serialize(context)
@@ -25,6 +46,6 @@ class Traject::JsonWriter < Traject::LineWriter
     else
       JSON.generate(hash)
     end
-  end
+  end
-end
+end

data/lib/traject/line_writer.rb CHANGED

@@ -1,19 +1,19 @@
 require 'thread'
 # A writer for Traject::Indexer, that just writes out
-# all the output as serialized text with #puts.
+# all the output as serialized text with #puts.
 #
 # Should be thread-safe (ie, multiple worker threads can be calling #put
 # concurrently), by wrapping write to actual output file in a mutex synchronize.
 # This does not seem to effect performance much, as far as I could tell
 # benchmarking.
 #
-# Output will be sent to settings["output_file"] string path, or else
-# settings["output_stream"] (ruby IO object), or else stdout.
+# Output will be sent to `settings["output_file"]` string path, or else
+# `settings["output_stream"]` (ruby IO object), or else stdout.
 #
 # This class can be sub-classed to write out different serialized
 # representations -- subclasses will just override the #serialize
-# method. For instance, see JsonWriter.
+# method. For instance, see JsonWriter.
 class Traject::LineWriter
   attr_reader :settings
   attr_reader :write_mutex
@@ -29,7 +29,7 @@ class Traject::LineWriter
   def serialize(context)
     context.output_hash
-  end
+  end
   def put(context)
     serialized = serialize(context)
@@ -56,4 +56,4 @@ class Traject::LineWriter
     @output_file.close unless (@output_file.nil? || @output_file.tty?)
   end
-end
+end

data/lib/traject/macros/basic.rb CHANGED

@@ -6,4 +6,4 @@ module Traject::Macros
       end
     end
   end
-end
+end

data/lib/traject/macros/marc21.rb CHANGED

@@ -20,29 +20,33 @@ module Traject::Macros
     # and others. By default, will de-duplicate results, but see :allow_duplicates
     #
     # * :first => true: take only first value
+    #
     # * :translation_map => String: translate with named translation map looked up in load
 #       path, uses Traject::TranslationMap.new(translation_map_arg)
+    #
     # * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
     #     have shown themselves useful with Marc, using Marc21.trim_punctuation
+    #
     # * :default => String: if otherwise empty, add default value
+    #
     # * :allow_duplicates => boolean, default false, if set to true then will avoid
     #       de-duplicating the result array (array.uniq!)
     #
     #
     # Examples:
     #
-    # to_field("title"), extract_marc("245abcd", :trim_punctuation => true)
-    # to_field("id"),    extract_marc("001", :first => true)
-    # to_field("geo"),   extract_marc("040a", :separator => nil, :translation_map => "marc040")
+    #     to_field("title"), extract_marc("245abcd", :trim_punctuation => true)
+    #     to_field("id"),    extract_marc("001", :first => true)
+    #     to_field("geo"),   extract_marc("040a", :separator => nil, :translation_map => "marc040")
     def extract_marc(spec, options = {})
       # Raise an error if there are any invalid options, indicating a
       # misspelled or illegal option, using a string instead of a symbol, etc.
       unless (options.keys - EXTRACT_MARC_VALID_OPTIONS).empty?
         raise RuntimeError.new("Illegal/Unknown argument '#{(options.keys - EXTRACT_MARC_VALID_OPTIONS).join(', ')}' in extract_marc at #{Traject::Util.extract_caller_location(caller.first)}")
       end
       only_first              = options.delete(:first)
       trim_punctuation        = options.delete(:trim_punctuation)
       default_value           = options.delete(:default)
@@ -53,12 +57,12 @@ module Traject::Macros
       # ones, and not have to create a new one per-execution.
       #
       # Benchmarking shows for MarcExtractor at least, there is
-      # significant performance advantage.
+      # significant performance advantage.
       if translation_map_arg  = options.delete(:translation_map)
         translation_map = Traject::TranslationMap.new(translation_map_arg)
       end
       extractor = Traject::MarcExtractor.new(spec, options)
@@ -76,7 +80,7 @@ module Traject::Macros
         if trim_punctuation
           accumulator.collect! {|s| Marc21.trim_punctuation(s)}
         end
         unless allow_duplicates
           accumulator.uniq!
         end
@@ -84,14 +88,14 @@ module Traject::Macros
         if default_value && accumulator.empty?
           accumulator << default_value
         end
       end
     end
     #  A list of symbols that are valid keys in the options hash
-    EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,
-                                  :allow_duplicates, :separator, :translation_map,
+    EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,
+                                  :allow_duplicates, :separator, :translation_map,
                                   :alternate_script]
     # Serializes complete marc record to a serialization format.
     # required param :format,
     # serialize_marc(:format => :binary)