RubyGems - traject - Versions diffs - 0.9.1 → 0.10.0 - Mend

traject 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

data/.travis.yml +7 -0
data/Gemfile +5 -1
data/README.md +65 -17
data/bench/bench.rb +30 -0
data/bin/traject +4 -169
data/doc/batch_execution.md +177 -0
data/doc/extending.md +182 -0
data/doc/other_commands.md +49 -0
data/doc/settings.md +6 -2
data/lib/traject.rb +1 -0
data/lib/traject/command_line.rb +296 -0
data/lib/traject/debug_writer.rb +28 -0
data/lib/traject/indexer.rb +84 -20
data/lib/traject/indexer/settings.rb +9 -1
data/lib/traject/json_writer.rb +15 -38
data/lib/traject/line_writer.rb +59 -0
data/lib/traject/macros/marc21.rb +10 -5
data/lib/traject/macros/marc21_semantics.rb +57 -25
data/lib/traject/marc4j_reader.rb +9 -26
data/lib/traject/marc_extractor.rb +121 -48
data/lib/traject/mock_reader.rb +87 -0
data/lib/traject/mock_writer.rb +34 -0
data/lib/traject/solrj_writer.rb +1 -22
data/lib/traject/util.rb +107 -1
data/lib/traject/version.rb +1 -1
data/lib/traject/yaml_writer.rb +9 -0
data/test/debug_writer_test.rb +38 -0
data/test/indexer/each_record_test.rb +27 -2
data/test/indexer/macros_marc21_semantics_test.rb +12 -1
data/test/indexer/settings_test.rb +9 -2
data/test/indexer/to_field_test.rb +35 -5
data/test/marc4j_reader_test.rb +3 -0
data/test/marc_extractor_test.rb +94 -20
data/test/test_support/demo_config.rb +6 -3
data/traject.gemspec +1 -2
metadata +17 -20

data/lib/traject/debug_writer.rb ADDED

@@ -0,0 +1,28 @@
+require 'traject/line_writer'
+# A writer for Traject::Indexer that outputs each record as a series of
+# lines, prefixed by the id, one for each field and its values.
+# Multiple values are separated by pipes
+#
+# Applicable settings:
+#
+#  - 'output_file' -- the name of the file to output to
+#  - 'output_stream' -- alternately, the IO stream
+#  - 'debug_writer.idfield' -- the solr field from which to pull the record ID (default: 'id')
+#  - 'debug_writer.format'  -- How to format the id/solr field/values (default: '%-12s %-25s %s')
+class Traject::DebugWriter < Traject::LineWriter
+  DEFAULT_FORMAT = '%-12s %-25s %s'
+  DEFAULT_IDFIELD = 'id'
+  def serialize(context)
+    idfield = settings["debug_writer.idfield"] || DEFAULT_IDFIELD
+    format  = settings['debug_writer.format']  || DEFAULT_FORMAT
+    h = context.output_hash
+    lines = h.keys.sort.map {|k| format % [h[idfield].first, k, h[k].join(' | ')] }
+    lines.push "\n"
+    lines.join("\n")
+  end
+end

data/lib/traject/indexer.rb CHANGED

@@ -50,6 +50,13 @@ require 'traject/macros/basic'
 #  with a String name of class meeting the Writer contract.
 #
 class Traject::Indexer
+  # Arity error on a passed block
+  class ArityError < ArgumentError; end
+  class NamingError < ArgumentError; end
   include Traject::QualifiedConstGet
   attr_writer :reader_class, :writer_class
@@ -143,20 +150,13 @@ class Traject::Indexer
   end
   # Used to define an indexing mapping.
   def to_field(field_name, aLambda = nil, &block)
-    if field_name.nil? || field_name.empty?
-      raise ArgumentError.new("to_field requires a non-blank first argument, field name")
-    end
-    [aLambda, block].each do |proc|
-      # allow negative arity, meaning variable/optional, trust em on that.
-      # but for positive arrity, we need 2 or 3 args
-      if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
-        raise ArgumentError.new("block/proc given to to_field needs 2 or 3 arguments: #{proc}")
-      end
-    end
+    verify_to_field_arguments(field_name, aLambda, block)
     @index_steps << {
       :field_name => field_name.to_s,
@@ -168,15 +168,7 @@ class Traject::Indexer
   end
   def each_record(aLambda = nil, &block)
-    # arity check
-    [aLambda, block].each do |proc|
-      # allow negative arity, meaning variable/optional, trust em on that.
-      # but for positive arrity, we need 1 or 2 args
-      if proc && (proc.arity == 0 || proc.arity > 2)
-        raise ArgumentError.new("block/proc given to to_field needs 1 or 2 arguments: #{proc}")
-      end
-    end
+    verify_each_record_arguments(aLambda, block)
     @index_steps << {
       :lambda => aLambda,
       :block  => block,
@@ -394,6 +386,78 @@ class Traject::Indexer
   end
+  # Verify that the field name is good, and throw a useful error if not
+  def verify_field_name(field_name)
+    if field_name.nil? || !field_name.is_a?(String) || field_name.empty?
+      raise NamingError.new("to_field requires the field name (String) as the first argument (#{last_named_step.message})")
+    end
+  end
+  # Verify the various, increasingly-complex things that can be sent to to_field
+  # to make sure it's all kosher.
+  #
+  # "Modification" takes place for zero-argument blocks that return a lambda
+  def verify_to_field_arguments(field_name, aLambda, block)
+    verify_field_name(field_name)
+    [aLambda, block].each do |proc|
+      # allow negative arity, meaning variable/optional, trust em on that.
+      # but for positive arity, we need 2 or 3 args
+      if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
+        raise ArityError.new("error parsing field '#{field_name}': block/proc given to to_field needs 2 or 3 (or variable) arguments: #{proc} (#{last_named_step.message})")
+      end
+    end
+  end
+  # Verify the procs sent to each_record to make sure it's all kosher.
+  def verify_each_record_arguments(aLambda, block)
+    unless aLambda or block
+      raise ArgumentError.new("Missing Argument: each_record must take a block/lambda as an argument (#{last_named_step.message})")
+    end
+    [aLambda, block].each do |proc|
+      # allow negative arity, meaning variable/optional, trust em on that.
+      # but for positive arity, we need 1 or 2 args
+      if proc
+        unless proc.is_a?(Proc)
+          raise NamingError.new("argument to each_record must be a block/lambda, not a #{proc.class} (#{last_named_step.message})")
+        end
+        if (proc.arity == 0 || proc.arity > 2)
+          raise ArityError.new("block/proc given to each_record needs 1 or 2 arguments: #{proc} (#{last_named_step.message})")
+        end
+      end
+    end
+  end
+  def last_named_step
+    return LastNamedStep.new(@index_steps)
+  end
+  # A convenient way to find, and generate error messages for, the last named step (for helping locate parse errors)
+  class LastNamedStep
+    attr_accessor :step, :message
+    # Get the last step for which we have a field_name (e.g., the last to_field, skipping over each_record)
+    def initialize(index_steps)
+      @step = index_steps.reverse_each.find{|step| step[:field_name]}
+      if @step
+        @message = "last successfully parsed field was '#{@step[:field_name]}'"
+      else
+        @message = "there were no previous named fields successfully parsed"
+      end
+    end
+  end
   # Represents the context of a specific record being indexed, passed
   # to indexing logic blocks
   #

data/lib/traject/indexer/settings.rb CHANGED

@@ -1,7 +1,7 @@
 require 'hashie'
 # A Hash of settings for a Traject::Indexer, which also ends up passed along
-# to other objects Traject::Indexer interacts with.
+# to other objects Traject::Indexer interacts with.
 #
 # Enhanced with a few features from Hashie, to make it for
 # instance string/symbol indifferent
@@ -71,5 +71,13 @@ class Traject::Indexer
       "processing_thread_pool"    => 3
       }
     end
+    def inspect
+      # Keep any key ending in password out of the inspect
+      self.inject({}) do |hash, (key, value)|
+        hash[key] = (key =~ /password\Z/) ? "[hidden]" : value
+        hash
+      end.inspect
+    end
   end
 end

data/lib/traject/json_writer.rb CHANGED

@@ -1,53 +1,30 @@
 require 'json'
+require 'traject/line_writer'
 # A writer for Traject::Indexer, that just writes out
 # all the output as Json. It's newline delimited json, but
 # right now no checks to make sure there is no internal newlines
-# as whitespace in the json. TODO, add that.
+# as whitespace in the json. TODO, add that.
 #
-# Not currently thread-safe (have to make sure whole object and newline
-# get written without context switch. Can be made so.)
+# Should be thread-safe (ie, multiple worker threads can be calling #put
+# concurrently), by wrapping write to actual output file in a mutex synchronize.
+# This does not seem to affect performance much, as far as I could tell
+# benchmarking.
 #
 # You can force pretty-printing with setting 'json_writer.pretty_print' of boolean
-# true or string 'true'.  Useful mostly for human checking of output.
+# true or string 'true'.  Useful mostly for human checking of output.
 #
 # Output will be sent to settings["output_file"] string path, or else
-# settings["output_stream"] (ruby IO object), or else stdout.
-class Traject::JsonWriter
-  attr_reader :settings
+# settings["output_stream"] (ruby IO object), or else stdout.
+class Traject::JsonWriter < Traject::LineWriter
-  def initialize(argSettings)
-    @settings = argSettings
-  end
-  def put(context)
+  def serialize(context)
     hash = context.output_hash
-    serialized =
-      if settings["json_writer.pretty_print"]
-        JSON.pretty_generate(hash)
-      else
-        JSON.generate(hash)
-      end
-    output_file.puts(serialized)
-  end
-  def output_file
-    unless defined? @output_file
-      @output_file =
-        if settings["output_file"]
-          File.open(settings["output_file"], 'w:UTF-8')
-        elsif settings["output_stream"]
-          settings["output_stream"]
-        else
-          $stdout
-        end
+    if settings["json_writer.pretty_print"]
+      JSON.pretty_generate(hash)
+    else
+      JSON.generate(hash)
     end
-    return @output_file
-  end
-  def close
-    @output_file.close unless (@output_file.nil? || @output_file.tty?)
-  end
+  end
 end

data/lib/traject/line_writer.rb ADDED

@@ -0,0 +1,59 @@
+require 'thread'
+# A writer for Traject::Indexer, that just writes out
+# all the output as serialized text with #puts.
+#
+# Should be thread-safe (ie, multiple worker threads can be calling #put
+# concurrently), by wrapping write to actual output file in a mutex synchronize.
+# This does not seem to affect performance much, as far as I could tell
+# benchmarking.
+#
+# Output will be sent to settings["output_file"] string path, or else
+# settings["output_stream"] (ruby IO object), or else stdout.
+#
+# This class can be sub-classed to write out different serialized
+# representations -- subclasses will just override the #serialize
+# method. For instance, see JsonWriter.
+class Traject::LineWriter
+  attr_reader :settings
+  attr_reader :write_mutex
+  def initialize(argSettings)
+    @settings     = argSettings
+    @write_mutex  = Mutex.new
+    # trigger lazy loading now for thread-safety
+    output_file
+  end
+  def serialize(context)
+    context.output_hash
+  end
+  def put(context)
+    serialized = serialize(context)
+    write_mutex.synchronize do
+      output_file.puts(serialized)
+    end
+  end
+  def output_file
+    unless defined? @output_file
+      @output_file =
+        if settings["output_file"]
+          File.open(settings["output_file"], 'w:UTF-8')
+        elsif settings["output_stream"]
+          settings["output_stream"]
+        else
+          $stdout
+        end
+    end
+    return @output_file
+  end
+  def close
+    @output_file.close unless (@output_file.nil? || @output_file.tty?)
+  end
+end

data/lib/traject/macros/marc21.rb CHANGED

@@ -35,16 +35,21 @@ module Traject::Macros
       trim_punctuation        = options.delete(:trim_punctuation)
       default_value           = options.delete(:default)
-      # We create the TranslationMap here on load, not inside the closure
-      # where it'll be called for every record. Since TranslationMap is supposed
-      # to cache, prob doesn't matter, but doens't hurt. Also causes any syntax
-      # exceptions to raise on load.
+      # We create the TranslationMap and the MarcExtractor here
+      # on load, so the lambda can just refer to already created
+      # ones, and not have to create a new one per-execution.
+      #
+      # Benchmarking shows for MarcExtractor at least, there is
+      # significant performance advantage.
       if translation_map_arg  = options.delete(:translation_map)
         translation_map = Traject::TranslationMap.new(translation_map_arg)
       end
+      extractor = Traject::MarcExtractor.new(spec, options)
       lambda do |record, accumulator, context|
-        accumulator.concat Traject::MarcExtractor.extract_by_spec(record, spec, options)
+        accumulator.concat extractor.extract(record)
         if only_first
           Marc21.first! accumulator

data/lib/traject/macros/marc21_semantics.rb CHANGED

@@ -11,19 +11,30 @@ module Traject::Macros
     # shortcut
     MarcExtractor = Traject::MarcExtractor
-    # Extract OCLC numbers from, by default 035a's, then strip known prefixes to get
+    # Extract OCLC numbers from, by default, 035a's: keep only values with a known prefix, strip the prefix to get
     # just the num, and de-dup.
     def oclcnum(extract_fields = "035a")
+      extractor = MarcExtractor.new(extract_fields, :seperator => nil)
       lambda do |record, accumulator|
-        list = MarcExtractor.extract_by_spec(record, extract_fields, :seperator => nil).collect! do |o|
-          Marc21Semantics.oclcnum_trim(o)
-        end
+        list = extractor.extract(record).collect! do |o|
+          Marc21Semantics.oclcnum_extract(o)
+        end.compact
         accumulator.concat list.uniq if list
       end
     end
-    def self.oclcnum_trim(num)
-      num.gsub(/\A(ocm)|(ocn)|(on)|(\(OCoLC\))/, '')
+    # If a num begins with a known OCLC prefix, return it without the prefix.
+    # otherwise nil.
+    def self.oclcnum_extract(num)
+      stripped = num.gsub(/\A(ocm)|(ocn)|(on)|(\(OCoLC\))/, '')
+      if num != stripped
+        # it had the prefix, which we've now stripped
+        return stripped
+      else
+        # it didn't have the prefix
+        return nil
+      end
     end
@@ -47,12 +58,13 @@ module Traject::Macros
         accumulator << Marc21Semantics.get_sortable_author(record)
       end
     end
     def self.get_sortable_author(record)
-      onexx = MarcExtractor.extract_by_spec(record, "100:110:111", :first => true).first
+      onexx = MarcExtractor.cached("100:110:111", :first => true).extract(record).first
       onexx = onexx.strip if onexx
       titles = []
-      MarcExtractor.new(record, "240:245", :first => true).each_matching_line do |field, spec|
+      MarcExtractor.cached("240:245", :first => true).each_matching_line(record) do |field, spec|
         non_filing = field.indicator2.to_i
         str = field.subfields.collect {|sf| sf.value}.join(" ")
@@ -72,8 +84,9 @@ module Traject::Macros
         accumulator << Marc21Semantics.get_sortable_title(record)
       end
     end
     def self.get_sortable_title(record)
-      MarcExtractor.new(record, "245ab").collect_matching_lines do |field, spec, extractor|
+      MarcExtractor.cached("245ab").collect_matching_lines(record) do |field, spec, extractor|
         str = extractor.collect_subfields(field, spec).first
         if str.nil?
@@ -105,8 +118,10 @@ module Traject::Macros
     def marc_languages(spec = "008[35-37]:041a:041d")
       translation_map = Traject::TranslationMap.new("marc_languages")
+      extractor = MarcExtractor.new(spec, :seperator => nil)
       lambda do |record, accumulator|
-        codes = MarcExtractor.new(record, spec, :seperator => "nil").collect_matching_lines do |field, spec, extractor|
+        codes = extractor.collect_matching_lines(record) do |field, spec, extractor|
           if extractor.control_field?(field)
             (spec[:bytes] ? field.value.byteslice(spec[:bytes]) : field.value)
           else
@@ -134,10 +149,12 @@ module Traject::Macros
     # already covered by another field we're including, so we don't want to double count it, possibly
     # with slight variation.
     def marc_series_facet(spec = "440a:490a:800abcdt:810abcdt:811acdeft:830adfgklmnoprst")
+      extractor = MarcExtractor.new(spec)
       lambda do |record, accumulator|
-        MarcExtractor.new(record, spec).collect_matching_lines do |field, spec, extractor|
+        accumulator.concat( extractor.collect_matching_lines(record) do |field, spec, extractor|
           extractor.collect_subfields(field, spec) unless (field.tag == "490" && field.indicator1 == "1")
-        end
+        end.compact)
       end
     end
@@ -149,8 +166,10 @@ module Traject::Macros
     def marc_instrumentation_humanized(spec = "048ab", options = {})
       translation_map = Traject::TranslationMap.new(options[:translation_map] || "marc_instruments")
+      extractor = MarcExtractor.new(spec, :seperator => nil)
       lambda do |record, accumulator|
-        values = Traject::MarcExtractor.extract_by_spec(record, spec, :seperator => nil)
+        values = extractor.extract(record)
         human = values.collect do |value|
           translation_map[ value.slice(0, 2) ]
         end.uniq
@@ -169,9 +188,12 @@ module Traject::Macros
     # codes.
     def marc_instrument_codes_normalized(spec = "048")
       soloist_suffix = ".s"
+      extractor = MarcExtractor.new("048", :seperator => nil)
       return lambda do |record, accumulator|
         accumulator.concat(
-          MarcExtractor.new(record, "048", :seperator => nil).collect_matching_lines do |field, spec, extractor|
+          extractor.collect_matching_lines(record) do |field, spec, extractor|
             values = []
             field.subfields.each do |sf|
@@ -219,7 +241,7 @@ module Traject::Macros
     # See #marc_publication_date. Yeah, this is a holy mess.
     # Maybe it should actually be extracted to its own class!
     def self.publication_date(record, estimate_tolerance = 15, min_year = 500, max_year = (Time.new.year + 6))
-      field008 = MarcExtractor.extract_by_spec(record, "008").first
+      field008 = MarcExtractor.cached("008").extract(record).first
       found_date = nil
       if field008 && field008.length >= 11
@@ -264,7 +286,7 @@ module Traject::Macros
       end
       # Okay, nothing from 008, try 260
       if found_date.nil?
-        v260c = MarcExtractor.extract_by_spec(record, "260c", :seperator => nil).first
+        v260c = MarcExtractor.cached("260c", :seperator => nil).extract(record).first
         # just try to take the first four digits out of there, we're not going to try
         # anything crazy.
         if v260c =~ /(\d{4})/
@@ -298,8 +320,10 @@ module Traject::Macros
       default_value = options.has_key?(:default) ? options[:default] : "Unknown"
       translation_map = Traject::TranslationMap.new("lcc_top_level")
+      extractor = MarcExtractor.new(spec, :seperator => nil)
       lambda do |record, accumulator|
-        candidates = MarcExtractor.extract_by_spec(record, spec, :seperator => nil)
+        candidates = extractor.extract(record)
         candidates.reject! do |candidate|
           !(candidate =~ lcc_regex)
@@ -328,10 +352,14 @@ module Traject::Macros
       a_fields_spec = options[:geo_a_fields] || "651a:691a"
       z_fields_spec = options[:geo_z_fields] || "600:610:611:630:648:650:654:655:656:690:651:691"
+      extractor_043a      = MarcExtractor.new("043a", :seperator => nil)
+      extractor_a_fields  = MarcExtractor.new(a_fields_spec, :seperator => nil)
+      extractor_z_fields  = MarcExtractor.new(z_fields_spec)
       lambda do |record, accumulator|
         accumulator.concat(
-          MarcExtractor.extract_by_spec(record, "043a", :seperator => nil).collect do |code|
+          extractor_043a.extract(record).collect do |code|
             # remove any trailing hyphens, then map
             marc_geo_map[code.gsub(/\-+\Z/, '')]
           end.compact
@@ -339,15 +367,15 @@ module Traject::Macros
         #LCSH 651a and 691a go in more or less normally.
         accumulator.concat(
-          MarcExtractor.extract_by_spec(record, a_fields_spec, :seperator => nil).collect do |s|
+          extractor_a_fields.extract(record).collect do |s|
             # remove trailing periods, which they sometimes have if they were
             # at end of LCSH.
             s.sub(/\. */, '')
           end
         )
-        # fields we take z's from have a bit more normalization
-        MarcExtractor.new(record, z_fields_spec).each_matching_line do |field, spec, extractor|
+        # fields we take z's from have a bit more normalization
+        extractor_z_fields.each_matching_line(record) do |field, spec, extractor|
           z_fields = field.subfields.find_all {|sf| sf.code == "z"}.collect {|sf| sf.value }
           # depending on position in total field, may be a period on the end
           # we want to remove.
@@ -376,17 +404,21 @@ module Traject::Macros
       ordinary_fields_spec = "600y:610y:611y:630y:648ay:650y:654y:656y:690y"
       special_fields_spec = "651:691"
       seperator = ": "
+      extractor_ordinary_fields = MarcExtractor.new(ordinary_fields_spec)
+      extractor_special_fields  = MarcExtractor.new(special_fields_spec)
       lambda do |record, accumulator|
         # straightforward ones
-        accumulator.concat( MarcExtractor.extract_by_spec(record, ordinary_fields_spec).collect do |v|
+        accumulator.concat( extractor_ordinary_fields.extract(record).collect do |v|
           # May have a period we have to remove, if it was at end of tag
           v.sub(/\. *\Z/, '')
         end)
-        # weird ones
-        MarcExtractor.new(record, special_fields_spec).each_matching_line do |field, spec, extractor|
+        # weird ones
+        extractor_special_fields.each_matching_line(record) do |field, spec, extractor|
           field.subfields.each do |sf|
             next unless sf.code == 'y'
             if sf.value =~ /\A\s*.+,\s+(ca.\s+)?\d\d\d\d?(-\d\d\d\d?)?( B\.C\.)?[.,; ]*\Z/
@@ -396,7 +428,7 @@ module Traject::Macros
               accumulator << sf.value.sub(/\. *\Z/, '')
             end
           end
-        end
+        end
       end
     end