RubyGems - traject - Versions diffs - 0.0.2 → 0.9.1 - Mend

traject 0.0.2 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

data/Gemfile +4 -0
data/README.md +85 -61
data/Rakefile +5 -0
data/bin/traject +31 -3
data/doc/settings.md +74 -13
data/lib/tasks/load_maps.rake +48 -0
data/lib/traject/indexer/settings.rb +75 -0
data/lib/traject/indexer.rb +255 -45
data/lib/traject/json_writer.rb +4 -2
data/lib/traject/macros/marc21.rb +18 -6
data/lib/traject/macros/marc21_semantics.rb +405 -0
data/lib/traject/macros/marc_format_classifier.rb +180 -0
data/lib/traject/marc4j_reader.rb +160 -0
data/lib/traject/marc_extractor.rb +33 -17
data/lib/traject/marc_reader.rb +14 -11
data/lib/traject/solrj_writer.rb +247 -9
data/lib/traject/thread_pool.rb +154 -0
data/lib/traject/translation_map.rb +46 -4
data/lib/traject/util.rb +30 -0
data/lib/traject/version.rb +1 -1
data/lib/translation_maps/lcc_top_level.yaml +26 -0
data/lib/translation_maps/marc_genre_007.yaml +9 -0
data/lib/translation_maps/marc_genre_leader.yaml +22 -0
data/lib/translation_maps/marc_geographic.yaml +589 -0
data/lib/translation_maps/marc_instruments.yaml +102 -0
data/lib/translation_maps/marc_languages.yaml +490 -0
data/test/indexer/each_record_test.rb +34 -0
data/test/indexer/macros_marc21_semantics_test.rb +206 -0
data/test/indexer/macros_marc21_test.rb +10 -1
data/test/indexer/map_record_test.rb +78 -8
data/test/indexer/read_write_test.rb +43 -10
data/test/indexer/settings_test.rb +60 -4
data/test/indexer/to_field_test.rb +39 -0
data/test/marc4j_reader_test.rb +75 -0
data/test/marc_extractor_test.rb +62 -0
data/test/marc_format_classifier_test.rb +91 -0
data/test/marc_reader_test.rb +12 -0
data/test/solrj_writer_test.rb +146 -43
data/test/test_helper.rb +50 -0
data/test/test_support/245_no_ab.marc +1 -0
data/test/test_support/880_with_no_6.utf8.marc +1 -0
data/test/test_support/bad_subfield_code.marc +1 -0
data/test/test_support/date_resort_to_260.marc +1 -0
data/test/test_support/date_type_r_missing_date2.marc +1 -0
data/test/test_support/date_with_u.marc +1 -0
data/test/test_support/demo_config.rb +153 -0
data/test/test_support/emptyish_record.marc +1 -0
data/test/test_support/louis_armstrong.marc +1 -0
data/test/test_support/manuscript_online_thesis.marc +1 -0
data/test/test_support/microform_online_conference.marc +1 -0
data/test/test_support/multi_era.marc +1 -0
data/test/test_support/multi_geo.marc +1 -0
data/test/test_support/musical_cage.marc +1 -0
data/test/test_support/one-marc8.mrc +1 -0
data/test/test_support/online_only.marc +1 -0
data/test/test_support/packed_041a_lang.marc +1 -0
data/test/test_support/the_business_ren.marc +1 -0
data/test/translation_map_test.rb +8 -0
data/test/translation_maps/properties_map.properties +5 -0
data/traject.gemspec +1 -1
data/vendor/marc4j/README.md +17 -0
data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
metadata +81 -2

data/lib/traject/indexer.rb CHANGED Viewed

@@ -1,8 +1,11 @@
-require 'hashie'
+require 'yell'
 require 'traject'
 require 'traject/qualified_const_get'
+require 'traject/indexer/settings'
 require 'traject/marc_reader'
+require 'traject/marc4j_reader'
 require 'traject/json_writer'
 require 'traject/solrj_writer'
@@ -28,11 +31,15 @@ require 'traject/macros/basic'
 #
 #
 #  A Writer is any class that:
-#  1) Has a one-argument initializer taking a Settings hash.
+#  1) Has a one-argument initializer taking a Settings hash. (The logger
+#     is provided to the Writer in settings["logger"])
 #  2) Responds to a one argument #put method, where the argument is
-#     a hash of mapped keys/values. The writer should write them
+#     a Traject::Indexer::Context, containing an #output_hash
+#     hash of mapped keys/values. The writer should write them
 #     to the appropriate place.
 #  3) Responds to a #close method, called when we're done.
+#  4) Optionally implements a #skipped_record_count method, returning int count of records
+#     that were skipped due to errors (and presumably logged)
 #
 #  The default writer (will be) the SolrWriter , which is configured
 #  through additional Settings as well. A JsonWriter is also available,
@@ -55,8 +62,9 @@ class Traject::Indexer
   include Traject::Macros::Basic
-  def initialize
-    @settings = Settings.new(self.class.default_settings)
+  # optional hash or Traject::Indexer::Settings object of settings.
+  def initialize(arg_settings = {})
+    @settings = Settings.new(arg_settings)
     @index_steps = []
   end
@@ -88,58 +96,270 @@ class Traject::Indexer
     return @settings
   end
+  def logger
+    @logger ||= create_logger
+  end
+  attr_writer :logger
+  # Just calculates the arg that's gonna be given to Yell.new
+  # or SomeLogger.new
+  def logger_argument
+    specified = settings["log.file"] || "STDERR"
+    case specified
+    when "STDOUT" then STDOUT
+    when "STDERR" then STDERR
+    else specified
+    end
+  end
+  # Second arg to Yell.new, options hash, calculated from
+  # settings
+  def logger_options
+    # formatter, default is fairly basic
+    format = settings["log.format"] || "%d %5L %m"
+    format = case format
+    when "false" then false
+    when "" then nil
+    else format
+    end
+    level = settings["log.level"] || "info"
+    {:format => format, :level => level}
+  end
+  # Create logger according to settings
+  def create_logger
+    # log everything to STDERR or specified logfile
+    logger = Yell.new( logger_argument, logger_options )
+    # ADDITIONALLY log error and higher to....
+    if settings["log.error_file"]
+      logger.adapter :file, settings["log.error_file"], :level => 'gte.error'
+    end
+    return logger
+  end
   # Used to define an indexing mapping.
   def to_field(field_name, aLambda = nil, &block)
+    if field_name.nil? || field_name.empty?
+      raise ArgumentError.new("to_field requires a non-blank first argument, field name")
+    end
+    [aLambda, block].each do |proc|
+      # allow negative arity, meaning variable/optional, trust em on that.
+      # but for positive arity, we need 2 or 3 args
+      if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
+        raise ArgumentError.new("block/proc given to to_field needs 2 or 3 arguments: #{proc}")
+      end
+    end
     @index_steps << {
       :field_name => field_name.to_s,
       :lambda => aLambda,
-      :block  => block
+      :block  => block,
+      :type   => :to_field,
+      :source_location => Traject::Util.extract_caller_location(caller.first)
+    }
+  end
+  def each_record(aLambda = nil, &block)
+    # arity check
+    [aLambda, block].each do |proc|
+      # allow negative arity, meaning variable/optional, trust em on that.
+      # but for positive arity, we need 1 or 2 args
+      if proc && (proc.arity == 0 || proc.arity > 2)
+        raise ArgumentError.new("block/proc given to to_field needs 1 or 2 arguments: #{proc}")
+      end
+    end
+    @index_steps << {
+      :lambda => aLambda,
+      :block  => block,
+      :type   => :each_record,
+      :source_location => Traject::Util.extract_caller_location(caller.first)
     }
   end
-  # Processes a single record, according to indexing rules
-  # set up in this Indexer. Returns a hash whose values are
-  # Arrays, and keys are strings.
+  # Processes a single record according to indexing rules set up in
+  # this indexer. Returns the output hash (a hash whose keys are
+  # string fields, and values are arrays of one or more values in that field)
   #
+  # This is a convenience shortcut for #map_to_context! -- use that one
+  # if you want to provide additional context
+  # like position, and/or get back the full context.
   def map_record(record)
     context = Context.new(:source_record => record, :settings => settings)
+    map_to_context!(context)
+    return context.output_hash
+  end
+  # Maps a single record INTO the given argument, a Traject::Indexer::Context.
+  #
+  # Context must be passed with a #source_record and #settings, and optionally
+  # a #position.
+  #
+  # Context will be mutated by this method, most significantly by adding
+  # an #output_hash, a hash from fieldname to array of values in that field.
+  #
+  # Pass in a context with a set #position if you want that to be available
+  # to mapping routines.
+  #
+  # Returns the context passed in, as a convenience for chaining etc.
+  def map_to_context!(context)
     @index_steps.each do |index_step|
-      accumulator = []
-      field_name  = index_step[:field_name]
-      context.field_name = field_name
-      # Might have a lambda arg AND a block, we execute in order,
-      # with same accumulator.
-      [index_step[:lambda], index_step[:block]].each do |aProc|
-        if aProc
-          case aProc.arity
-          when 1 then aProc.call(record)
-          when 2 then aProc.call(record, accumulator)
-          else        aProc.call(record, accumulator, context)
+      if index_step[:type] == :to_field
+        accumulator = []
+        context.field_name = index_step[:field_name]
+        # Might have a lambda arg AND a block, we execute in order,
+        # with same accumulator.
+        [index_step[:lambda], index_step[:block]].each do |aProc|
+          if aProc
+            log_mapping_errors(context, index_step, aProc) do
+              if aProc.arity == 2
+                aProc.call(context.source_record, accumulator)
+              else
+                aProc.call(context.source_record, accumulator, context)
+              end
+            end
           end
         end
+        (context.output_hash[context.field_name] ||= []).concat accumulator unless accumulator.empty?
+        context.field_name = nil
-      end
+      elsif index_step[:type] == :each_record
+        # one or two arg
+        [index_step[:lambda], index_step[:block]].each do |aProc|
+          if aProc
+            log_mapping_errors(context, index_step, aProc) do
+              if aProc.arity == 1
+                aProc.call(context.source_record)
+              else
+                aProc.call(context.source_record, context)
+              end
+            end
+          end
+        end
-      (context.output_hash[field_name] ||= []).concat accumulator
-      context.field_name = nil
+      else
+        raise ArgumentError.new("An @index_step we don't know how to deal with: #{@index_step}")
+      end
     end
-    return context.output_hash
+    return context
+  end
+  # just a wrapper that captures and records any unexpected
+  # errors raised in mapping, along with contextual information
+  # on record and location in source file of mapping rule.
+  #
+  # Re-raises error at the moment.
+  #
+  # log_mapping_errors(context, index_step, some_lambda) do
+  #    all_sorts_of_stuff # that will have errors logged
+  # end
+  def log_mapping_errors(context, index_step, aProc)
+    begin
+      yield
+    rescue Exception => e
+      msg =  "Unexpected error on record id `#{id_string(context.source_record)}` at file position #{context.position}\n"
+      conf = context.field_name ? "to_field '#{context.field_name}'" : "each_record"
+      msg += "    while executing #{conf} defined at #{index_step[:source_location]}\n"
+      msg += Traject::Util.exception_to_log_message(e)
+      logger.error msg
+      logger.debug "Record: " + context.source_record.to_s
+      raise e
+    end
   end
   # Processes a stream of records, reading from the configured Reader,
   # mapping according to configured mapping rules, and then writing
   # to configured Writer.
+  #
+  # returns 'false' as a signal to command line to return non-zero exit code
+  # for some reason (reason found in logs, presumably). This particular mechanism
+  # is open to complexification, starting simple. We do need SOME way to return
+  # non-zero to command line.
+  #
   def process(io_stream)
+    settings.fill_in_defaults!
+    count      =       0
+    start_time = batch_start_time = Time.now
+    logger.info "beginning Indexer#process with settings: #{settings.inspect}"
     reader = self.reader!(io_stream)
     writer = self.writer!
-    reader.each do |record|
-      writer.put map_record(record)
+    thread_pool = Traject::ThreadPool.new(settings["processing_thread_pool"].to_i)
+    logger.info "   with reader: #{reader.class.name} and writer: #{writer.class.name}"
+    reader.each do |record; position|
+      count += 1
+      # have to use a block local var, so the changing `count` one
+      # doesn't get caught in the closure. Weird, yeah.
+      position = count
+      thread_pool.raise_collected_exception!
+      if settings["debug_ascii_progress"].to_s == "true"
+        $stderr.write "." if count % settings["solrj_writer.batch_size"] == 0
+      end
+      if settings["log.batch_progress"] && (count % settings["log.batch_progress"].to_i == 0)
+        batch_rps = settings["log.batch_progress"].to_i / (Time.now - batch_start_time)
+        overall_rps = count / (Time.now - start_time)
+        logger.info "Traject::Indexer#process, read #{count} records at id:#{id_string(record)}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall"
+        batch_start_time = Time.now
+      end
+      # we have to use this weird lambda to properly "capture" the count, instead
+      # of having it be bound to the original variable in a non-threadsafe way.
+      # This is confusing, I might not be understanding things properly, but that's where i am.
+      #thread_pool.maybe_in_thread_pool &make_lambda(count, record, writer)
+      thread_pool.maybe_in_thread_pool do
+        context = Context.new(:source_record => record, :settings => settings, :position => position)
+        map_to_context!(context)
+        writer.put context
+      end
     end
+    $stderr.write "\n" if settings["debug_ascii_progress"].to_s == "true"
+    logger.debug "Shutting down #processing mapper threadpool..."
+    thread_pool.shutdown_and_wait
+    logger.debug "#processing mapper threadpool shutdown complete."
+    thread_pool.raise_collected_exception!
     writer.close if writer.respond_to?(:close)
+    elapsed        = Time.now - start_time
+    avg_rps        = (count / elapsed)
+    logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
+    if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
+      logger.error "Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
+      return false
+    end
+    return true
   end
   def reader_class
@@ -159,33 +379,21 @@ class Traject::Indexer
   # Instantiate a Traject Reader, using class set
   # in #reader_class, initialized with io_stream passed in
   def reader!(io_stream)
-    return reader_class.new(io_stream, settings)
+    return reader_class.new(io_stream, settings.merge("logger" => logger))
   end
   # Instantiate a Traject Writer, using class set in #writer_class
   def writer!
-    return writer_class.new(settings)
+    return writer_class.new(settings.merge("logger" => logger))
   end
-  def self.default_settings
-    {
-      "reader_class_name" => "Traject::MarcReader",
-      "writer_class_name" => "Traject::SolrJWriter"
-    }
+  # get a printable id from record for error logging.
+  # Maybe override this for a future XML version.
+  def id_string(record)
+    record && record['001'] && record['001'].value.to_s
   end
-  # Enhanced with a few features from Hashie, to make it for
-  # instance string/symbol indifferent
-  class Settings < Hash
-    include Hashie::Extensions::MergeInitializer # can init with hash
-    include Hashie::Extensions::IndifferentAccess
-    # Hashie bug Issue #100 https://github.com/intridea/hashie/pull/100
-    alias_method :store, :indifferent_writer
-  end
   # Represents the context of a specific record being indexed, passed
   # to indexing logic blocks
   #
@@ -203,5 +411,7 @@ class Traject::Indexer
     attr_accessor :clipboard, :output_hash
     attr_accessor :field_name, :source_record, :settings
+    # 1-based position in stream of processed records.
+    attr_accessor :position
   end
 end

data/lib/traject/json_writer.rb CHANGED Viewed

@@ -20,7 +20,9 @@ class Traject::JsonWriter
     @settings = argSettings
   end
-  def put(hash)
+  def put(context)
+    hash = context.output_hash
     serialized =
       if settings["json_writer.pretty_print"]
         JSON.pretty_generate(hash)
@@ -34,7 +36,7 @@ class Traject::JsonWriter
     unless defined? @output_file
       @output_file =
         if settings["output_file"]
-          File.open(settings["output_file"])
+          File.open(settings["output_file"], 'w:UTF-8')
         elsif settings["output_stream"]
           settings["output_stream"]
         else

data/lib/traject/macros/marc21.rb CHANGED Viewed

@@ -18,6 +18,13 @@ module Traject::Macros
     # Second arg is optional options, including options valid on MarcExtractor.new,
     # and others. (TODO)
     #
+    # * :first => true: take only first value
+    # * :translation_map => String: translate with named translation map looked up in load
+    #       path, uses Traject::TranslationMap.new(translation_map_arg)
+    # * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
+    #     have shown themselves useful with Marc, using Marc21.trim_punctuation
+    # * :default => String: if otherwise empty, add default value
+    #
     # Examples:
     #
     # to_field("title"), extract_marc("245abcd", :trim_punctuation => true)
@@ -26,11 +33,12 @@ module Traject::Macros
     def extract_marc(spec, options = {})
       only_first              = options.delete(:first)
       trim_punctuation        = options.delete(:trim_punctuation)
+      default_value           = options.delete(:default)
       # We create the TranslationMap here on load, not inside the closure
       # where it'll be called for every record. Since TranslationMap is supposed
       # to cache, prob doesn't matter, but doesn't hurt. Also causes any syntax
-      # exceptions to raise on load.
+      # exceptions to raise on load.
       if translation_map_arg  = options.delete(:translation_map)
         translation_map = Traject::TranslationMap.new(translation_map_arg)
       end
@@ -49,6 +57,10 @@ module Traject::Macros
         if trim_punctuation
           accumulator.collect! {|s| Marc21.trim_punctuation(s)}
         end
+        if default_value && accumulator.empty?
+          accumulator << default_value
+        end
       end
     end
@@ -97,7 +109,7 @@ module Traject::Macros
     # All fields in from-to must be marc DATA (not control fields), or weirdness
     #
     # Can always run this thing multiple times on the same field if you need
-    # non-contiguous ranges of fields.
+    # non-contiguous ranges of fields.
     def extract_all_marc_values(options = {})
       options = {:from => "100", :to => "899", :seperator => ' '}.merge(options)
@@ -123,15 +135,15 @@ module Traject::Macros
     # pretty simple.
     #
     # Removes
-    # * trailing: comma, slash, semicolon, colon (possibly followed by whitespace)
-    # * trailing period if it is preceded by at least three letters (possibly followed by whitespace)
+    # * trailing: comma, slash, semicolon, colon (possibly preceded and followed by whitespace)
+    # * trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
     # * single square bracket characters if they are the start and/or end
     #   chars and there are no internal square brackets.
     #
     # Returns altered string, doesn't change original arg.
     def self.trim_punctuation(str)
-      str = str.sub(/[ ,\/;:] *\Z/, '')
-      str = str.sub(/(\w\w\w)\. *\Z/, '\1')
+      str = str.sub(/ *[ ,\/;:] *\Z/, '')
+      str = str.sub(/ *(\w\w\w)\. *\Z/, '\1')
       str = str.sub(/\A\[?([^\[\]]+)\]?\Z/, '\1')
       return str
     end