traject 0.13.2 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +15 -9
- data/lib/traject/indexer.rb +158 -157
- data/lib/traject/macros/marc21.rb +30 -0
- data/lib/traject/marc_extractor.rb +39 -21
- data/lib/traject/solrj_writer.rb +1 -1
- data/lib/traject/version.rb +1 -1
- data/test/indexer/each_record_test.rb +4 -4
- data/test/indexer/macros_marc21_test.rb +9 -0
- data/test/indexer/read_write_test.rb +5 -1
- data/test/indexer/to_field_test.rb +10 -23
- data/test/marc_extractor_test.rb +73 -20
- data/traject.gemspec +1 -1
- metadata +2 -1
data/README.md
CHANGED

@@ -13,19 +13,21 @@ them somewhere.
 
 ## Background/Goals
 
-Existing tools for indexing Marc to Solr
+Existing tools for indexing Marc to Solr served us well for many years, and have many features.
+But we were having more and more difficulty with them, including in extending/customizing in maintainable ways.
+We realized that to create a tool with the API (internal and external) we wanted, we could do a better
+job with jruby (ruby on the JVM).
 
-*
-*
+* **Easy to use**, getting started with standard use cases should be easy, even for non-rubyists.
+* **Support customization and flexiblity**, common customization use cases, including simple local
   logic, should be very easy. More sophisticated and even complex customization use cases should still be possible,
   changing just the parts of traject you want to change.
-*
-*
+* **Maintainable local logic**, supporting sharing of reusable logic via ruby gems.
+* **Comprehensible internal logic**; well-covered by tests, well-factored separation of concerns,
   easy for newcomer developers who know ruby to understand the codebase.
-*
-
-
-* *Well-behaved shell script*, for painless integration in batch processes and cronjobs, with
+* **High performance**, using multi-threaded concurrency where appropriate to maximize throughput.
+  traject likely will provide higher throughput than other similar solutions.
+* **Well-behaved shell script**, for painless integration in batch processes and cronjobs, with
   exit codes, sufficiently flexible control of logging, proper use of stderr, etc.
 
 

@@ -167,6 +169,10 @@ Other examples of the specification string, which can include multiple tag mentions
     # each in separate strings:
     to_field "isbn", extract_marc("020az", :separator => nil)
 
+    # Same thing, but more explicit
+    to_field "isbn", extract_marc("020a:020z")
+
+
     # Make sure that you don't get any duplicates
     # by passing in ":deduplicate => true"
     to_field 'language008', extract_marc('008[35-37]', :deduplicate=>true)
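Taken together, the README lines above amount to a configuration fragment like the following. This is an illustrative sketch in the same style as the README (field names are arbitrary), written as lines that would sit in a traject configuration file, which traject evaluates inside an Indexer instance:

    # one value per subfield; "020a:020z" is the more explicit equivalent
    # of "020az" with :separator => nil
    to_field "isbn", extract_marc("020a:020z")

    # fixed-field bytes, de-duplicated across repeated 008 occurrences
    to_field "language008", extract_marc("008[35-37]", :deduplicate => true)
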
data/lib/traject/indexer.rb
CHANGED

@@ -50,13 +50,13 @@ require 'traject/macros/basic'
 # with a String name of class meeting the Writer contract.
 #
 class Traject::Indexer
-
+
   # Arity error on a passed block
   class ArityError < ArgumentError; end
   class NamingError < ArgumentError; end
 
-
-
+
+
   include Traject::QualifiedConstGet
 
   attr_writer :reader_class, :writer_class

@@ -155,26 +155,11 @@ class Traject::Indexer
 
   # Used to define an indexing mapping.
   def to_field(field_name, aLambda = nil, &block)
-
-    verify_to_field_arguments(field_name, aLambda, block)
-
-    @index_steps << {
-      :field_name => field_name.to_s,
-      :lambda => aLambda,
-      :block => block,
-      :type => :to_field,
-      :source_location => Traject::Util.extract_caller_location(caller.first)
-    }
+    @index_steps << ToFieldStep.new(field_name, aLambda, block, Traject::Util.extract_caller_location(caller.first) )
  end
 
  def each_record(aLambda = nil, &block)
-
-    @index_steps << {
-      :lambda => aLambda,
-      :block => block,
-      :type => :each_record,
-      :source_location => Traject::Util.extract_caller_location(caller.first)
-    }
+    @index_steps << EachRecordStep.new(aLambda, block, Traject::Util.extract_caller_location(caller.first) )
  end
 
 
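The two methods above now simply wrap the caller's logic in step objects (the ToFieldStep and EachRecordStep classes appear further down in this diff). As a rough standalone illustration of that pattern only -- the class and method names below are not traject's -- a step captures the field name, the logic, and where it was defined, and can later run itself:

    # Illustrative sketch; not traject's actual internals.
    class FieldStepSketch
      attr_reader :field_name, :logic, :source_location

      def initialize(field_name, logic, source_location)
        raise ArgumentError, "field name required (defined at #{source_location})" if field_name.to_s.empty?
        @field_name      = field_name
        @logic           = logic
        @source_location = source_location
      end

      # run the captured logic, returning whatever it accumulated
      def execute(record)
        accumulator = []
        @logic.call(record, accumulator)
        accumulator
      end
    end

    steps = []
    steps << FieldStepSketch.new("title", ->(rec, acc) { acc << rec[:title] }, "#{__FILE__}:#{__LINE__}")
    p steps.first.execute(:title => "Manufacturing Consent")  # => ["Manufacturing Consent"]
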
@@ -203,51 +188,24 @@ class Traject::Indexer
   # to mapping routines.
   #
   # Returns the context passed in as second arg, as a convenience for chaining etc.
+
   def map_to_context!(context)
     @index_steps.each do |index_step|
       # Don't bother if we're skipping this record
       break if context.skip?
-      if index_step[:type] == :to_field
-
-        accumulator = []
-        context.field_name = index_step[:field_name]
-
-        # Might have a lambda arg AND a block, we execute in order,
-        # with same accumulator.
-
-        [index_step[:lambda], index_step[:block]].each do |aProc|
-          if aProc
-            log_mapping_errors(context, index_step, aProc) do
-              if aProc.arity == 2
-                aProc.call(context.source_record, accumulator)
-              else
-                aProc.call(context.source_record, accumulator, context)
-              end
-            end
-          end
-        end
-        accumulator.compact!
-        (context.output_hash[context.field_name] ||= []).concat accumulator unless accumulator.empty?
-        context.field_name = nil
-
-      elsif index_step[:type] == :each_record
-
-        # one or two arg
-        [index_step[:lambda], index_step[:block]].each do |aProc|
-          if aProc
-            log_mapping_errors(context, index_step, aProc) do
-              if aProc.arity == 1
-                aProc.call(context.source_record)
-              else
-                aProc.call(context.source_record, context)
-              end
-            end
-          end
-        end
 
-
-
+      context.index_step = index_step
+      accumulator = log_mapping_errors(context, index_step) do
+        index_step.execute(context) # will always return [] for an each_record step
       end
+      context.index_step =
+
+      accumulator.compact!
+      if accumulator.size > 0
+        (context.output_hash[index_step.field_name] ||= []).concat accumulator
+      end
+
+      context.index_step = index_step
     end
 
     return context
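The rewritten loop above treats every step the same way: ask the step to execute, then merge whatever came back into output_hash under the step's field name, skipping empty results. A tiny standalone sketch of that merge behavior (hypothetical method name, plain hashes instead of traject's Context):

    def merge_step_output(output_hash, field_name, accumulator)
      accumulator.compact!
      return output_hash if accumulator.empty?
      (output_hash[field_name] ||= []).concat(accumulator)
      output_hash
    end

    output_hash = {}
    merge_step_output(output_hash, "isbn",  ["9780394549002", nil])
    merge_step_output(output_hash, "title", [])   # empty result: no "title" key is created
    p output_hash                                 # => {"isbn"=>["9780394549002"]}
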
@@ -255,22 +213,19 @@ class Traject::Indexer
 
   # just a wrapper that captures and records any unexpected
   # errors raised in mapping, along with contextual information
-  # on record and location in source file of mapping rule.
+  # on record and location in source file of mapping rule.
   #
-  # Re-raises error at the moment.
+  # Re-raises error at the moment.
   #
-  #
+  #     log_mapping_errors(context, index_step) do
   #       all_sorts_of_stuff # that will have errors logged
   #     end
-  def log_mapping_errors(context, index_step
+  def log_mapping_errors(context, index_step)
    begin
      yield
    rescue Exception => e
      msg = "Unexpected error on record id `#{id_string(context.source_record)}` at file position #{context.position}\n"
-
-      conf = context.field_name ? "to_field '#{context.field_name}'" : "each_record"
-
-      msg += "    while executing #{conf} defined at #{index_step[:source_location]}\n"
+      msg += "    while executing #{index_step.inspect}\n"
      msg += Traject::Util.exception_to_log_message(e)
 
      logger.error msg

@@ -284,6 +239,12 @@ class Traject::Indexer
    end
  end
 
+  # get a printable id from record for error logging.
+  # Maybe override this for a future XML version.
+  def id_string(record)
+    record && record['001'] && record['001'].value.to_s
+  end
+
  # Processes a stream of records, reading from the configured Reader,
  # mapping according to configured mapping rules, and then writing
  # to configured Writer.
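The updated comment and signature describe a wrapper that yields to the step and, on any exception, logs which record and which step was involved before re-raising. A self-contained sketch of that behavior (stand-in arguments, not traject's Context or step objects):

    require 'logger'

    def log_mapping_errors(logger, record_id, position, step_description)
      yield
    rescue => e
      msg  = "Unexpected error on record id `#{record_id}` at file position #{position}\n"
      msg += "    while executing #{step_description}\n"
      msg += "#{e.class}: #{e.message}"
      logger.error(msg)
      raise  # re-raised "at the moment", as the comment above says
    end

    logger = Logger.new($stderr)
    begin
      log_mapping_errors(logger, "ocm30584226", 42, "<to_field title at my_config.rb:10>") { raise "boom" }
    rescue RuntimeError
      # the error was logged with its context, then propagated
    end
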
@@ -335,13 +296,14 @@ class Traject::Indexer
        #thread_pool.maybe_in_thread_pool &make_lambda(count, record, writer)
        thread_pool.maybe_in_thread_pool do
          context = Context.new(:source_record => record, :settings => settings, :position => position)
+          context.logger = logger
          map_to_context!(context)
          if context.skip?
            log_skip(context)
          else
            writer.put context
          end
-
+
        end
 
      end

@@ -353,7 +315,7 @@ class Traject::Indexer
 
    thread_pool.raise_collected_exception!
 
-
+
    writer.close if writer.respond_to?(:close)
 
    elapsed = Time.now - start_time

@@ -367,7 +329,7 @@ class Traject::Indexer
 
    return true
  end
-
+
  # Log that the current record is being skipped, using
  # data in context.position and context.skipmessage
  def log_skip(context)
@@ -399,89 +361,10 @@ class Traject::Indexer
    return writer_class.new(settings.merge("logger" => logger))
  end
 
-  # get a printable id from record for error logging.
-  # Maybe override this for a future XML version.
-  def id_string(record)
-    record && record['001'] && record['001'].value.to_s
-  end
-
-
-
-
-  # Verify that the field name is good, and throw a useful error if not
-  def verify_field_name(field_name)
-    if field_name.nil? || !field_name.is_a?(String) || field_name.empty?
-      raise NamingError.new("to_field requires the field name (String) as the first argument (#{last_named_step.message})")
-    end
-  end
-
-
-  # Verify the various, increasingly-complex things that can be sent to to_field
-  # to make sure it's all kosher.
-  #
-  # "Modification" takes place for zero-argument blocks that return a lambda
-
-  def verify_to_field_arguments(field_name, aLambda, block)
-
-    verify_field_name(field_name)
-
-    [aLambda, block].each do |proc|
-      # allow negative arity, meaning variable/optional, trust em on that.
-      # but for positive arrity, we need 2 or 3 args
-      if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
-        raise ArityError.new("error parsing field '#{field_name}': block/proc given to to_field needs 2 or 3 (or variable) arguments: #{proc} (#{last_named_step.message})")
-      end
-    end
-
-  end
-
-  # Verify the procs sent to each_record to make sure it's all kosher.
-
-  def verify_each_record_arguments(aLambda, block)
-    unless aLambda or block
-      raise ArgumentError.new("Missing Argument: each_record must take a block/lambda as an argument (#{last_named_step.message})")
-    end
-
-    [aLambda, block].each do |proc|
-      # allow negative arity, meaning variable/optional, trust em on that.
-      # but for positive arrity, we need 1 or 2 args
-      if proc
-        unless proc.is_a?(Proc)
-          raise NamingError.new("argument to each_record must be a block/lambda, not a #{proc.class} (#{last_named_step.message})")
-        end
-        if (proc.arity == 0 || proc.arity > 2)
-          raise ArityError.new("block/proc given to each_record needs 1 or 2 arguments: #{proc} (#{last_named_step.message})")
-        end
-      end
-    end
-  end
-
-  def last_named_step
-    return LastNamedStep.new(@index_steps)
-  end
-
-
-  # A convenient way to find, and generate error messages for, the last named step (for helping locate parse errors)
-  class LastNamedStep
-    attr_accessor :step, :message
-
-    # Get the last step for which we have a field_name (e.g., the last to_field, skipping over each_record)
-    def initialize(index_steps)
-      @step = index_steps.reverse_each.find{|step| step[:field_name]}
-      if @step
-        @message = "last successfully parsed field was '#{@step[:field_name]}'"
-      else
-        @message = "there were no previous named fields successfully parsed"
-      end
-    end
-  end
-
-
-
  # Represents the context of a specific record being indexed, passed
  # to indexing logic blocks
  #
-  class
+  class Context
    def initialize(hash_init = {})
      # TODO, argument checking for required args?
 
@@ -491,29 +374,147 @@ class Traject::Indexer
      hash_init.each_pair do |key, value|
        self.send("#{key}=", value)
      end
-
+
      @skip = false
    end
 
-    attr_accessor :clipboard, :output_hash
-    attr_accessor :
+    attr_accessor :clipboard, :output_hash, :logger
+    attr_accessor :index_step, :source_record, :settings
    # 1-based position in stream of processed records.
    attr_accessor :position
-
+
    # Should we be skipping this record?
    attr_accessor :skipmessage
-
+
    # Set the fact that this record should be skipped, with an
    # optional message
    def skip!(msg = '(no message given)')
      @skipmessage = msg
      @skip = true
    end
-
+
    # Should we skip this record?
    def skip?
      @skip
    end
-
+
+  end
+
+
+
+  # An indexing step definition, including it's source location
+  # for logging
+  #
+  # This one represents an "each_record" step, a subclass below
+  # for "to_field"
+  #
+  # source_location is just a string with filename and line number for
+  # showing to devs in debugging.
+  class EachRecordStep
+    attr_accessor :source_location, :lambda, :block
+
+    def initialize(lambda, block, source_location)
+      self.lambda = lambda
+      self.block = block
+      self.source_location = source_location
+
+      self.validate!
+    end
+
+    # raises if bad data
+    def validate!
+      unless self.lambda or self.block
+        raise ArgumentError.new("Missing Argument: each_record must take a block/lambda as an argument (#{self.inspect})")
+      end
+
+      [self.lambda, self.block].each do |proc|
+        # allow negative arity, meaning variable/optional, trust em on that.
+        # but for positive arrity, we need 1 or 2 args
+        if proc
+          unless proc.is_a?(Proc)
+            raise NamingError.new("argument to each_record must be a block/lambda, not a #{proc.class} (#{self.inspect})")
+          end
+          if (proc.arity == 0 || proc.arity > 2)
+            raise ArityError.new("block/proc given to each_record needs 1 or 2 arguments: (#{self.inspect})")
+          end
+        end
+      end
+    end
+
+    # For each_record, always return an empty array as the
+    # accumulator, since it doesn't have those kinds of side effects
+    def execute(context)
+      [@lambda, @block].each do |aProc|
+        next unless aProc
+
+        if aProc.arity == 1
+          aProc.call(context.source_record)
+        else
+          aProc.call(context.source_record, context)
+        end
+
+      end
+      return [] # empty -- no accumulator for each_record
+    end
+
+    # Over-ride inspect for outputting error messages etc.
+    def inspect
+      "<each_record at #{source_location}>"
+    end
  end
+
+
+  # An indexing step definition for a "to_field" step to specific
+  # field.
+  class ToFieldStep
+    attr_accessor :field_name, :lambda, :block, :source_location
+    def initialize(fieldname, lambda, block, source_location)
+      self.field_name = fieldname
+      self.lambda = lambda
+      self.block = block
+      self.source_location = source_location
+
+      validate!
+    end
+
+    def validate!
+
+      if self.field_name.nil? || !self.field_name.is_a?(String) || self.field_name.empty?
+        raise NamingError.new("to_field requires the field name (as a string) as the first argument at #{self.source_location})")
+      end
+
+      [self.lambda, self.block].each do |proc|
+        # allow negative arity, meaning variable/optional, trust em on that.
+        # but for positive arrity, we need 2 or 3 args
+        if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
+          raise ArityError.new("error parsing field '#{self.field_name}': block/proc given to to_field needs 2 or 3 (or variable) arguments: #{proc} (#{self.inspect})")
+        end
+      end
+    end
+
+    # Override inspect for developer debug messages
+    def inspect
+      "<to_field #{self.field_name} at #{self.source_location}>"
+    end
+
+    def execute(context)
+      accumulator = []
+      [@lambda, @block].each do |aProc|
+        next unless aProc
+
+        if aProc.arity == 2
+          aProc.call(context.source_record, accumulator)
+        else
+          aProc.call(context.source_record, accumulator, context)
+        end
+
+      end
+      return accumulator
+    end
+
+  end
+
+
+
+
 end
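Both execute methods above pick the call form from the proc's arity (2 vs. 3 arguments for to_field logic, 1 vs. 2 for each_record). A minimal standalone illustration of that dispatch, using plain procs and a hash in place of traject's Context:

    two_arg   = lambda { |record, accumulator| accumulator << record.upcase }
    three_arg = lambda { |record, accumulator, context| accumulator << "#{record} (position #{context[:position]})" }

    def run_to_field_logic(a_proc, record, accumulator, context)
      if a_proc.arity == 2
        a_proc.call(record, accumulator)
      else
        a_proc.call(record, accumulator, context)
      end
    end

    acc = []
    run_to_field_logic(two_arg,   "some title", acc, { :position => 1 })
    run_to_field_logic(three_arg, "some title", acc, { :position => 1 })
    p acc  # => ["SOME TITLE", "some title (position 1)"]
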
data/lib/traject/macros/marc21.rb
CHANGED

@@ -1,5 +1,6 @@
 require 'traject/marc_extractor'
 require 'traject/translation_map'
+require 'traject/util'
 require 'base64'
 require 'json'
 

@@ -30,7 +31,22 @@ module Traject::Macros
    # to_field("title"), extract_marc("245abcd", :trim_punctuation => true)
    # to_field("id"), extract_marc("001", :first => true)
    # to_field("geo"), extract_marc("040a", :separator => nil, :translation_map => "marc040")
+
+
+    # A list of symbols that are valid keys in the options hash
+    EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,
+                                  :deduplicate, :uniq, :separator, :translation_map,
+                                  :alternate_script]
+
    def extract_marc(spec, options = {})
+
+      # Raise an error if there are any invalid options, indicating a
+      # misspelled or illegal option, using a string instead of a symbol, etc.
+
+      unless (options.keys - EXTRACT_MARC_VALID_OPTIONS).empty?
+        raise RuntimeError.new("Illegal/Unknown argument '#{options.keys.join(', ')}' in extract_marc at #{Traject::Util.extract_caller_location(caller.first)}")
+      end
+
      only_first = options.delete(:first)
      trim_punctuation = options.delete(:trim_punctuation)
      default_value = options.delete(:default)

@@ -46,6 +62,7 @@ module Traject::Macros
      if translation_map_arg = options.delete(:translation_map)
        translation_map = Traject::TranslationMap.new(translation_map_arg)
      end
+
 
      extractor = Traject::MarcExtractor.new(spec, options)
 

@@ -93,7 +110,14 @@ module Traject::Macros
    # serialized, with certain header bytes filled with ascii 0's
    # -- technically illegal MARC, but can still be read by
    # ruby MARC::Reader in permissive mode.
+
+    SERIALZED_MARC_VALID_OPTIONS = [:format, :binary_escape, :allow_oversized, :format]
+
    def serialized_marc(options)
+      unless (options.keys - SERIALZED_MARC_VALID_OPTIONS).empty?
+        raise RuntimeError.new("Illegal/Unknown argument '#{options.keys.join(', ')}' in seralized_marc at #{Traject::Util.extract_caller_location(caller.first)}")
+      end
+
      format = options[:format].to_s
      binary_escape = (options[:binary_escape] != false)
      allow_oversized = (options[:allow_oversized] == true)

@@ -129,7 +153,13 @@ module Traject::Macros
    #
    # Can always run this thing multiple times on the same field if you need
    # non-contiguous ranges of fields.
+
+    EXTRACT_ALL_MARC_VALID_OPTIONS = [:separator, :from, :to]
+
    def extract_all_marc_values(options = {})
+      unless (options.keys - EXTRACT_ALL_MARC_VALID_OPTIONS).empty?
+        raise RuntimeError.new("Illegal/Unknown argument '#{options.keys.join(', ')}' in extract_all_marc at #{Traject::Util.extract_caller_location(caller.first)}")
+      end
      options = {:from => "100", :to => "899", :separator => ' '}.merge(options)
 
      lambda do |record, accumulator, context|
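Each macro above now checks its options hash against a whitelist of known keys and raises with the caller's file and line, so a typo in a config file fails fast. A standalone sketch of that check (constant and method names here are illustrative, not traject's):

    VALID_OPTIONS = [:first, :trim_punctuation, :default, :deduplicate, :uniq,
                     :separator, :translation_map, :alternate_script]

    def check_options!(options, caller_location)
      unknown = options.keys - VALID_OPTIONS
      unless unknown.empty?
        raise "Illegal/Unknown argument '#{unknown.join(', ')}' in extract_marc at #{caller_location}"
      end
    end

    check_options!({ :first => true }, "my_config.rb:12")   # passes silently

    begin
      check_options!({ :misspelled => "Who cares" }, "my_config.rb:13")
    rescue RuntimeError => e
      puts e.message   # points back at the offending config line
    end
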
data/lib/traject/marc_extractor.rb
CHANGED

@@ -135,13 +135,23 @@ module Traject
  #  "008[35-37]:LDR[5]"
  #  => bytes 35-37 inclusive of field 008, and byte 5 of the marc leader.
  #
-  # Returns a nested hash
-  #
-  #
-  #
-  #
-  #
-  #
+  # Returns a nested hash whose keys are tags and whose value is an array
+  # of hash structures indicating what indicators and subfields (or
+  # byte-offsets for control fields) are needed, e.g.
+  #
+  #  '245|1*|a:245ab:110:008[15-17]:008[17]' would give us
+  #
+  #  {
+  #    '245' => [
+  #          {:indicators => ['1', nil], :subfields=>['a']},
+  #          {:subfields => ['a', 'b']}
+  #    ]
+  #    '110' => [{}] # all subfields, indicators don't matter
+  #    '008' => [
+  #          {:bytes => (15..17)}
+  #          {:bytes => 17}
+  #    ]
+  #  }
  #
  # * subfields and indicators can only be provided for marc data/variable fields
  # * byte slice can only be provided for marc control fields (generally tags less than 010)

@@ -156,26 +166,31 @@ module Traject
        # variable field
        tag, indicators, subfields = $1, $3, $4
 
-        hash[tag] ||=
+        hash[tag] ||= []
+        spec = {}
 
-        if subfields
-          subfields
-          hash[tag][:subfields] ||= Array.new
-          hash[tag][:subfields] << subfield
-        end
+        if subfields and !subfields.empty?
+          spec[:subfields] = subfields.split('')
        end
+
        if indicators
-
+          spec[:indicators] = [ (indicators[0] if indicators[0] != "*"), (indicators[1] if indicators[1] != "*") ]
        end
+
+        hash[tag] << spec
+
      elsif (part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/) # "005[4-5]"
        tag, byte1, byte2 = $1, $3, $5
-        hash[tag] ||=
+        hash[tag] ||= []
+        spec = {}
 
        if byte1 && byte2
-
+          spec[:bytes] = ((byte1.to_i)..(byte2.to_i))
        elsif byte1
-
+          spec[:bytes] = byte1.to_i
        end
+
+        hash[tag] << spec
      else
        raise ArgumentError.new("Unrecognized marc extract specification: #{part}")
      end

@@ -210,15 +225,18 @@ module Traject
  def each_matching_line(marc_record)
    marc_record.fields(@interesting_tags_hash.keys).each do |field|
 
-
+      specs = spec_covering_field(field)
 
      # Don't have a spec that addresses this field? Move on.
-      next unless
+      next unless specs
 
      # Make sure it matches indicators too, spec_covering_field
      # doens't check that.
-
-
+
+      specs.each do |spec|
+        if matches_indicators(field, spec)
+          yield(field, spec, self)
+        end
      end
    end
  end
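The new comment block documents the shape parse_string_spec now returns: every tag maps to an array of small spec hashes, which is what makes repeated tags in one specification string possible. Writing the documented example out as a plain Ruby literal:

    # The structure described above for '245|1*|a:245ab:110:008[15-17]:008[17]'.
    parsed = {
      '245' => [
        { :indicators => ['1', nil], :subfields => ['a'] },
        { :subfields  => ['a', 'b'] }
      ],
      '110' => [ {} ],   # all subfields, indicators don't matter
      '008' => [
        { :bytes => (15..17) },
        { :bytes => 17 }
      ]
    }

    # Because each tag keys an array, a repeated tag simply contributes another spec:
    parsed['245'].each_with_index do |spec, i|
      puts "245 spec #{i}: subfields=#{spec[:subfields].inspect} indicators=#{spec[:indicators].inspect}"
    end
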
data/lib/traject/solrj_writer.rb
CHANGED

@@ -109,7 +109,7 @@ class Traject::SolrJWriter
 
    @debug_ascii_progress = (@settings["debug_ascii_progress"].to_s == "true")
 
-    logger.info("
+    logger.info("   #{self.class.name} writing to '#{settings['solr.url']}'")
  end
 
  # Loads solrj if not already loaded. By loading all jars found
data/lib/traject/version.rb
CHANGED

data/test/indexer/each_record_test.rb
CHANGED

@@ -31,14 +31,14 @@ describe "Traject::Indexer#each_record" do
    end
  end
 
-  it "
+  it "outputs error with source location" do
    begin
      @indexer.to_field('foo') {|one, two| }
      @indexer.each_record {|one, two, three| } # bad arity
      flunk("Should have rejected bad arity ")
    rescue Traject::Indexer::ArityError => e
-      assert_match(/
-      rescue
+      assert_match(/each_record at .*\/.*:\d+/, e.message)
+    rescue
      flunk("Should only fail with a ArityError")
    end
  end
@@ -53,7 +53,7 @@ describe "Traject::Indexer#each_record" do
    assert_raises(ArgumentError) do
      @indexer.each_record()
    end
-  end
+  end
 
  end
end
data/test/indexer/macros_marc21_test.rb
CHANGED

@@ -75,6 +75,15 @@ describe "Traject::Macros::Marc21" do
 
  end
 
+  it "fails on an extra/misspelled argument to extract_marc" do
+    assert_raises(RuntimeError) do
+      @indexer.instance_eval do
+        to_field "foo", extract_marc("9999", :misspelled => "Who cares")
+      end
+    end
+  end
+
+
 
 
  it "Marc21::trim_punctuation class method" do
data/test/indexer/read_write_test.rb
CHANGED

@@ -34,7 +34,11 @@ describe "Traject::Indexer#process" do
    @indexer.to_field("title") do |record, accumulator, context|
      times_called += 1
      accumulator << "ADDED TITLE"
-
+
+      assert context.index_step, "Context has #index_step set"
+      assert_equal "title", context.index_step.field_name
+
+      assert context.logger, "Context knows #logger"
 
      assert_equal times_called, context.position
    end
data/test/indexer/to_field_test.rb
CHANGED

@@ -40,30 +40,17 @@ describe "Traject::Indexer.to_field" do
    end
  end
 
-
-
-
-
-
-
-
-
-
-      flunk("Should only fail with a NamingError")
-    end
-  end
-
-  it "finds first (only) field on error" do
-    begin
-      @indexer.to_field('foo') {|one, two| }
-      @indexer.to_field('') {|one, two| } # bad field name
-      flunk("Should have rejected empty field name")
-    rescue Traject::Indexer::NamingError => e
-      assert_match(/foo/, e.message)
-    rescue
-      flunk("Should only fail with a NamingError")
-    end
+  it "outputs error with source location" do
+    begin
+      @indexer.to_field('foo') {|one, two| }
+      @indexer.to_field('') {|one, two| } # bad field name
+      flunk("Should have rejected empty field name")
+    rescue Traject::Indexer::NamingError => e
+      assert_match(/at .*\/.*:\d+/, e.message)
+    rescue
+      flunk("Should only fail with a NamingError")
    end
  end
+
 
end
data/test/marc_extractor_test.rb
CHANGED

@@ -12,43 +12,47 @@ describe "Traject::MarcExtractor" do
 
    assert_kind_of Hash, parsed
    assert_equal 1, parsed.keys.length
-
+    spec = parsed['245'].first
+    assert_kind_of Hash, spec
 
-    assert_kind_of Array,
-    assert_equal 2,
-    assert_equal "1",
-    assert_nil
+    assert_kind_of Array, spec[:indicators]
+    assert_equal 2, spec[:indicators].length
+    assert_equal "1", spec[:indicators][0]
+    assert_nil spec[:indicators][1]
 
-    assert_kind_of Array,
+    assert_kind_of Array, spec[:subfields]
 
  end
 
  it "parses a mixed bag" do
    parsed = Traject::MarcExtractor.parse_string_spec("245abcde:810:700|*4|bcd")
+    spec245 = parsed['245'].first
+    spec810 = parsed['810'].first
+    spec700 = parsed['700'].first
 
    assert_length 3, parsed
 
    #245abcde
-    assert
-    assert_nil
-    assert_equal %w{a b c d e},
+    assert spec245
+    assert_nil spec245[:indicators]
+    assert_equal %w{a b c d e}, spec245[:subfields]
 
    #810
-    assert
-    assert_nil
-    assert_nil
+    assert spec810
+    assert_nil spec810[:indicators]
+    assert_nil spec810[:subfields], "No subfields"
 
    #700-*4bcd
-    assert
-    assert_equal [nil, "4"],
-    assert_equal %w{b c d},
+    assert spec700
+    assert_equal [nil, "4"], spec700[:indicators]
+    assert_equal %w{b c d}, spec700[:subfields]
  end
 
  it "parses fixed field byte offsets" do
    parsed = Traject::MarcExtractor.parse_string_spec("005[5]:008[7-10]")
 
-    assert_equal 5, parsed["005"][:bytes]
-    assert_equal 7..10, parsed["008"][:bytes]
+    assert_equal 5, parsed["005"].first[:bytes]
+    assert_equal 7..10, parsed["008"].first[:bytes]
  end
 
  it "allows arrays of specs" do
@@ -98,7 +102,7 @@ describe "Traject::MarcExtractor" do
      assert ! @a880_100.nil?, "Found an 880-100 to test"
    end
    it "finds spec for relevant 880" do
-      assert_equal( {}, @extractor.spec_covering_field(@a880_245) )
+      assert_equal( [{}], @extractor.spec_covering_field(@a880_245) )
      assert_nil @extractor.spec_covering_field(@a880_100)
    end
    it "does not find spec for 880 if disabled" do

@@ -108,7 +112,7 @@ describe "Traject::MarcExtractor" do
    it "finds only 880 if so configured" do
      @extractor = Traject::MarcExtractor.new("245", :alternate_script => :only)
      assert_nil @extractor.spec_covering_field(@a245)
-      assert_equal({}, @extractor.spec_covering_field(@a880_245))
+      assert_equal([{}], @extractor.spec_covering_field(@a880_245))
    end
  end
end

@@ -289,7 +293,7 @@ describe "Traject::MarcExtractor" do
  describe "MarcExtractor.cached" do
    it "creates" do
      ext = Traject::MarcExtractor.cached("245abc", :separator => nil)
-      assert_equal({"245"=>{:subfields=>["a", "b", "c"]}}, ext.spec_hash)
+      assert_equal({"245"=>[{:subfields=>["a", "b", "c"]}]}, ext.spec_hash)
      assert ext.options[:separator].nil?, "extractor options[:separator] is nil"
    end
    it "caches" do
@@ -301,4 +305,53 @@ describe "Traject::MarcExtractor" do
    end
  end
 
+  describe "Allows multiple uses of the same tag" do
+    before do
+      @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
+    end
+
+    it "allows repated tags for a variable field" do
+      extractor = Traject::MarcExtractor.new("245a:245b")
+      values = extractor.extract(@record)
+      assert_equal ['Manufacturing consent :', 'the political economy of the mass media /'], values
+    end
+
+    it "allows repeated tags with indicators specs" do
+      extractor = Traject::MarcExtractor.new("245|1*|a:245|2*|b")
+      @record.append(MARC::DataField.new('245', '2', '0', ['a', 'Subfield A Value'], ['b', 'Subfield B Value']))
+      results = extractor.extract(@record)
+      assert_equal ['Manufacturing consent :', 'Subfield B Value'], results
+    end
+
+
+
+
+    it "works the same as ::separator=>nil" do
+      ex1 = Traject::MarcExtractor.new("245a:245b")
+      ex2 = Traject::MarcExtractor.new("245ab", :separator=>nil)
+      assert_equal ex1.extract(@record), ex2.extract(@record)
+    end
+
+
+    it "allows repeated tags for a control field" do
+      extractor = Traject::MarcExtractor.new("001[0-1]:001[0-3]")
+      values = extractor.extract(@record)
+      assert_equal ["27", "2710"], values
+    end
+
+    it "associates indicators properly with repeated tags" do
+      @record = MARC::Record.new
+      @record.append MARC::DataField.new("100", '1', ' ', ['a', '100a first indicator 1'], ['b', 'should not include 100|1|b'])
+      @record.append MARC::DataField.new("100", '2', ' ', ['b', '100b first indicator 2'], ['a', 'should not include 100|2|a'])
+
+      extractor = Traject::MarcExtractor.new("100|1*|a:100|2*|b")
+
+      values = extractor.extract(@record)
+
+      assert_equal ['100a first indicator 1', '100b first indicator 2'], values
+    end
+
+  end
+
+
 end
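The new tests above exercise repeated tags directly through the public extractor. A small usage sketch assembled from those tests -- it assumes the traject and marc gems are installed, and builds a synthetic record instead of the fixture file the tests read:

    require 'marc'
    require 'traject/marc_extractor'

    record = MARC::Record.new
    record.append MARC::DataField.new('245', '1', '0',
      ['a', 'Manufacturing consent :'], ['b', 'the political economy of the mass media /'])

    # same tag twice in one spec string: one value per subfield, in spec order
    extractor = Traject::MarcExtractor.new("245a:245b")
    p extractor.extract(record)
    # => ["Manufacturing consent :", "the political economy of the mass media /"]
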
data/traject.gemspec
CHANGED

@@ -6,7 +6,7 @@ require 'traject/version'
 Gem::Specification.new do |spec|
   spec.name = "traject"
   spec.version = Traject::VERSION
-  spec.authors = ["Jonathan Rochkind"]
+  spec.authors = ["Jonathan Rochkind", "Bill Dueber"]
   spec.email = ["none@nowhere.org"]
   spec.summary = %q{Index MARC to Solr; or generally process source records to hash-like structures}
   spec.homepage = "http://github.com/jrochkind/traject"