RubyGems - traject - Versions diffs - 2.3.4 → 3.0.0.alpha.1 - Mend

traject 2.3.4 → 3.0.0.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

checksums.yaml +5 -5
data/.travis.yml +16 -9
data/CHANGES.md +74 -1
data/Gemfile +2 -1
data/README.md +104 -53
data/Rakefile +8 -1
data/doc/indexing_rules.md +79 -63
data/doc/programmatic_use.md +218 -0
data/doc/settings.md +28 -1
data/doc/xml.md +134 -0
data/lib/traject.rb +5 -0
data/lib/traject/array_writer.rb +34 -0
data/lib/traject/command_line.rb +18 -22
data/lib/traject/debug_writer.rb +2 -5
data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
data/lib/traject/indexer.rb +321 -92
data/lib/traject/indexer/context.rb +39 -13
data/lib/traject/indexer/marc_indexer.rb +30 -0
data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
data/lib/traject/indexer/settings.rb +36 -53
data/lib/traject/indexer/step.rb +27 -33
data/lib/traject/macros/marc21.rb +37 -12
data/lib/traject/macros/nokogiri_macros.rb +43 -0
data/lib/traject/macros/transformation.rb +162 -0
data/lib/traject/marc_extractor.rb +2 -0
data/lib/traject/ndj_reader.rb +1 -1
data/lib/traject/nokogiri_reader.rb +179 -0
data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
data/lib/traject/solr_json_writer.rb +19 -12
data/lib/traject/thread_pool.rb +13 -0
data/lib/traject/util.rb +14 -2
data/lib/traject/version.rb +1 -1
data/test/debug_writer_test.rb +3 -3
data/test/delimited_writer_test.rb +3 -3
data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
data/test/indexer/context_test.rb +23 -13
data/test/indexer/error_handler_test.rb +59 -0
data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
data/test/indexer/macros/to_field_test.rb +2 -2
data/test/indexer/macros/transformation_test.rb +177 -0
data/test/indexer/map_record_test.rb +2 -3
data/test/indexer/nokogiri_indexer_test.rb +103 -0
data/test/indexer/process_record_test.rb +55 -0
data/test/indexer/process_with_test.rb +148 -0
data/test/indexer/read_write_test.rb +52 -2
data/test/indexer/settings_test.rb +34 -24
data/test/indexer/to_field_test.rb +27 -2
data/test/marc_extractor_test.rb +7 -7
data/test/marc_reader_test.rb +4 -4
data/test/nokogiri_reader_test.rb +158 -0
data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
data/test/solr_json_writer_test.rb +24 -28
data/test/test_helper.rb +8 -2
data/test/test_support/namespace-test.xml +7 -0
data/test/test_support/nokogiri_demo_config.rb +17 -0
data/test/test_support/oai-pmh-one-record-2.xml +24 -0
data/test/test_support/oai-pmh-one-record-first.xml +24 -0
data/test/test_support/sample-oai-no-namespace.xml +197 -0
data/test/test_support/sample-oai-pmh.xml +197 -0
data/test/thread_pool_test.rb +38 -0
data/test/translation_map_test.rb +3 -3
data/test/translation_maps/ruby_map.rb +2 -1
data/test/translation_maps/yaml_map.yaml +2 -1
data/traject.gemspec +4 -11
metadata +92 -6

data/lib/traject/indexer/context.rb CHANGED

@@ -1,6 +1,8 @@
 # Represents the context of a specific record being indexed, passed
 # to indexing logic blocks
 #
+# Arg source_record_id_proc is a lambda that takes one arg (indexer-specific source record),
+# and returns an ID for it suitable for use in log messages.
 class Traject::Indexer
   class Context
     def initialize(hash_init = {})
@@ -17,9 +19,13 @@ class Traject::Indexer
     end
     attr_accessor :clipboard, :output_hash, :logger
-    attr_accessor :index_step, :source_record, :settings
-    # 1-based position in stream of processed records.
+    attr_accessor :index_step, :source_record, :settings, :source_record_id_proc
+    # 'position' is a 1-based position in stream of processed records.
     attr_accessor :position
+    # sometimes we have multiple inputs, input_name describes the current one, and
+    # position_in_input the position of the record in the current input -- both can
+    # sometimes be blank when we don't know.
+    attr_accessor :input_name, :position_in_input
     # Should we be skipping this record?
     attr_accessor :skipmessage
@@ -41,19 +47,39 @@ class Traject::Indexer
     # in output messages, especially since this method may sometimes
     # return empty string if info on record id is not available.
     #
-    # Returns MARC 001, then a slash, then output_hash["id"] -- if both
+    # Returns id from source_record (if we can get it from a source_record_id_proc),
+    # then a slash, then output_hash["id"] -- if both
     # are present. Otherwise may return just one, or even an empty string.
-    #
-    # Likely override this for a future XML or other source format version.
     def source_record_id
-      marc_id   = if self.source_record &&
-          self.source_record.kind_of?(MARC::Record) &&
-          self.source_record['001']
-                    self.source_record['001'].value
-                  end
-      output_id = self.output_hash["id"]
-      return [marc_id, output_id].compact.join("/")
+      source_record_id_proc && source_record_id_proc.call(source_record)
+    end
+    # a string label that can be used to refer to a particular record in log messages and
+    # exceptions. Includes various parts depending on what we got.
+    def record_inspect
+      str = "<"
+      str << "record ##{position}" if position
+      if input_name && position_in_input
+        str << " (#{input_name} ##{position_in_input}), "
+      elsif position
+        str << ", "
+      end
+      if source_id = source_record_id
+        str << "source_id:#{source_id} "
+      end
+      if output_id = self.output_hash["id"]
+        str << "output_id:#{[output_id].join(',')}"
+      end
+      str.chomp!(" ")
+      str.chomp!(",")
+      str << ">"
+      str
     end
   end

data/lib/traject/indexer/marc_indexer.rb ADDED

@@ -0,0 +1,30 @@
+module Traject
+  class Indexer
+    # An indexer sub-class that includes "extract_marc" and other macros from
+    # Traject::Macros::Marc21, and also adds some marc-specific default settings.
+    class MarcIndexer < ::Traject::Indexer
+      include Traject::Macros::Marc21
+      def self.default_settings
+        @default_settings ||= begin
+          marc_settings = {
+            "reader_class_name"       => "Traject::MarcReader",
+            "marc_source.type"        => "binary",
+          }
+          super.merge(marc_settings)
+        end
+      end
+      # Overridden from base Indexer, to get MARC 001 for log messages.
+      def source_record_id_proc
+        @source_record_id_proc ||= lambda do |source_marc_record|
+          if ( source_marc_record &&
+               source_marc_record.kind_of?(MARC::Record) &&
+               source_marc_record['001'] )
+            source_marc_record['001'].value
+          end
+        end
+      end
+    end
+  end
+end

data/lib/traject/indexer/nokogiri_indexer.rb ADDED

@@ -0,0 +1,30 @@
+require 'traject/nokogiri_reader'
+require 'traject/macros/nokogiri_macros'
+require 'traject/oai_pmh_nokogiri_reader'
+module Traject
+  class Indexer
+    # An indexer sub-class for XML, where the source records in the pipeline are
+    # Nokogiri::XML::Document objects. It sets a default reader of NokogiriReader, and
+    # includes Traject::Macros::NokogiriMacros (with `extract_xpath`).
+    #
+    # See docs on XML use. (TODO)
+    class NokogiriIndexer < ::Traject::Indexer
+      include Traject::Macros::NokogiriMacros
+      def self.default_settings
+        @default_settings ||= super.merge("reader_class_name" => "Traject::NokogiriReader")
+      end
+      # Overridden from base Indexer, try an `id` attribute or element on record.
+      def source_record_id_proc
+        @source_record_id_proc ||= lambda do |source_xml_record|
+          if ( source_xml_record &&
+               source_xml_record.kind_of?(Nokogiri::XML::Node) )
+            source_xml_record['id'] || (el = source_xml_record.at_xpath('./id') && el.text)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/traject/indexer/settings.rb CHANGED

@@ -11,33 +11,55 @@ class Traject::Indexer
   #
   # method #provide(key, value) is added, to do like settings[key] ||= value,
   # set only if not already set (but unlike ||=, nil or false can count as already set)
+  # provide WILL overwrite defaults.
   #
-  # Also has an interesting 'defaults' system, meant to play along
-  # with configuration file 'provide' statements. There is a built-in hash of
-  # defaults, which will be lazily filled in if accessed and not yet
-  # set. (nil can count as set, though!).  If they haven't been lazily
-  # set yet, then #provide will still fill them in. But you can also call
-  # fill_in_defaults! to fill all defaults in, if you know configuration
-  # files have all been loaded, and want to fill them in for inspection.
+  # Or you can use standard Hash `store` which will overwrite already set values as well
+  # as defaults.
+  #
+  # Has kind of a weird 'defaults' system, where you tell the hash what its defaults
+  # are, but they aren't actually loaded until asked for (or you can call fill_in_defaults!
+  # to load em all for inspection), to accommodate the `provide` API, where a caller wants to set
+  # only if not already set, but DO overwrite defaults.
   class Settings < Hash
+    # Just a hash with indifferent access and hash initializer, to use for
+    # our defaults hash.
+    class DefaultsHash < Hash
+      include Hashie::Extensions::MergeInitializer # can init with hash
+      include Hashie::Extensions::IndifferentAccess
+    end
     include Hashie::Extensions::MergeInitializer # can init with hash
     include Hashie::Extensions::IndifferentAccess
     def initialize(*args)
       super
+      @defaults = {}
       self.default_proc = lambda do |hash, key|
-        if self.class.defaults.has_key?(key)
-          return hash[key] = self.class.defaults[key]
+        if @defaults.has_key?(key)
+          return hash[key] = @defaults[key]
         else
           return nil
         end
       end
+      @defaults_filled = Concurrent::AtomicBoolean.new(false)
+    end
+    def with_defaults(defaults)
+      @defaults = DefaultsHash.new(defaults).freeze
+      self
+    end
+    def keys
+      super + @defaults.keys
     end
     # a cautious store, which only saves key=value if
     # there was not already a value for #key. Can be used
     # to set settings that can be overridden on command line,
-    # or general first-set-wins settings.
+    # or general first-set-wins settings. DOES set over defaults.
     def provide(key, value)
       unless has_key? key
         store(key, value)
@@ -54,50 +76,11 @@ class Traject::Indexer
       replace(reverse_merge(other_hash))
     end
+    # Normally defaults are filled in on-demand, but you can trigger it here --
+    # but if you later try to load traject config, `provide` will no longer
+    # overwrite defaults!
     def fill_in_defaults!
-      self.reverse_merge!(self.class.defaults)
-    end
-    def self.mri_defaults
-      {
-          # Reader defaults
-          "reader_class_name"       => "Traject::MarcReader",
-          "marc_source.type"        => "binary",
-          # Writer defaults
-          "writer_class_name"       => "Traject::SolrJsonWriter",
-          "solr_writer.batch_size"  => 100,
-          "solr_writer.thread_pool" => 1,
-          # Threading and logging
-          "processing_thread_pool"  => self.default_processing_thread_pool,
-          "log.batch_size.severity" => "info",
-          # how to post-process the accumulator
-          "allow_nil_values"        => false,
-          "allow_duplicate_values"  => true,
-          "allow_empty_fields"      => false,
-      }
-    end
-    def self.jruby_defaults
-      {
-          'reader_class_name'        => "Traject::Marc4JReader",
-          'marc4j_reader.permissive' => true
-      }
-    end
-    def self.defaults
-      return @@defaults if defined? @@defaults
-      default_settings = self.mri_defaults
-      if defined? JRUBY_VERSION
-        default_settings.merge! self.jruby_defaults
-      end
-      @@defaults = default_settings
+      self.reverse_merge!(@defaults)
     end
     def inspect

data/lib/traject/indexer/step.rb CHANGED

@@ -30,15 +30,15 @@ class Traject::Indexer
     # Set the arity of the lambda expression just once, when we define it
     def lambda=(lam)
       @lambda_arity = 0 # assume
+      @lambda = lam
       return unless lam
-      @lambda = lam
       if @lambda.is_a?(Proc)
         @lambda_arity = @lambda.arity
       else
         raise NamingError.new("argument to each_record must be a block/lambda, not a #{lam.class} #{self.inspect}")
       end
     end
     # raises if bad data
@@ -89,17 +89,17 @@ class Traject::Indexer
   end
-# An indexing step definition for a "to_field" step to specific
-# field.
+  # An indexing step definition for a "to_field" step to specific
+  # field. The first field name argument can be an array of multiple field
+  # names, the processed values will be added to each one.
   class ToFieldStep
-    attr_accessor :field_name, :block, :source_location
-    attr_reader :lambda
+    attr_reader :field_name, :block, :source_location, :procs
-    def initialize(fieldname, lambda, block, source_location)
-      self.field_name      = fieldname.freeze
-      self.lambda          = lambda
-      self.block           = block
-      self.source_location = source_location
+    def initialize(field_name, procs, block, source_location)
+      @field_name      = field_name.freeze
+      @procs           = procs.freeze
+      @block           = block.freeze
+      @source_location = source_location.freeze
       validate!
     end
@@ -108,18 +108,13 @@ class Traject::Indexer
       true
     end
-    def lambda=(lam)
-      @lambda       = lam
-      @lambda_arity = @lambda ? @lambda.arity : 0
-    end
     def validate!
-      if self.field_name.nil? || !self.field_name.is_a?(String) || self.field_name.empty?
-        raise NamingError.new("to_field requires the field name (as a string) as the first argument at #{self.source_location})")
+      unless (field_name.is_a?(String) && ! field_name.empty?) || (field_name.is_a?(Array) && field_name.all? { |f| f.is_a?(String) && ! f.empty? })
+        raise NamingError.new("to_field requires the field name (as a string), or an array of such, as the first argument at #{self.source_location})")
       end
-      [self.lambda, self.block].each do |proc|
+      [*self.procs, self.block].each do |proc|
         # allow negative arity, meaning variable/optional, trust em on that.
         # but for positive arity, we need 2 or 3 args
         if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
@@ -130,26 +125,22 @@ class Traject::Indexer
     # Override inspect for developer debug messages
     def inspect
-      "(to_field #{self.field_name} at #{self.source_location})"
+      "(to_field #{self.field_name.inspect} at #{self.source_location})"
     end
     def execute(context)
       accumulator = []
-      sr          = context.source_record
+      source_record = context.source_record
-      if @lambda
-        if @lambda_arity == 2
-          @lambda.call(sr, accumulator)
+      [*self.procs, self.block].each do |aProc|
+        next unless aProc
+        if aProc.arity == 2
+          aProc.call(source_record, accumulator)
         else
-          @lambda.call(sr, accumulator, context)
+          aProc.call(source_record, accumulator, context)
         end
       end
-      if @block
-        @block.call(sr, accumulator, context)
-      end
       add_accumulator_to_context!(accumulator, context)
       return accumulator
     end
@@ -165,10 +156,13 @@ class Traject::Indexer
       accumulator.compact! unless context.settings[ALLOW_NIL_VALUES]
       return if accumulator.empty? and not (context.settings[ALLOW_EMPTY_FIELDS])
-      context.output_hash[field_name] ||= []
+      # field_name can actually be an array of field names
+      Array(field_name).each do |a_field_name|
+        context.output_hash[a_field_name] ||= []
-      existing_accumulator = context.output_hash[field_name].concat(accumulator)
-      existing_accumulator.uniq! unless context.settings[ALLOW_DUPLICATE_VALUES]
+        existing_accumulator = context.output_hash[a_field_name].concat(accumulator)
+        existing_accumulator.uniq! unless context.settings[ALLOW_DUPLICATE_VALUES]
+      end
     end
   end

data/lib/traject/macros/marc21.rb CHANGED

@@ -11,8 +11,8 @@ module Traject::Macros
   # def specific to Marc21.
   module Marc21
-    # A combo function macro that will extract data from marc according to a string
-    # field/substring specification, then apply various optional post-processing to it too.
+    # A macro that will extract data from marc according to a string
+    # field/substring specification.
     #
     # First argument is a string spec suitable for the MarcExtractor, see
     # MarcExtractor::parse_string_spec.
@@ -20,25 +20,42 @@ module Traject::Macros
     # Second arg is optional options, including options valid on MarcExtractor.new,
     # and others. By default, will de-duplicate results, but see :allow_duplicates
     #
-    # * :first => true: take only first value
+    #
+    # * :allow_duplicates => boolean, default false, if set to true then will avoid
+    #       de-duplicating the result array (array.uniq!)
+    #
+    # * :separator: (default ' ' (space)), what to use when joining multiple subfield matches from
+    #   same field. Set to `nil` to leave them as separate values (which is actually default if only
+    #   one subfield is given in spec, like `100a`). See MarcExtractor docs for more info.
+    #
+    # * :alternate_script: (default true). True, automatically include
+    #   'alternate script' MARC 880 linked fields corresponding to matched specifications. `false`, do
+    #   not include.  `:only` include _only_ linked 880s corresponding to spec, not base tags.
+    #
+    # ## Soft-Deprecated options: post-processing transformations
+    #
+    # These don't produce a deprecation warning and there is no planned horizon for them to go away, but the
+    # alternative of using additional transformation macros (from Traject::Macros::Transformation) composed with
+    # extract_marc is recommended.
+    #
+    # * :first => true: take only first value. **Instead**, use `extract_marc(whatever), first_only`
     #
     # * :translation_map => String: translate with named translation map looked up in load
-    #       path, uses Tranject::TranslationMap.new(translation_map_arg)
+    #       path, uses Traject::TranslationMap.new(translation_map_arg).
+    #       **Instead**, use `extract_marc(whatever), translation_map(translation_map_arg)`
     #
     # * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
-    #     have shown themselves useful with Marc, using Marc21.trim_punctuation
+    #     have shown themselves useful with Marc, using Marc21.trim_punctuation. **Instead**, use
+    #    `extract_marc(whatever), trim_punctuation`
     #
-    # * :default => String: if otherwise empty, add default value
-    #
-    # * :allow_duplicates => boolean, default false, if set to true then will avoid
-    #       de-duplicating the result array (array.uniq!)
+    # * :default => String: if otherwise empty, add default value. **Instead**, use `extract_marc(whatever), default("default value")`
     #
     #
     # Examples:
     #
-    #     to_field("title"), extract_marc("245abcd", :trim_punctuation => true)
-    #     to_field("id"),    extract_marc("001", :first => true)
-    #     to_field("geo"),   extract_marc("040a", :separator => nil, :translation_map => "marc040")
+    #     to_field("title"), extract_marc("245abcd"), trim_punctuation
+    #     to_field("id"),    extract_marc("001"), first_only
+    #     to_field("geo"),   extract_marc("040a", :separator => nil), translation_map("marc040")
     #
     # If you'd like extract_marc functionality but you're not creating an indexer
     # step, see Traject::Macros::Marc21.extract_marc_from module method.
@@ -122,6 +139,14 @@ module Traject::Macros
       end
     end
+    # A transformation macro version of trim_punctuation -- heuristics for trimming punctuation
+    # from AACR2/MARC style values, to get bare values.
+    def trim_punctuation
+      lambda do |rec, accumulator|
+        accumulator.collect! {|s| Marc21.trim_punctuation(s)}
+      end
+    end
     #  A list of symbols that are valid keys in the options hash
     EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,