RubyGems - traject - Versions diffs - 2.0.0-java - Mend

traject 2.0.0-java

Files changed (104) hide show

checksums.yaml +7 -0
data/.gitignore +18 -0
data/.travis.yml +27 -0
data/.yardopts +3 -0
data/Gemfile +12 -0
data/LICENSE.txt +20 -0
data/README.md +461 -0
data/Rakefile +21 -0
data/bench/bench.rb +30 -0
data/bin/traject +16 -0
data/doc/batch_execution.md +243 -0
data/doc/extending.md +190 -0
data/doc/indexing_rules.md +265 -0
data/doc/other_commands.md +47 -0
data/doc/settings.md +101 -0
data/lib/tasks/load_maps.rake +48 -0
data/lib/traject.rb +11 -0
data/lib/traject/command_line.rb +301 -0
data/lib/traject/csv_writer.rb +34 -0
data/lib/traject/debug_writer.rb +47 -0
data/lib/traject/delimited_writer.rb +110 -0
data/lib/traject/indexer.rb +613 -0
data/lib/traject/indexer/settings.rb +110 -0
data/lib/traject/json_writer.rb +51 -0
data/lib/traject/line_writer.rb +63 -0
data/lib/traject/macros/basic.rb +9 -0
data/lib/traject/macros/marc21.rb +223 -0
data/lib/traject/macros/marc21_semantics.rb +584 -0
data/lib/traject/macros/marc_format_classifier.rb +197 -0
data/lib/traject/marc_extractor.rb +410 -0
data/lib/traject/marc_reader.rb +89 -0
data/lib/traject/mock_reader.rb +97 -0
data/lib/traject/ndj_reader.rb +40 -0
data/lib/traject/null_writer.rb +22 -0
data/lib/traject/qualified_const_get.rb +40 -0
data/lib/traject/solr_json_writer.rb +277 -0
data/lib/traject/thread_pool.rb +161 -0
data/lib/traject/translation_map.rb +267 -0
data/lib/traject/util.rb +52 -0
data/lib/traject/version.rb +3 -0
data/lib/traject/yaml_writer.rb +9 -0
data/lib/translation_maps/lcc_top_level.yaml +26 -0
data/lib/translation_maps/marc_genre_007.yaml +9 -0
data/lib/translation_maps/marc_genre_leader.yaml +22 -0
data/lib/translation_maps/marc_geographic.yaml +589 -0
data/lib/translation_maps/marc_instruments.yaml +102 -0
data/lib/translation_maps/marc_languages.yaml +490 -0
data/test/debug_writer_test.rb +38 -0
data/test/delimited_writer_test.rb +104 -0
data/test/indexer/each_record_test.rb +59 -0
data/test/indexer/macros_marc21_semantics_test.rb +391 -0
data/test/indexer/macros_marc21_test.rb +190 -0
data/test/indexer/macros_test.rb +40 -0
data/test/indexer/map_record_test.rb +209 -0
data/test/indexer/read_write_test.rb +101 -0
data/test/indexer/settings_test.rb +152 -0
data/test/indexer/to_field_test.rb +77 -0
data/test/marc_extractor_test.rb +412 -0
data/test/marc_format_classifier_test.rb +98 -0
data/test/marc_reader_test.rb +110 -0
data/test/solr_json_writer_test.rb +248 -0
data/test/test_helper.rb +90 -0
data/test/test_support/245_no_ab.marc +1 -0
data/test/test_support/880_with_no_6.utf8.marc +1 -0
data/test/test_support/bad_subfield_code.marc +1 -0
data/test/test_support/bad_utf_byte.utf8.marc +1 -0
data/test/test_support/date_resort_to_260.marc +1 -0
data/test/test_support/date_type_r_missing_date2.marc +1 -0
data/test/test_support/date_with_u.marc +1 -0
data/test/test_support/demo_config.rb +155 -0
data/test/test_support/emptyish_record.marc +1 -0
data/test/test_support/escaped_character_reference.marc8.marc +1 -0
data/test/test_support/george_eliot.marc +1 -0
data/test/test_support/hebrew880s.marc +1 -0
data/test/test_support/louis_armstrong.marc +1 -0
data/test/test_support/manufacturing_consent.marc +1 -0
data/test/test_support/manuscript_online_thesis.marc +1 -0
data/test/test_support/microform_online_conference.marc +1 -0
data/test/test_support/multi_era.marc +1 -0
data/test/test_support/multi_geo.marc +1 -0
data/test/test_support/musical_cage.marc +1 -0
data/test/test_support/nature.marc +1 -0
data/test/test_support/one-marc8.mrc +1 -0
data/test/test_support/online_only.marc +1 -0
data/test/test_support/packed_041a_lang.marc +1 -0
data/test/test_support/test_data.utf8.json +30 -0
data/test/test_support/test_data.utf8.marc.xml +2609 -0
data/test/test_support/test_data.utf8.mrc +1 -0
data/test/test_support/test_data.utf8.mrc.gz +0 -0
data/test/test_support/the_business_ren.marc +1 -0
data/test/translation_map_test.rb +225 -0
data/test/translation_maps/bad_ruby.rb +8 -0
data/test/translation_maps/bad_yaml.yaml +1 -0
data/test/translation_maps/both_map.rb +1 -0
data/test/translation_maps/both_map.yaml +1 -0
data/test/translation_maps/default_literal.rb +10 -0
data/test/translation_maps/default_passthrough.rb +10 -0
data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
data/test/translation_maps/properties_map.properties +5 -0
data/test/translation_maps/ruby_map.rb +10 -0
data/test/translation_maps/translate_array_test.yaml +8 -0
data/test/translation_maps/yaml_map.yaml +7 -0
data/traject.gemspec +47 -0
metadata +382 -0

data/lib/traject/indexer/settings.rb ADDED Viewed

@@ -0,0 +1,110 @@
+require 'hashie'
+require 'concurrent'
+class Traject::Indexer
+  # A Hash of settings for a Traject::Indexer, which also ends up passed along
+  # to other objects Traject::Indexer interacts with.
+  #
+  # Enhanced with a few features from Hashie, to make it for
+  # instance string/symbol indifferent
+  #
+  # method #provide(key, value) is added, to do like settings[key] ||= value,
+  # set only if not already set (but unlike ||=, nil or false can count as already set)
+  #
+  # Also has an interesting 'defaults' system, meant to play along
+  # with configuration file 'provide' statements. There is a built-in hash of
+  # defaults, which will be lazily filled in if accessed and not yet
+  # set. (nil can count as set, though!).  If they haven't been lazily
+  # set yet, then #provide will still fill them in. But you can also call
+  # fill_in_defaults! to fill all defaults in, if you know configuration
+  # files have all been loaded, and want to fill them in for inspection.
+  class Settings < Hash
+    include Hashie::Extensions::MergeInitializer # can init with hash
+    include Hashie::Extensions::IndifferentAccess
+    def initialize(*args)
+      super
+      self.default_proc = lambda do |hash, key|
+        if self.class.defaults.has_key?(key)
+          return hash[key] = self.class.defaults[key]
+        else
+          return nil
+        end
+      end
+    end
+    # a cautious store, which only saves key=value if
+    # there was not already a value for #key. Can be used
+    # to set settings that can be overridden on command line,
+    # or general first-set-wins settings.
+    def provide(key, value)
+      unless has_key? key
+        store(key, value)
+      end
+    end
+    # reverse_merge copied from ActiveSupport, pretty straightforward,
+    # modified to make sure we return a Settings
+    def reverse_merge(other_hash)
+      self.class.new(other_hash).merge(self)
+    end
+    def reverse_merge!(other_hash)
+      replace(reverse_merge(other_hash))
+    end
+    def fill_in_defaults!
+      self.reverse_merge!(self.class.defaults)
+    end
+    def self.mri_defaults
+      {
+        "reader_class_name"         => "Traject::MarcReader",
+        "writer_class_name"         => "Traject::SolrJsonWriter",
+        "marc_source.type"          => "binary",
+        "solrj_writer.batch_size"   => 200,
+        "solrj_writer.thread_pool"  => 1,
+        "processing_thread_pool"    => self.default_processing_thread_pool,
+        "log.batch_size.severity"   => "info"
+      }
+    end
+    def self.jruby_defaults
+      {
+        'reader_class_name' => "Traject::Marc4JReader",
+        'marc4j_reader.permissive' => true
+      }
+    end
+    def self.defaults
+      return @@defaults if defined? @@defaults
+      default_settings = self.mri_defaults
+      if defined? JRUBY_VERSION
+        default_settings.merge! self.jruby_defaults
+      end
+      @@defaults = default_settings
+    end
+    def inspect
+      # Keep any key ending in password out of the inspect
+      self.inject({}) do |hash, (key, value)|
+        hash[key] = (key =~ /password\Z/) ? "[hidden]" : value
+        hash
+      end.inspect
+    end
+    protected
+    def self.default_processing_thread_pool
+      if ["jruby", "rbx"].include? ENV["RUBY_ENGINE"]
+        [1, Concurrent.processor_count - 1].max
+      else
+        1
+      end
+    end
+  end
+end

data/lib/traject/json_writer.rb ADDED Viewed

@@ -0,0 +1,51 @@
+require 'json'
+require 'traject/line_writer'
+# The JsonWriter outputs one JSON hash per record, separated by newlines.
+#
+# It's newline delimited json, which should be suitable for being
+# read by simple NDJ readers. (TODO: We have no checks right now to
+# make sure the standard json serializers we're using don't put any
+# internal newlines as whitespace in the json. Which would break NDJ
+# reading. Should we?)
+#
+# Should be thread-safe (ie, multiple worker threads can be calling #put
+# concurrently), because output to file is wrapped in a mutex synchronize.
+# This does not seem to affect performance much, as far as I could tell
+# benchmarking.
+#
+# ## Settings
+#
+# * output_file A filename to send output; default will use stdout.
+#
+# * json_writer.pretty_print: [default: false]: Pretty-print (e.g., include newlines, indentation, etc.)
+# each JSON record instead of just mashing it all together on one line. The default, no pretty-printing option
+# produces one record per line, easy to process with another program.
+#
+# ## Example output
+#
+# Without pretty printing, you end up with something like this (just two records shown):
+#
+#     {"id":["000001118"],"oclc":["ocm00085737"],"sdrnum":["sdr-nrlf.b170195454"],"isbn":["0137319924"],"lccn":["73120791"],"mainauthor":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"author2":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Hamburg, David A., 1925-"],"authorSort":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel."],"author_top":["Behavioral and Social Sciences Survey Committee. Psychiatry Panel.","Edited by David A. Hamburg.","Hamburg, David A., 1925- ed."],"title":["Psychiatry as a behavioral science."],"title_a":["Psychiatry as a behavioral science."],"title_ab":["Psychiatry as a behavioral science."],"title_c":["Edited by David A. Hamburg."],"titleSort":["Psychiatry as a behavioral science"],"title_top":["Psychiatry as a behavioral science."],"title_rest":["A Spectrum book"],"series2":["A Spectrum book"],"callnumber":["RC327 .B41"],"broad_subject":["Medicine"],"pubdate":[1970],"format":["Book","Online","Print"],"publisher":["Prentice-Hall"],"language":["English"],"language008":["eng"],"editor":["David A. Hamburg."]}
+#     {"id":["000000794"],"oclc":["ocm00067181"],"lccn":["78011026"],"mainauthor":["Clark, Albert Curtis, 1859-1937."],"author":["Clark, Albert Curtis, 1859-1937."],"authorSort":["Clark, Albert Curtis, 1859-1937."],"author_top":["Clark, Albert Curtis, 1859-1937."],"title":["The descent of manuscripts.","descent of manuscripts."],"title_a":["The descent of manuscripts.","descent of manuscripts."],"title_ab":["The descent of manuscripts.","descent of manuscripts."],"titleSort":["descent of manuscripts"],"title_top":["The descent of manuscripts."],"callnumber":["PA47 .C45 1970"],"broad_subject":["Language & Literature"],"pubdate":[1918],"format":["Book","Online","Print"],"publisher":["Clarendon Press"],"language":["English"],"language008":["eng"]}
+#
+# ## Example configuration file
+#
+#     require 'traject/json_writer'
+#
+#     settings do
+#       provide "writer_class_name", "Traject::JsonWriter"
+#       provide "output_file", "out.json"
+#     end
+class Traject::JsonWriter < Traject::LineWriter
+  def serialize(context)
+    hash = context.output_hash
+    if settings["json_writer.pretty_print"]
+      JSON.pretty_generate(hash)
+    else
+      JSON.generate(hash)
+    end
+  end
+end

data/lib/traject/line_writer.rb ADDED Viewed

@@ -0,0 +1,63 @@
+require 'thread'
+# A writer for Traject::Indexer, that just writes out
+# all the output as serialized text with #puts.
+#
+# Should be thread-safe (ie, multiple worker threads can be calling #put
+# concurrently), by wrapping write to actual output file in a mutex synchronize.
+# This does not seem to affect performance much, as far as I could tell
+# benchmarking.
+#
+# Output will be sent to `settings["output_file"]` string path, or else
+# `settings["output_stream"]` (ruby IO object), or else stdout.
+#
+# This class can be sub-classed to write out different serialized
+# representations -- subclasses will just override the #serialize
+# method. For instance, see JsonWriter.
+class Traject::LineWriter
+  attr_reader :settings
+  attr_reader :write_mutex, :output_file
+  def initialize(argSettings)
+    @settings     = argSettings
+    @write_mutex  = Mutex.new
+    # trigger lazy loading now for thread-safety
+    @output_file = open_output_file
+  end
+  def _write(data)
+    output_file.puts(data)
+  end
+  def serialize(context)
+    context.output_hash
+  end
+  def put(context)
+    serialized = serialize(context)
+    write_mutex.synchronize do
+      _write(serialized)
+    end
+  end
+  def open_output_file
+    unless defined? @output_file
+      of =
+        if settings["output_file"]
+          File.open(settings["output_file"], 'w:UTF-8')
+        elsif settings["output_stream"]
+          settings["output_stream"]
+        else
+          $stdout
+        end
+    end
+    return of
+  end
+  def close
+    @output_file.close unless (@output_file.nil? || @output_file.tty?)
+  end
+end

data/lib/traject/macros/basic.rb ADDED Viewed

@@ -0,0 +1,9 @@
+module Traject::Macros
+  module Basic
+    def literal(literal)
+      lambda do |record, accumulator, context|
+        accumulator << literal
+      end
+    end
+  end
+end

data/lib/traject/macros/marc21.rb ADDED Viewed

@@ -0,0 +1,223 @@
+require 'traject/marc_extractor'
+require 'traject/translation_map'
+require 'traject/util'
+require 'base64'
+require 'json'
+require 'marc/fastxmlwriter'
+module Traject::Macros
+  # Some of these may be generic for any MARC, but we haven't done
+  # the analytical work to think it through, some of this is
+  # definitely specific to Marc21.
+  module Marc21
+    # A combo function macro that will extract data from marc according to a string
+    # field/substring specification, then apply various optional post-processing to it too.
+    #
+    # First argument is a string spec suitable for the MarcExtractor, see
+    # MarcExtractor::parse_string_spec.
+    #
+    # Second arg is optional options, including options valid on MarcExtractor.new,
+    # and others. By default, will de-duplicate results, but see :allow_duplicates
+    #
+    # * :first => true: take only first value
+    #
+    # * :translation_map => String: translate with named translation map looked up in load
+    #       path, uses Traject::TranslationMap.new(translation_map_arg)
+    #
+    # * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
+    #     have shown themselves useful with Marc, using Marc21.trim_punctuation
+    #
+    # * :default => String: if otherwise empty, add default value
+    #
+    # * :allow_duplicates => boolean, default false, if set to true then will avoid
+    #       de-duplicating the result array (array.uniq!)
+    #
+    #
+    # Examples:
+    #
+    #     to_field "title", extract_marc("245abcd", :trim_punctuation => true)
+    #     to_field "id",    extract_marc("001", :first => true)
+    #     to_field "geo",   extract_marc("040a", :separator => nil, :translation_map => "marc040")
+    def extract_marc(spec, options = {})
+      # Raise an error if there are any invalid options, indicating a
+      # misspelled or illegal option, using a string instead of a symbol, etc.
+      unless (options.keys - EXTRACT_MARC_VALID_OPTIONS).empty?
+        raise RuntimeError.new("Illegal/Unknown argument '#{(options.keys - EXTRACT_MARC_VALID_OPTIONS).join(', ')}' in extract_marc at #{Traject::Util.extract_caller_location(caller.first)}")
+      end
+      # We create the TranslationMap and the MarcExtractor here
+      # on load, so the lambda can just refer to already created
+      # ones, and not have to create a new one per-execution.
+      #
+      # Benchmarking shows for MarcExtractor at least, there is
+      # significant performance advantage.
+      if translation_map_arg  = options.delete(:translation_map)
+        translation_map = Traject::TranslationMap.new(translation_map_arg)
+      else
+        translation_map = nil
+      end
+      extractor = Traject::MarcExtractor.new(spec, options)
+      lambda do |record, accumulator, context|
+        accumulator.concat extractor.extract(record)
+        Marc21.apply_extraction_options(accumulator, options, translation_map)
+      end
+    end
+    # Side-effect the accumulator with the options
+    def self.apply_extraction_options(accumulator, options, translation_map=nil)
+      only_first              = options[:first]
+      trim_punctuation        = options[:trim_punctuation]
+      default_value           = options[:default]
+      allow_duplicates        = options[:allow_duplicates]
+      if only_first
+        accumulator.replace Array(accumulator[0])
+      end
+      if translation_map
+        translation_map.translate_array! accumulator
+      end
+      if trim_punctuation
+        accumulator.collect! {|s| Marc21.trim_punctuation(s)}
+      end
+      unless allow_duplicates
+        accumulator.uniq!
+      end
+      if default_value && accumulator.empty?
+        accumulator << default_value
+      end
+    end
+    # Symbols accepted as keys in the extract_marc options hash; extract_marc raises on any other key.
+    EXTRACT_MARC_VALID_OPTIONS = [:first, :trim_punctuation, :default,
+                                  :allow_duplicates, :separator, :translation_map,
+                                  :alternate_script]
+    # Serializes complete marc record to a serialization format.
+    # required param :format,
+    # serialized_marc(:format => :binary)
+    #
+    # formats:
+    # [xml] MarcXML
+    # [json] marc-in-json (http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/)
+    # [binary] Standard ISO 2709 binary marc. By default WILL be base64-encoded,
+    #          assumed destination a solr 'binary' field.
+    #          * add option `:binary_escape => false` to do straight binary -- unclear
+    #          what Solr's documented behavior is when you do this, and add a string
+    #          with binary control chars to solr. May do different things in diff
+    #          Solr versions, including raising exceptions.
+    #          * add option `:allow_oversized => true` to pass that flag
+    #          to the MARC::Writer. Oversized records will then still be
+    #          serialized, with certain header bytes filled with ascii 0's
+    #          -- technically illegal MARC, but can still be read by
+    #          ruby MARC::Reader in permissive mode.
+    def serialized_marc(options)
+      unless (options.keys - SERIALZED_MARC_VALID_OPTIONS).empty?
+        raise RuntimeError.new("Illegal/Unknown argument '#{(options.keys - SERIALZED_MARC_VALID_OPTIONS).join(', ')}' in seralized_marc at #{Traject::Util.extract_caller_location(caller.first)}")
+      end
+      format          = options[:format].to_s
+      binary_escape   = (options[:binary_escape] != false)
+      allow_oversized = (options[:allow_oversized] == true)
+      raise ArgumentError.new("Need :format => [binary|xml|json] arg") unless %w{binary xml json}.include?(format)
+      lambda do |record, accumulator, context|
+        case format
+        when "binary"
+          binary = MARC::Writer.encode(record, allow_oversized)
+          binary = Base64.encode64(binary) if binary_escape
+          accumulator << binary
+        when "xml"
+          accumulator << MARC::FastXMLWriter.encode(record)
+        when "json"
+          accumulator << JSON.dump(record.to_hash)
+        end
+      end
+    end
+    SERIALZED_MARC_VALID_OPTIONS = [:format, :binary_escape, :allow_oversized]
+    # Takes the whole record, by default from tags 100 to 899 inclusive,
+    # all subfields, and adds them to output. Subfields in a record are all
+    # joined by space by default.
+    #
+    # options
+    # [:from] default 100, only tags >= lexicographically
+    # [:to]   default 899, only tags <= lexicographically
+    # [:separator] how to join subfields, default space, nil means don't join
+    #
+    # All fields in from-to must be marc DATA (not control fields), or weirdness
+    #
+    # Can always run this thing multiple times on the same field if you need
+    # non-contiguous ranges of fields.
+    def extract_all_marc_values(options = {})
+      unless (options.keys - EXTRACT_ALL_MARC_VALID_OPTIONS).empty?
+        raise RuntimeError.new("Illegal/Unknown argument '#{(options.keys - EXTRACT_ALL_MARC_VALID_OPTIONS).join(', ')}' in extract_all_marc at #{Traject::Util.extract_caller_location(caller.first)}")
+      end
+      options = {:from => "100", :to => "899", :separator => ' '}.merge(options)
+      lambda do |record, accumulator, context|
+        record.each do |field|
+          next unless field.tag >= options[:from] && field.tag <= options[:to]
+          subfield_values = field.subfields.collect {|sf| sf.value}
+          next unless subfield_values.length > 0
+          if options[:separator]
+            accumulator << subfield_values.join( options[:separator])
+          else
+            accumulator.concat subfield_values
+          end
+        end
+      end
+    end
+    EXTRACT_ALL_MARC_VALID_OPTIONS = [:separator, :from, :to]
+    # Trims punctuation mostly from end, and occasionally from beginning
+    # of string. Not nearly as complex logic as SolrMarc's version, just
+    # pretty simple.
+    #
+    # Removes
+    # * trailing: comma, slash, semicolon, colon (possibly preceded and followed by whitespace)
+    # * trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
+    # * single square bracket characters if they are the start and/or end
+    #   chars and there are no internal square brackets.
+    #
+    # Returns altered string, doesn't change original arg.
+    def self.trim_punctuation(str)
+      # If something went wrong and we got a nil, just return it
+      return str unless str
+      # trailing: comma, slash, semicolon, colon (possibly preceded and followed by whitespace)
+      str = str.sub(/ *[ ,\/;:] *\Z/, '')
+      # trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
+      str = str.sub(/( *\w\w\w)\. *\Z/, '\1')
+      # single square bracket characters if they are the start and/or end
+      #   chars and there are no internal square brackets.
+      str = str.sub(/\A\[?([^\[\]]+)\]?\Z/, '\1')
+      return str
+    end
+    def self.first!(arr)
+      # kind of esoteric, but slice used this way does mutating first, yep
+      arr.slice!(1, arr.length)
+    end
+  end
+end