RubyGems - traject - Versions diffs - 2.0.0-java - Mend

traject 2.0.0-java

Files changed (104) hide show

checksums.yaml +7 -0
data/.gitignore +18 -0
data/.travis.yml +27 -0
data/.yardopts +3 -0
data/Gemfile +12 -0
data/LICENSE.txt +20 -0
data/README.md +461 -0
data/Rakefile +21 -0
data/bench/bench.rb +30 -0
data/bin/traject +16 -0
data/doc/batch_execution.md +243 -0
data/doc/extending.md +190 -0
data/doc/indexing_rules.md +265 -0
data/doc/other_commands.md +47 -0
data/doc/settings.md +101 -0
data/lib/tasks/load_maps.rake +48 -0
data/lib/traject.rb +11 -0
data/lib/traject/command_line.rb +301 -0
data/lib/traject/csv_writer.rb +34 -0
data/lib/traject/debug_writer.rb +47 -0
data/lib/traject/delimited_writer.rb +110 -0
data/lib/traject/indexer.rb +613 -0
data/lib/traject/indexer/settings.rb +110 -0
data/lib/traject/json_writer.rb +51 -0
data/lib/traject/line_writer.rb +63 -0
data/lib/traject/macros/basic.rb +9 -0
data/lib/traject/macros/marc21.rb +223 -0
data/lib/traject/macros/marc21_semantics.rb +584 -0
data/lib/traject/macros/marc_format_classifier.rb +197 -0
data/lib/traject/marc_extractor.rb +410 -0
data/lib/traject/marc_reader.rb +89 -0
data/lib/traject/mock_reader.rb +97 -0
data/lib/traject/ndj_reader.rb +40 -0
data/lib/traject/null_writer.rb +22 -0
data/lib/traject/qualified_const_get.rb +40 -0
data/lib/traject/solr_json_writer.rb +277 -0
data/lib/traject/thread_pool.rb +161 -0
data/lib/traject/translation_map.rb +267 -0
data/lib/traject/util.rb +52 -0
data/lib/traject/version.rb +3 -0
data/lib/traject/yaml_writer.rb +9 -0
data/lib/translation_maps/lcc_top_level.yaml +26 -0
data/lib/translation_maps/marc_genre_007.yaml +9 -0
data/lib/translation_maps/marc_genre_leader.yaml +22 -0
data/lib/translation_maps/marc_geographic.yaml +589 -0
data/lib/translation_maps/marc_instruments.yaml +102 -0
data/lib/translation_maps/marc_languages.yaml +490 -0
data/test/debug_writer_test.rb +38 -0
data/test/delimited_writer_test.rb +104 -0
data/test/indexer/each_record_test.rb +59 -0
data/test/indexer/macros_marc21_semantics_test.rb +391 -0
data/test/indexer/macros_marc21_test.rb +190 -0
data/test/indexer/macros_test.rb +40 -0
data/test/indexer/map_record_test.rb +209 -0
data/test/indexer/read_write_test.rb +101 -0
data/test/indexer/settings_test.rb +152 -0
data/test/indexer/to_field_test.rb +77 -0
data/test/marc_extractor_test.rb +412 -0
data/test/marc_format_classifier_test.rb +98 -0
data/test/marc_reader_test.rb +110 -0
data/test/solr_json_writer_test.rb +248 -0
data/test/test_helper.rb +90 -0
data/test/test_support/245_no_ab.marc +1 -0
data/test/test_support/880_with_no_6.utf8.marc +1 -0
data/test/test_support/bad_subfield_code.marc +1 -0
data/test/test_support/bad_utf_byte.utf8.marc +1 -0
data/test/test_support/date_resort_to_260.marc +1 -0
data/test/test_support/date_type_r_missing_date2.marc +1 -0
data/test/test_support/date_with_u.marc +1 -0
data/test/test_support/demo_config.rb +155 -0
data/test/test_support/emptyish_record.marc +1 -0
data/test/test_support/escaped_character_reference.marc8.marc +1 -0
data/test/test_support/george_eliot.marc +1 -0
data/test/test_support/hebrew880s.marc +1 -0
data/test/test_support/louis_armstrong.marc +1 -0
data/test/test_support/manufacturing_consent.marc +1 -0
data/test/test_support/manuscript_online_thesis.marc +1 -0
data/test/test_support/microform_online_conference.marc +1 -0
data/test/test_support/multi_era.marc +1 -0
data/test/test_support/multi_geo.marc +1 -0
data/test/test_support/musical_cage.marc +1 -0
data/test/test_support/nature.marc +1 -0
data/test/test_support/one-marc8.mrc +1 -0
data/test/test_support/online_only.marc +1 -0
data/test/test_support/packed_041a_lang.marc +1 -0
data/test/test_support/test_data.utf8.json +30 -0
data/test/test_support/test_data.utf8.marc.xml +2609 -0
data/test/test_support/test_data.utf8.mrc +1 -0
data/test/test_support/test_data.utf8.mrc.gz +0 -0
data/test/test_support/the_business_ren.marc +1 -0
data/test/translation_map_test.rb +225 -0
data/test/translation_maps/bad_ruby.rb +8 -0
data/test/translation_maps/bad_yaml.yaml +1 -0
data/test/translation_maps/both_map.rb +1 -0
data/test/translation_maps/both_map.yaml +1 -0
data/test/translation_maps/default_literal.rb +10 -0
data/test/translation_maps/default_passthrough.rb +10 -0
data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
data/test/translation_maps/properties_map.properties +5 -0
data/test/translation_maps/ruby_map.rb +10 -0
data/test/translation_maps/translate_array_test.yaml +8 -0
data/test/translation_maps/yaml_map.yaml +7 -0
data/traject.gemspec +47 -0
metadata +382 -0

data/test/indexer/macros_marc21_test.rb ADDED Viewed

@@ -0,0 +1,190 @@
+require 'test_helper'
+require 'traject/indexer'
+require 'traject/macros/marc21'
+require 'json'
+require 'marc'
+# See also marc_extractor_test.rb for more detailed tests of MARC extraction;
+# this is just a basic test to make sure our macros pass through to the
+# extractor correctly and honor their other options.
+describe "Traject::Macros::Marc21" do
+  Marc21 = Traject::Macros::Marc21 # shortcut
+  before do
+    @indexer = Traject::Indexer.new
+    @record = MARC::Reader.new(support_file_path  "manufacturing_consent.marc").to_a.first
+  end
+  describe "extract_marc" do
+    it "extracts marc" do
+      @indexer.instance_eval do
+        to_field "title", extract_marc("245ab")
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["Manufacturing consent : the political economy of the mass media /"], output["title"]
+      assert_equal({}, @indexer.map_record(empty_record))
+    end
+    it "respects :first=>true option" do
+      @indexer.instance_eval do
+        to_field "other_id", extract_marc("035a", :first => true)
+      end
+      output = @indexer.map_record(@record)
+      assert_length 1, output["other_id"]
+    end
+    it "trims punctuation with :trim_punctuation => true" do
+      @indexer.instance_eval do
+        to_field "title", extract_marc("245ab", :trim_punctuation => true)
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["Manufacturing consent : the political economy of the mass media"], output["title"]
+      assert_equal({}, @indexer.map_record(empty_record))
+    end
+    it "respects :default option" do
+      @indexer.instance_eval do
+        to_field "only_default", extract_marc("9999", :default => "DEFAULT VALUE")
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["DEFAULT VALUE"], output["only_default"]
+    end
+    it "de-duplicates by default, respects :allow_duplicates" do
+      # Add a second 008
+      f = @record.fields('008').first
+      @record.append(f)
+      @indexer.instance_eval do
+        to_field "lang1", extract_marc('008[35-37]')
+        to_field "lang2", extract_marc('008[35-37]', :allow_duplicates=>true)
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["eng"], output['lang1']
+      assert_equal ["eng", "eng"], output['lang2']
+      assert_equal({}, @indexer.map_record(empty_record))
+    end
+    it "fails on an extra/misspelled argument to extract_marc" do
+      assert_raises(RuntimeError) do
+        @indexer.instance_eval do
+          to_field "foo", extract_marc("9999", :misspelled => "Who cares")
+        end
+      end
+    end
+    it "Marc21::trim_punctuation class method" do
+      assert_equal "one two three", Marc21.trim_punctuation("one two three")
+      assert_equal "one two three", Marc21.trim_punctuation("one two three,")
+      assert_equal "one two three", Marc21.trim_punctuation("one two three/")
+      assert_equal "one two three", Marc21.trim_punctuation("one two three;")
+      assert_equal "one two three", Marc21.trim_punctuation("one two three:")
+      assert_equal "one two three .", Marc21.trim_punctuation("one two three .")
+      assert_equal "one two three", Marc21.trim_punctuation("one two three.")
+      assert_equal "one two [three]", Marc21.trim_punctuation("one two [three]")
+      assert_equal "one two three", Marc21.trim_punctuation("one two three]")
+      assert_equal "one two three", Marc21.trim_punctuation("[one two three")
+      assert_equal "one two three", Marc21.trim_punctuation("[one two three]")
+      # This one was a bug before
+      assert_equal "Feminism and art", Marc21.trim_punctuation("Feminism and art.")
+    end
+    it "uses :translation_map" do
+      @indexer.instance_eval do
+        to_field "cataloging_agency", extract_marc("040a", :separator => nil, :translation_map => "marc_040a_translate_test")
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["Library of Congress"], output["cataloging_agency"]
+    end
+  end
+  describe "serialized_marc" do
+    it "serializes xml" do
+      @indexer.instance_eval do
+        to_field "marc_record", serialized_marc(:format => "xml")
+      end
+      output = @indexer.map_record(@record)
+      assert_length 1, output["marc_record"]
+      assert_kind_of String, output["marc_record"].first
+      roundtrip_record = MARC::XMLReader.new(StringIO.new(output["marc_record"].first)).first
+      assert_equal @record, roundtrip_record
+    end
+    it "serializes binary UUEncoded" do
+      @indexer.instance_eval do
+        to_field "marc_record", serialized_marc(:format => "binary")
+      end
+      output = @indexer.map_record(@record)
+      assert_length 1, output["marc_record"]
+      assert_kind_of String, output["marc_record"].first
+      decoded = Base64.decode64( output["marc_record"].first )
+      # just check the marc header for now
+      assert_start_with "02067cam a2200469", decoded
+    end
+    it "serializes binary raw" do
+      @indexer.instance_eval do
+        to_field "marc_record", serialized_marc(:format => "binary", :binary_escape => false)
+      end
+      output = @indexer.map_record(@record)
+      assert_length 1, output["marc_record"]
+      assert_kind_of String, output["marc_record"].first
+      # just check the marc header for now
+      assert_start_with "02067cam a2200469", output["marc_record"].first
+    end
+    it "serializes json" do
+      @indexer.instance_eval do
+        to_field "marc_record", serialized_marc(:format => "json")
+      end
+      output = @indexer.map_record(@record)
+      assert_length 1, output["marc_record"]
+      # okay, let's actually deserialize it, why not
+      hash = JSON.parse( output["marc_record"].first )
+      deserialized = MARC::Record.new_from_hash(hash)
+      assert_equal @record, deserialized
+    end
+  end
+  it "#extract_all_marc_values" do
+    @indexer.instance_eval do
+      to_field "text", extract_all_marc_values
+    end
+    output = @indexer.map_record(@record)
+    assert_length 13, output["text"]
+  end
+end

data/test/indexer/macros_test.rb ADDED Viewed

@@ -0,0 +1,40 @@
+require 'test_helper'
+describe "Indexer Macros:" do
+  before do
+    @indexer = Traject::Indexer.new
+    @record = MARC::Reader.new(support_file_path  "manufacturing_consent.marc").to_a.first
+  end
+  it "works with simple literal" do
+    @indexer.instance_eval do
+      extend Traject::Macros::Basic
+      to_field "source", literal("MY LIBRARY")
+    end
+    output = @indexer.map_record(@record)
+    assert_equal ["MY LIBRARY"], output["source"]
+  end
+  it "works with macro AND block" do
+    called = false
+    @indexer.instance_eval do
+      extend Traject::Macros::Basic
+      to_field "source", literal("MY LIBRARY") do |record, accumulator, context|
+        called = true
+        accumulator << "SECOND VALUE"
+      end
+    end
+    output = @indexer.map_record(@record)
+    assert called
+    assert_equal ["MY LIBRARY", "SECOND VALUE"], output["source"]
+  end
+end

data/test/indexer/map_record_test.rb ADDED Viewed

@@ -0,0 +1,209 @@
+require 'test_helper'
+describe "Traject::Indexer#map_record" do
+  before do
+    @indexer = Traject::Indexer.new
+    @record = MARC::Reader.new(support_file_path  "manufacturing_consent.marc").to_a.first
+  end
+  describe "with no indexing rules" do
+    it "returns empty hash" do
+      output = @indexer.map_record(@record)
+      assert_kind_of Hash, output
+      assert_empty output
+    end
+  end
+  describe "#to_field" do
+    it "works with block" do
+      called  = false
+      @indexer.to_field("title") do |record, accumulator|
+        assert_kind_of MARC::Record, record
+        assert_kind_of Array, accumulator
+        called = true # by the power of closure!
+        accumulator << "Some Title"
+      end
+      output = @indexer.map_record(@record)
+      assert called
+      assert_kind_of Hash, output
+      assert_equal ["Some Title"], output["title"]
+    end
+    it "works with a lambda arg" do
+      called  = false
+      logic = lambda do |record, accumulator|
+        assert_kind_of MARC::Record, record
+        assert_kind_of Array, accumulator
+        called = true # by the power of closure!
+        accumulator << "Some Title"
+      end
+      @indexer.to_field("title", logic)
+      output = @indexer.map_record(@record)
+      assert called
+      assert_kind_of Hash, output
+      assert_equal ["Some Title"], output["title"]
+    end
+    it "works with both lambda and Proc" do
+      block_called = false
+      lambda_arg = lambda do |record, accumulator|
+        accumulator << "Lambda-provided Value"
+      end
+      @indexer.to_field("title", lambda_arg) do |record, accumulator|
+        assert_includes accumulator, "Lambda-provided Value"
+        accumulator << "Block-provided Value"
+        block_called = true
+      end
+      output = @indexer.map_record(@record)
+      assert block_called
+      assert_includes output["title"], "Lambda-provided Value"
+      assert_includes output["title"], "Block-provided Value"
+    end
+  end
+  describe "multiple to_field blocks" do
+    it "get called in order" do
+      order = []
+      @indexer.to_field("title") do |rec, acc|
+        order << :first_one
+        acc << "First"
+      end
+      @indexer.to_field("title") do |rec, acc|
+        order << :second_one
+        acc << "Second"
+      end
+      output = @indexer.map_record(@record)
+      assert_equal [:first_one, :second_one], order
+      assert_equal ["First", "Second"], output["title"]
+    end
+  end
+  describe "context argument" do
+    it "is third argument to block" do
+      called = false
+      @indexer.to_field("title") do |record, accumulator, context|
+        called = true
+        assert_kind_of Traject::Indexer::Context, context
+        assert_kind_of Hash, context.clipboard
+        assert_kind_of Hash, context.output_hash
+        assert_same @record, record
+        assert_same record, context.source_record
+        assert_same @indexer.settings, context.settings
+      end
+      @indexer.map_record @record
+      assert called
+    end
+  end
+  describe "#each_record" do
+    it "is called with one-arg record" do
+      called = false
+      @indexer.each_record do |record|
+        called = true
+        assert_kind_of MARC::Record, record
+      end
+      @indexer.map_record(@record)
+      assert called, "each_record was called"
+    end
+    it "is called with two-arg record and context" do
+      called = false
+      @indexer.each_record do |record, context|
+        called = true
+        assert_kind_of MARC::Record, record
+        assert_kind_of Traject::Indexer::Context, context
+      end
+      @indexer.map_record(@record)
+      assert called, "each_record was called"
+    end
+    it "accepts lambda AND block" do
+      lambda_arg = lambda do |record, context|
+        context.output_hash["field"] ||= []
+        context.output_hash["field"] << "first"
+      end
+      @indexer.each_record(lambda_arg) do |record, context|
+        context.output_hash["field"] ||= []
+        context.output_hash["field"] << "second"
+      end
+      output = @indexer.map_record(@record)
+      assert_equal %w{first second}, output["field"]
+    end
+    it "is called in order with #to_field" do
+      @indexer.to_field("foo") {|record, accumulator| accumulator << "first"}
+      @indexer.each_record {|record, context| context.output_hash["foo"] << "second" }
+      @indexer.to_field("foo") {|record, accumulator| accumulator << "third"}
+      output = @indexer.map_record(@record)
+      assert_equal %w{first second third}, output["foo"]
+    end
+  end
+  describe "map_to_context!" do
+    before do
+      @context = Traject::Indexer::Context.new(:source_record => @record, :settings => @indexer.settings, :position => 10 )
+    end
+    it "passes context to indexing routines"  do
+      called = false
+      @indexer.to_field("title") do |record, accumulator, context|
+        called = true
+        assert_kind_of Traject::Indexer::Context, context
+        assert_same @context, context
+      end
+      context = @indexer.map_to_context!(@context)
+      assert_same @context, context
+      assert called, "Called mapping routine"
+    end
+    it "skips records" do
+      @indexer.to_field("beforeSkip") do |rec, acc|
+        acc << "Before"
+      end
+      @indexer.to_field('radical') do |rec, acc, context|
+        context.skip!("Chomsky!") if rec['245'].to_s =~ /Chomsky/
+      end
+      @indexer.to_field('afterSkip') do |rec, acc|
+        acc << "After. Should never happen"
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ['Before'], output['beforeSkip']
+      assert_nil output['afterSkip']
+    end
+  end
+end

data/test/indexer/read_write_test.rb ADDED Viewed

@@ -0,0 +1,101 @@
+require 'test_helper'
+# A little Traject Writer that just keeps everything
+# in an array, just added to settings for easy access
+memory_writer_class = Class.new do
+    def initialize(settings)
+      # store them in a class variable so we can test em later
+      @@last_writer_settings = @settings = settings
+      @settings["memory_writer.added"] = []
+    end
+    def put(hash)
+      @settings["memory_writer.added"] << hash
+    end
+    def close
+      @settings["memory_writer.closed"] = true
+    end
+  end
+describe "Traject::Indexer#process" do
+  before do
+    # no threading for these tests
+    @indexer = Traject::Indexer.new("processing_thread_pool" => nil)
+    @indexer.writer_class = memory_writer_class
+    @file = File.open(support_file_path "test_data.utf8.mrc")
+  end
+  it "works" do
+    # oops, this times_called counter isn't thread-safe under multi-threading
+    # is why this fails sometimes.
+    # fixed to be single-threaded for these tests.
+    times_called = 0
+    @indexer.to_field("title") do |record, accumulator, context|
+      times_called += 1
+      accumulator << "ADDED TITLE"
+      assert context.index_step, "Context has #index_step set"
+      assert_equal "title", context.index_step.field_name
+      assert context.logger, "Context knows #logger"
+      assert_equal times_called, context.position
+    end
+    return_value = @indexer.process( @file )
+    assert return_value, "Returns `true` on success"
+    # Grab the settings out of a class variable where we left em,
+    # as a convenient place to store outcomes so we can test em.
+    writer_settings = memory_writer_class.class_variable_get("@@last_writer_settings")
+    assert writer_settings["memory_writer.added"]
+    assert_equal 30, writer_settings["memory_writer.added"].length
+    assert_kind_of Traject::Indexer::Context, writer_settings["memory_writer.added"].first
+    assert_equal ["ADDED TITLE"], writer_settings["memory_writer.added"].first.output_hash["title"]
+    # logger provided in settings
+    assert writer_settings["logger"]
+    assert writer_settings["memory_writer.closed"]
+  end
+  require 'traject/null_writer'
+  it "calls after_processing after processing" do
+    @indexer = Traject::Indexer.new(
+      "writer_class_name" => "Traject::NullWriter"
+    )
+    @file = File.open(support_file_path "test_data.utf8.mrc")
+    called = []
+    @indexer.after_processing do
+      called << :one
+    end
+    @indexer.after_processing do
+      called << :two
+    end
+    @indexer.process(@file)
+    assert_equal [:one, :two], called, "Both after_processing hooks called, in order"
+  end
+  describe "demo_config.rb" do
+    before do
+      @indexer = Traject::Indexer.new(
+        "writer_class_name" => "Traject::NullWriter"
+      )
+    end
+    it "parses and loads" do
+      conf_path = support_file_path "demo_config.rb"
+      File.open(conf_path) do |file_io|
+        @indexer.instance_eval(file_io.read, conf_path)
+      end
+    end
+  end
+end