RubyGems - traject - Versions diffs - 2.0.0-java - Mend

traject 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (104) hide show

checksums.yaml +7 -0
data/.gitignore +18 -0
data/.travis.yml +27 -0
data/.yardopts +3 -0
data/Gemfile +12 -0
data/LICENSE.txt +20 -0
data/README.md +461 -0
data/Rakefile +21 -0
data/bench/bench.rb +30 -0
data/bin/traject +16 -0
data/doc/batch_execution.md +243 -0
data/doc/extending.md +190 -0
data/doc/indexing_rules.md +265 -0
data/doc/other_commands.md +47 -0
data/doc/settings.md +101 -0
data/lib/tasks/load_maps.rake +48 -0
data/lib/traject.rb +11 -0
data/lib/traject/command_line.rb +301 -0
data/lib/traject/csv_writer.rb +34 -0
data/lib/traject/debug_writer.rb +47 -0
data/lib/traject/delimited_writer.rb +110 -0
data/lib/traject/indexer.rb +613 -0
data/lib/traject/indexer/settings.rb +110 -0
data/lib/traject/json_writer.rb +51 -0
data/lib/traject/line_writer.rb +63 -0
data/lib/traject/macros/basic.rb +9 -0
data/lib/traject/macros/marc21.rb +223 -0
data/lib/traject/macros/marc21_semantics.rb +584 -0
data/lib/traject/macros/marc_format_classifier.rb +197 -0
data/lib/traject/marc_extractor.rb +410 -0
data/lib/traject/marc_reader.rb +89 -0
data/lib/traject/mock_reader.rb +97 -0
data/lib/traject/ndj_reader.rb +40 -0
data/lib/traject/null_writer.rb +22 -0
data/lib/traject/qualified_const_get.rb +40 -0
data/lib/traject/solr_json_writer.rb +277 -0
data/lib/traject/thread_pool.rb +161 -0
data/lib/traject/translation_map.rb +267 -0
data/lib/traject/util.rb +52 -0
data/lib/traject/version.rb +3 -0
data/lib/traject/yaml_writer.rb +9 -0
data/lib/translation_maps/lcc_top_level.yaml +26 -0
data/lib/translation_maps/marc_genre_007.yaml +9 -0
data/lib/translation_maps/marc_genre_leader.yaml +22 -0
data/lib/translation_maps/marc_geographic.yaml +589 -0
data/lib/translation_maps/marc_instruments.yaml +102 -0
data/lib/translation_maps/marc_languages.yaml +490 -0
data/test/debug_writer_test.rb +38 -0
data/test/delimited_writer_test.rb +104 -0
data/test/indexer/each_record_test.rb +59 -0
data/test/indexer/macros_marc21_semantics_test.rb +391 -0
data/test/indexer/macros_marc21_test.rb +190 -0
data/test/indexer/macros_test.rb +40 -0
data/test/indexer/map_record_test.rb +209 -0
data/test/indexer/read_write_test.rb +101 -0
data/test/indexer/settings_test.rb +152 -0
data/test/indexer/to_field_test.rb +77 -0
data/test/marc_extractor_test.rb +412 -0
data/test/marc_format_classifier_test.rb +98 -0
data/test/marc_reader_test.rb +110 -0
data/test/solr_json_writer_test.rb +248 -0
data/test/test_helper.rb +90 -0
data/test/test_support/245_no_ab.marc +1 -0
data/test/test_support/880_with_no_6.utf8.marc +1 -0
data/test/test_support/bad_subfield_code.marc +1 -0
data/test/test_support/bad_utf_byte.utf8.marc +1 -0
data/test/test_support/date_resort_to_260.marc +1 -0
data/test/test_support/date_type_r_missing_date2.marc +1 -0
data/test/test_support/date_with_u.marc +1 -0
data/test/test_support/demo_config.rb +155 -0
data/test/test_support/emptyish_record.marc +1 -0
data/test/test_support/escaped_character_reference.marc8.marc +1 -0
data/test/test_support/george_eliot.marc +1 -0
data/test/test_support/hebrew880s.marc +1 -0
data/test/test_support/louis_armstrong.marc +1 -0
data/test/test_support/manufacturing_consent.marc +1 -0
data/test/test_support/manuscript_online_thesis.marc +1 -0
data/test/test_support/microform_online_conference.marc +1 -0
data/test/test_support/multi_era.marc +1 -0
data/test/test_support/multi_geo.marc +1 -0
data/test/test_support/musical_cage.marc +1 -0
data/test/test_support/nature.marc +1 -0
data/test/test_support/one-marc8.mrc +1 -0
data/test/test_support/online_only.marc +1 -0
data/test/test_support/packed_041a_lang.marc +1 -0
data/test/test_support/test_data.utf8.json +30 -0
data/test/test_support/test_data.utf8.marc.xml +2609 -0
data/test/test_support/test_data.utf8.mrc +1 -0
data/test/test_support/test_data.utf8.mrc.gz +0 -0
data/test/test_support/the_business_ren.marc +1 -0
data/test/translation_map_test.rb +225 -0
data/test/translation_maps/bad_ruby.rb +8 -0
data/test/translation_maps/bad_yaml.yaml +1 -0
data/test/translation_maps/both_map.rb +1 -0
data/test/translation_maps/both_map.yaml +1 -0
data/test/translation_maps/default_literal.rb +10 -0
data/test/translation_maps/default_passthrough.rb +10 -0
data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
data/test/translation_maps/properties_map.properties +5 -0
data/test/translation_maps/ruby_map.rb +10 -0
data/test/translation_maps/translate_array_test.yaml +8 -0
data/test/translation_maps/yaml_map.yaml +7 -0
data/traject.gemspec +47 -0
metadata +382 -0

data/test/marc_format_classifier_test.rb ADDED Viewed

@@ -0,0 +1,98 @@
+require 'test_helper'
+require 'traject/macros/marc_format_classifier'
+MarcFormatClassifier = Traject::Macros::MarcFormatClassifier
+def classifier_for(filename)
+  record = MARC::Reader.new(support_file_path  filename).to_a.first
+  return MarcFormatClassifier.new( record  )
+end
+describe "MarcFormatClassifier" do
+  it "returns 'Print' when there's no other data" do
+    assert_equal ['Print'],  MarcFormatClassifier.new( empty_record  ).formats
+  end
+  describe "genre" do
+    # We don't have the patience to test every case, just a sampling
+    it "says book" do
+      assert_equal ["Book"], classifier_for("manufacturing_consent.marc").genre
+    end
+    it "says Book for a weird one" do
+      assert_equal ["Book"], classifier_for("microform_online_conference.marc").genre
+    end
+    it "says Musical Recording" do
+      assert_equal ["Musical Recording"], classifier_for("musical_cage.marc").genre
+    end
+    it "says Journal" do
+      assert_equal ["Journal/Newspaper"], classifier_for("the_business_ren.marc").genre
+    end
+  end
+  describe "print?" do
+    it "says print when it is" do
+      assert classifier_for("manufacturing_consent.marc").print?
+    end
+    it "does not say print for online only" do
+      assert ! classifier_for("online_only.marc").print?
+    end
+  end
+  describe "online?" do
+    it "says online when it is" do
+      assert classifier_for("online_only.marc").online?
+      assert classifier_for("microform_online_conference.marc").online?
+      assert classifier_for("manuscript_online_thesis.marc").online?
+    end
+    it "does not say online for a print only" do
+      assert ! classifier_for("manufacturing_consent.marc").online?
+    end
+  end
+  describe "microform?" do
+    it "says microform when it is" do
+      assert classifier_for("microform_online_conference.marc").microform?
+    end
+    it "does not say microform when it ain't" do
+       assert ! classifier_for("manufacturing_consent.marc").microform?
+       assert ! classifier_for("online_only.marc").microform?
+    end
+    it "catches microform in an 007" do
+      assert classifier_for("nature.marc").microform?
+    end
+  end
+  describe "conference?" do
+    it "says conference when it is" do
+      assert classifier_for("microform_online_conference.marc").proceeding?
+    end
+    it "does not say conference when it ain't" do
+      assert ! classifier_for("manufacturing_consent.marc").proceeding?
+      assert ! classifier_for("online_only.marc").proceeding?
+    end
+  end
+  describe "thesis?" do
+    it "says thesis when it is" do
+      assert classifier_for("manuscript_online_thesis.marc").thesis?
+    end
+    it "does not say thesis when it ain't" do
+      assert ! classifier_for("manufacturing_consent.marc").thesis?
+      assert ! classifier_for("online_only.marc").thesis?
+    end
+  end
+  describe "manuscript_archive?" do
+    it "says manuscript when it is" do
+      assert classifier_for("manuscript_online_thesis.marc").manuscript_archive?
+    end
+    it "does not say manuscript when it ain't" do
+      assert ! classifier_for("manufacturing_consent.marc").manuscript_archive?
+      assert ! classifier_for("online_only.marc").manuscript_archive?
+    end
+  end
+end

data/test/marc_reader_test.rb ADDED Viewed

@@ -0,0 +1,110 @@
+# Encoding: UTF-8
+require 'test_helper'
+require 'traject/marc_reader'
+require 'marc'
+describe "Traject::MarcReader" do
+  it "reads XML" do
+    file = File.new(support_file_path "test_data.utf8.marc.xml")
+    settings = Traject::Indexer::Settings.new("marc_source.type" => "xml")
+    reader = Traject::MarcReader.new(file, settings)
+    array = reader.to_a
+    assert_equal 30, array.length
+  end
+  describe "MARC binary" do
+    it "reads" do
+      file = File.new(support_file_path "test_data.utf8.mrc")
+      settings = Traject::Indexer::Settings.new() # binary type is default
+      reader = Traject::MarcReader.new(file, settings)
+      array = reader.to_a
+      assert_equal 30, array.length
+      first = array.first
+      assert_kind_of MARC::Record, first
+      assert first['245']['a'].encoding.name, "UTF-8"
+      assert_equal "Fikr-i Ayāz /", first['245']['a']
+    end
+    it "reads Marc binary in Marc8 encoding, transcoding to UTF-8" do
+      file = File.new(support_file_path("one-marc8.mrc"))
+      settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC-8")
+      reader = Traject::MarcReader.new(file, settings)
+      array = reader.to_a
+      assert_length 1, array
+      assert_kind_of MARC::Record, array.first
+      a245a = array.first['245']['a']
+      assert a245a.encoding.name, "UTF-8"
+      assert a245a.valid_encoding?
+      assert_equal "Por uma outra globalização :", a245a
+    end
+    it "replaces unicode character reference in Marc8 transcode" do
+      file = File.new(support_file_path("escaped_character_reference.marc8.marc"))
+      settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC-8") # binary type is default
+      record = Traject::MarcReader.new(file, settings).to_a.first
+      assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record['260']['a']
+    end
+    it "raises on unrecognized encoding for binary type" do
+      file = File.new(support_file_path "one-marc8.mrc")
+      settings = Traject::Indexer::Settings.new("marc_source.encoding" => "ADFADFADF")
+      assert_raises(ArgumentError) do
+        record = Traject::MarcReader.new(file, settings).to_a.first
+      end
+    end
+    it "replaces bad byte in UTF8 marc binary" do
+      file = File.new(support_file_path "bad_utf_byte.utf8.marc")
+      settings = Traject::Indexer::Settings.new() # binary type is default
+      reader = Traject::MarcReader.new(file, settings)
+      record = reader.to_a.first
+      value = record['300']['a']
+      assert_equal value.encoding.name, "UTF-8"
+      assert value.valid_encoding?, "Has valid encoding"
+      assert_equal "This is a bad byte: '\uFFFD' and another: '\uFFFD'", value
+    end
+  end
+  it "reads JSON" do
+    file = File.new(support_file_path "test_data.utf8.json")
+    settings = Traject::Indexer::Settings.new("marc_source.type" => "json")
+    reader = Traject::MarcReader.new(file, settings)
+    array = reader.to_a
+    assert_equal 30, array.length
+    first = array.first
+    assert_kind_of MARC::Record, first
+    assert first['245']['a'].encoding.name, "UTF-8"
+    assert_equal "Fikr-i Ayāz /", first['245']['a']
+  end
+end

data/test/solr_json_writer_test.rb ADDED Viewed

@@ -0,0 +1,248 @@
+require 'test_helper'
+require 'httpclient'
+require 'traject/solr_json_writer'
+require 'thread'
+require 'json'
+require 'stringio'
+require 'logger'
+# Some basic tests, using a mocked HTTPClient so we can see what it did --
+# these tests do not run against a real solr server at present.
+describe "Traject::SolrJsonWriter" do
+  #######
+  # A bunch of utilities to help testing
+  #######
+  class FakeHTTPClient
+    # Always reply with this status, normally 200, can
+    # be reset for testing error conditions.
+    attr_accessor :response_status
+    attr_accessor :allow_update_json_path
+    def initialize(*args)
+      @post_args = []
+      @get_args  = []
+      @response_status = 200
+      @allow_update_json_path = true
+      @mutex = Monitor.new
+    end
+    def post(*args)
+      @mutex.synchronize do
+        @post_args << args
+      end
+      resp = HTTP::Message.new_response("")
+      resp.status = self.response_status
+      return resp
+    end
+    def get (*args)
+      @mutex.synchronize do
+        @get_args << args
+      end
+      resp = HTTP::Message.new_response("")
+      resp.status = self.response_status
+      if args.first.end_with?("/update/json") && ! self.allow_update_json_path
+        # Need to test auto-detection of /update/json being available
+        resp.status = 404
+      end
+      return resp
+    end
+    def post_args
+      @mutex.synchronize do
+        @post_args.dup
+      end
+    end
+    def get_args
+      @mutex.synchronize do
+        @get_args.dup
+      end
+    end
+    # Everything else, just return nil please
+    def method_missing(*args)
+    end
+  end
+  def context_with(hash)
+    Traject::Indexer::Context.new(:output_hash => hash)
+  end
+  def create_writer(settings = {})
+    settings = {
+      "solr.url" => "http://example.com/solr",
+      "solr_json_writer.http_client" => FakeHTTPClient.new
+      }.merge!(settings)
+    @fake_http_client = settings["solr_json_writer.http_client"]
+    writer = Traject::SolrJsonWriter.new(settings)
+    return writer
+  end
+  # Returns a standard-library Logger that writes to the given IO object,
+  # so a test can read back what was logged.
+  #
+  # Usage:
+  #   strio = StringIO.new
+  #   logger_to_strio(strio)
+  #
+  # Later, check strio.string for the logged contents.
+  def logger_to_strio(strio)
+    # Yell makes this hard, so use an ordinary Logger instead; that is
+    # thought to be okay here.
+    Logger.new(strio)
+  end
+  #########
+  # Actual tests
+  #########
+  before do
+    @writer = create_writer
+  end
+  it "defaults to 1 bg thread" do
+    assert_equal 1, @writer.thread_pool_size
+  end
+  it "adds a document" do
+    @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
+    @writer.close
+    post_args = @fake_http_client.post_args.first
+    refute_nil post_args
+    assert_equal "http://example.com/solr/update/json", post_args[0]
+    refute_nil post_args[1]
+    posted_json = JSON.parse(post_args[1])
+    assert_equal [{"id" => "one", "key" => ["value1", "value2"]}], posted_json
+  end
+  it "adds more than a batch in batches" do
+    (Traject::SolrJsonWriter::DEFAULT_BATCH_SIZE + 1).times do |i|
+      doc = {"id" => "doc_#{i}", "key" => "value"}
+      @writer.put context_with(doc)
+    end
+    @writer.close
+    post_args = @fake_http_client.post_args
+    assert_length 2, post_args, "Makes two posts to Solr for two batches"
+    assert_length Traject::SolrJsonWriter::DEFAULT_BATCH_SIZE, JSON.parse(post_args[0][1]), "first batch posted with batch size docs"
+    assert_length 1, JSON.parse(post_args[1][1]), "second batch posted with last remaining doc"
+  end
+  it "commits on close when set" do
+    @writer = create_writer("solr.url" => "http://example.com", "solr_writer.commit_on_close" => "true")
+    @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
+    @writer.close
+    last_solr_get = @fake_http_client.get_args.last
+    assert_equal "http://example.com/update/json", last_solr_get[0]
+    assert_equal( {"commit" => "true"}, last_solr_get[1] )
+  end
+  describe "skipped records" do
+    it "skips and reports under max_skipped" do
+      strio = StringIO.new
+      @writer = create_writer("solr_writer.max_skipped" => 10, "logger" => logger_to_strio(strio))
+      @fake_http_client.response_status = 500
+      10.times do |i|
+        @writer.put context_with("id" => "doc_#{i}", "key" => "value")
+      end
+      @writer.close
+      assert_equal 10, @writer.skipped_record_count
+      logged = strio.string
+      10.times do |i|
+        assert_match /ERROR.*Could not add record doc_#{i} at source file position : Solr error response: 500/, logged
+      end
+    end
+    it "raises when skipped more than max_skipped" do
+      @writer = create_writer("solr_writer.max_skipped" => 5)
+      @fake_http_client.response_status = 500
+      e = assert_raises(RuntimeError) do
+        6.times do |i|
+          @writer.put context_with("id" => "doc_#{i}", "key" => "value")
+        end
+        @writer.close
+      end
+      assert_includes e.message, "Exceeded maximum number of skipped records"
+    end
+    it "raises on one skipped record when max_skipped is 0" do
+      @writer = create_writer("solr_writer.max_skipped" => 0)
+      @fake_http_client.response_status = 500
+      e = assert_raises(RuntimeError) do
+        @writer.put context_with("id" => "doc_1", "key" => "value")
+        @writer.close
+      end
+    end
+  end
+  describe "auto-discovers proper update path" do
+    it "finds /update/json" do
+      assert_equal "http://example.com/solr/update/json", @writer.determine_solr_update_url
+    end
+    it "resorts to plain /update" do
+      @fake_http_client = FakeHTTPClient.new
+      @fake_http_client.allow_update_json_path = false
+      @writer = create_writer("solr.url" => "http://example.com/solr",
+        "solr_json_writer.http_client" => @fake_http_client)
+      assert_equal "http://example.com/solr/update", @writer.determine_solr_update_url
+    end
+  end
+  describe "Record id from context" do
+    before do
+      @record = MARC::Reader.new(support_file_path('test_data.utf8.mrc')).first
+      @context = Traject::Indexer::Context.new
+      @writer = create_writer
+      @record_001 = "   00282214 " # from the mrc file
+    end
+    it "gets it from 001" do
+      @context.source_record = @record
+      assert_equal @record_001, @writer.record_id_from_context(@context)
+    end
+    it "gets it from the id" do
+      @context.output_hash['id'] = 'the_record_id'
+      assert_equal 'the_record_id', @writer.record_id_from_context(@context)
+    end
+    it "gets it from both 001 and id" do
+      @context.output_hash['id'] = 'the_record_id'
+      @context.source_record = @record
+      assert_equal [@record_001, 'the_record_id'].join('/'), @writer.record_id_from_context(@context)
+    end
+  end
+end