traject 3.0.0 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +3 -4
- data/CHANGES.md +65 -0
- data/README.md +9 -4
- data/doc/indexing_rules.md +5 -6
- data/doc/programmatic_use.md +25 -1
- data/doc/settings.md +4 -0
- data/doc/xml.md +12 -0
- data/lib/traject/indexer.rb +40 -4
- data/lib/traject/indexer/context.rb +45 -0
- data/lib/traject/indexer/step.rb +8 -12
- data/lib/traject/line_writer.rb +36 -4
- data/lib/traject/macros/marc21.rb +2 -2
- data/lib/traject/macros/marc21_semantics.rb +15 -12
- data/lib/traject/macros/nokogiri_macros.rb +9 -3
- data/lib/traject/nokogiri_reader.rb +17 -19
- data/lib/traject/oai_pmh_nokogiri_reader.rb +9 -3
- data/lib/traject/solr_json_writer.rb +167 -29
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/marc_languages.yaml +77 -48
- data/test/delimited_writer_test.rb +14 -16
- data/test/indexer/class_level_configuration_test.rb +127 -0
- data/test/indexer/context_test.rb +64 -1
- data/test/indexer/error_handler_test.rb +18 -0
- data/test/indexer/macros/macros_marc21_semantics_test.rb +4 -0
- data/test/indexer/nokogiri_indexer_test.rb +35 -0
- data/test/nokogiri_reader_test.rb +66 -3
- data/test/solr_json_writer_test.rb +175 -7
- data/test/test_support/date_resort_to_264.marc +1 -0
- data/traject.gemspec +4 -4
- metadata +37 -16
--- /dev/null
+++ b/data/test/indexer/class_level_configuration_test.rb
@@ -0,0 +1,127 @@
+require 'test_helper'
+
+describe "Class-level configuration of Indexer sub-class" do
+  # Declaring a class inline in minitest isn't great, this really is a globally
+  # available class now, other tests shouldn't re-use this class name. But it works
+  # for testing for now.
+  class TestIndexerSubclass < Traject::Indexer
+    configure do
+      settings do
+        provide "class_level", "TestIndexerSubclass"
+      end
+
+      to_field "field", literal("value")
+      each_record do |rec, context|
+        context.output_hash["from_each_record"] ||= []
+        context.output_hash["from_each_record"] << "value"
+      end
+    end
+
+    def self.default_settings
+      @default_settings ||= super.merge(
+        "set_by_default_setting_no_override" => "TestIndexerSubclass",
+        "set_by_default_setting" => "TestIndexerSubclass"
+      )
+    end
+  end
+
+
+  before do
+    @indexer = TestIndexerSubclass.new
+  end
+
+  it "uses class-level configuration" do
+    result = @indexer.map_record(Object.new)
+
+    assert_equal ['value'], result['field']
+    assert_equal ['value'], result['from_each_record']
+  end
+
+  it "uses class-level configuration and instance-level configuration" do
+    @indexer.configure do
+      to_field "field", literal("from-instance-config")
+      to_field "instance_field", literal("from-instance-config")
+    end
+
+    result = @indexer.map_record(Object.new)
+    assert_equal ['value', 'from-instance-config'], result['field']
+    assert_equal ['from-instance-config'], result["instance_field"]
+  end
+
+  describe "multiple class-level configure" do
+    class MultipleConfigureIndexer < Traject::Indexer
+      configure do
+        to_field "field", literal("value")
+      end
+      configure do
+        to_field "field", literal("value from second configure")
+        to_field "second_call", literal("value from second configure")
+      end
+    end
+
+    before do
+      @indexer = MultipleConfigureIndexer.new
+    end
+
+    it "lets you call class-level configure multiple times and aggregates" do
+      result = @indexer.map_record(Object.new)
+      assert_equal ['value', 'value from second configure'], result['field']
+      assert_equal ['value from second configure'], result['second_call']
+    end
+  end
+
+  describe "with multi-level subclass" do
+    class TestIndexerSubclassSubclass < TestIndexerSubclass
+      configure do
+        settings do
+          provide "class_level", "TestIndexerSubclassSubclass"
+        end
+
+        to_field "field", literal("from-sub-subclass")
+        to_field "subclass_field", literal("from-sub-subclass")
+      end
+
+      def self.default_settings
+        @default_settings ||= super.merge(
+          "set_by_default_setting" => "TestIndexerSubclassSubclass"
+        )
+      end
+
+    end
+
+    before do
+      @indexer = TestIndexerSubclassSubclass.new
+    end
+
+    it "lets subclass override settings 'provide'" do
+      skip("This would be nice but is currently architecturally hard")
+      assert_equal "TestIndexerSubclassSubclass", @indexer.settings["class_level"]
+    end
+
+    it "lets subclass override default settings" do
+      assert_equal "TestIndexerSubclassSubclass", @indexer.settings["set_by_default_setting"]
+      assert_equal "TestIndexerSubclass", @indexer.settings["set_by_default_setting_no_override"]
+    end
+
+    it "uses configuration from all inheritance" do
+      result = @indexer.map_record(Object.new)
+
+      assert_equal ['value', 'from-sub-subclass'], result['field']
+      assert_equal ['value'], result['from_each_record']
+      assert_equal ['from-sub-subclass'], result['subclass_field']
+    end
+
+    it "uses configuration from all inheritance plus instance" do
+      @indexer.configure do
+        to_field "field", literal("from-instance")
+        to_field "instance_field", literal("from-instance")
+      end
+
+      result = @indexer.map_record(Object.new)
+
+      assert_equal ['value', 'from-sub-subclass', 'from-instance'], result['field']
+      assert_equal ['from-instance'], result['instance_field']
+    end
+  end
+
+end
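
The class-level `configure` exercised above lets indexing rules live on a reusable `Traject::Indexer` subclass rather than being re-declared on each instance. A minimal sketch of hypothetical use (the class, field, and setting names here are illustrative, not from the gem):

```ruby
require 'traject'

# Hypothetical reusable indexer: rules declared at the class level apply
# to every instance, and each instance can still layer on its own config.
class MyAppIndexer < Traject::Indexer
  configure do
    settings do
      provide "solr.url", "http://localhost:8983/solr/my-core" # illustrative URL
    end
    to_field "source", literal("my_app")
  end
end

indexer = MyAppIndexer.new
indexer.configure do
  to_field "extra", literal("per-instance value")
end
indexer.map_record(Object.new) # any source record object, as in the tests above
```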

--- a/data/test/indexer/context_test.rb
+++ b/data/test/indexer/context_test.rb
@@ -38,8 +38,71 @@ describe "Traject::Indexer::Context" do

      assert_equal "<record ##{@position} (#{@input_name} ##{@position_in_input}), source_id:#{@record_001} output_id:output_id>", @context.record_inspect
    end
-
  end

+  describe "#add_output" do
+    before do
+      @context = Traject::Indexer::Context.new
+    end
+    it "adds one value to nil" do
+      @context.add_output(:key, "value")
+      assert_equal @context.output_hash, { "key" => ["value"] }
+    end
+
+    it "adds multiple values to nil" do
+      @context.add_output(:key, "value1", "value2")
+      assert_equal @context.output_hash, { "key" => ["value1", "value2"] }
+    end
+
+    it "adds one value to existing accumulator" do
+      @context.output_hash["key"] = ["value1"]
+      @context.add_output(:key, "value2")
+      assert_equal @context.output_hash, { "key" => ["value1", "value2"] }
+    end
+
+    it "uniqs by default" do
+      @context.output_hash["key"] = ["value1"]
+      @context.add_output(:key, "value1")
+      assert_equal @context.output_hash, { "key" => ["value1"] }
+    end
+
+    it "does not unique if allow_duplicate_values" do
+      @context.settings = { Traject::Indexer::ToFieldStep::ALLOW_DUPLICATE_VALUES => true }
+      @context.output_hash["key"] = ["value1"]
+
+      @context.add_output(:key, "value1")
+      assert_equal @context.output_hash, { "key" => ["value1", "value1"] }
+    end
+
+    it "ignores nil values by default" do
+      @context.add_output(:key, "value1", nil, "value2")
+      assert_equal @context.output_hash, { "key" => ["value1", "value2"] }
+    end
+
+    it "allows nil values if allow_nil_values" do
+      @context.settings = { Traject::Indexer::ToFieldStep::ALLOW_NIL_VALUES => true }

+      @context.add_output(:key, "value1", nil, "value2")
+      assert_equal @context.output_hash, { "key" => ["value1", nil, "value2"] }
+    end
+
+    it "ignores empty array by default" do
+      @context.add_output(:key)
+      @context.add_output(:key, nil)
+
+      assert_nil @context.output_hash["key"]
+    end
+
+    it "allows empty field if allow_empty_fields" do
+      @context.settings = { Traject::Indexer::ToFieldStep::ALLOW_EMPTY_FIELDS => true }
+
+      @context.add_output(:key, nil)
+      assert_equal @context.output_hash, { "key" => [] }
+    end
+
+    it "can add to multiple fields" do
+      @context.add_output(["field1", "field2"], "value1", "value2")
+      assert_equal @context.output_hash, { "field1" => ["value1", "value2"], "field2" => ["value1", "value2"] }
+    end
+  end
end
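
`Context#add_output`, tested above, is also convenient in hand-written indexing logic. A short sketch of hypothetical use inside an indexer configuration (field names are illustrative):

```ruby
require 'traject'

indexer = Traject::Indexer.new
indexer.configure do
  # add_output appends values to output_hash, creating the key as needed;
  # by default it uniqs values and drops nils (see the ALLOW_* settings
  # exercised in the tests above).
  each_record do |record, context|
    context.add_output("record_format", "xml")
    # one call can target several fields at once:
    context.add_output(["facet_a", "facet_b"], "value1", "value2")
  end
end
```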

--- a/data/test/indexer/error_handler_test.rb
+++ b/data/test/indexer/error_handler_test.rb
@@ -56,4 +56,22 @@ describe 'Custom mapping error handler' do

    assert_nil indexer.map_record({})
  end
+
+  it "uses logger from settings" do
+    desired_logger = Logger.new("/dev/null")
+    set_logger = nil
+    indexer.configure do
+      settings do
+        provide "logger", desired_logger
+        provide "mapping_rescue", -> (ctx, e) {
+          set_logger = ctx.logger
+        }
+      end
+      to_field 'id' do |_context, _exception|
+        raise 'this was always going to fail'
+      end
+    end
+    indexer.map_record({})
+    assert_equal desired_logger.object_id, set_logger.object_id
+  end
end
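
As the test above shows, the `mapping_rescue` setting accepts any callable taking `(context, exception)`, and the context now exposes the configured logger. A hedged sketch of a custom handler (the log message is illustrative):

```ruby
require 'traject'

indexer = Traject::Indexer.new
indexer.configure do
  settings do
    # Hypothetical handler: log the failing record and re-raise, aborting
    # the run instead of silently skipping the record.
    provide "mapping_rescue", lambda { |context, exception|
      context.logger.error("mapping failed for #{context.record_inspect}: #{exception}")
      raise exception
    }
  end
end
```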

--- a/data/test/indexer/macros/macros_marc21_semantics_test.rb
+++ b/data/test/indexer/macros/macros_marc21_semantics_test.rb
@@ -197,6 +197,10 @@ describe "Traject::Macros::Marc21Semantics" do
    # we take the first date. And need to deal with the u.
    assert_equal 1845, Marc21Semantics.publication_date(@record)
  end
+  it "resorts to 264c" do
+    @record = MARC::Reader.new(support_file_path "date_resort_to_264.marc").to_a.first
+    assert_equal 2015, Marc21Semantics.publication_date(@record)
+  end
  it "resorts to 260c" do
    @record = MARC::Reader.new(support_file_path "date_resort_to_260.marc").to_a.first
    assert_equal 1980, Marc21Semantics.publication_date(@record)

--- a/data/test/indexer/nokogiri_indexer_test.rb
+++ b/data/test/indexer/nokogiri_indexer_test.rb
@@ -109,6 +109,41 @@ describe "Traject::NokogiriIndexer" do
        result["name"].name == "name"
      })
    end
+  end

+  describe "xpath to attribute" do
+    let(:indexer) do
+      namespaces = @namespaces
+      Traject::Indexer::NokogiriIndexer.new("nokogiri.namespaces" => namespaces,
+                                            "nokogiri.each_record_xpath" => "//oai:record") do
+        to_field "status", extract_xpath("//oai:record/oai:header/@status")
+      end
+    end
+
+    let(:records) { Traject::NokogiriReader.new(StringIO.new(
+      <<-XML
+        <?xml version="1.0" encoding="UTF-8"?>
+        <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
+          <responseDate>2020-03-03T04:16:09Z</responseDate>
+          <request verb="ListRecords" metadataPrefix="marc21" set="blacklight" from="2020-03-02T20:47:11Z">https://na02.alma.exlibrisgroup.com/view/oai/01TULI_INST/request</request>
+          <ListRecords>
+            <record>
+              <header status="deleted">
+                <identifier>oai:alma.01TULI_INST:991025803889703811</identifier>
+                <datestamp>2020-03-03T03:54:35Z</datestamp>
+                <setSpec>blacklight</setSpec>
+                <setSpec>rapid_print_journals</setSpec>
+                <setSpec>blacklight_qa</setSpec>
+              </header>
+            </record>
+          </ListRecords>
+        </OAI-PMH>
+      XML
+    ), []).to_a }
+
+    it "extracts the correct attribute" do
+      statuses = indexer.map_record(records.first)["status"]
+      assert_equal ["deleted"], statuses
+    end
  end
end

--- a/data/test/nokogiri_reader_test.rb
+++ b/data/test/nokogiri_reader_test.rb
@@ -1,6 +1,12 @@
require 'test_helper'
require 'traject/nokogiri_reader'

+# Note that JRuby Nokogiri can treat namespaces differently than MRI nokogiri.
+# Particularly when we extract elements from a larger document with `each_record_xpath`,
+# and put them in their own document, in JRuby nokogiri the xmlns declarations
+# can end up on different elements than expected, although the document should
+# be semantically equivalent to an XML-namespace-aware processor. See:
+# https://github.com/sparklemotion/nokogiri/issues/1875
describe "Traject::NokogiriReader" do
  describe "with namespaces" do
    before do
@@ -80,8 +86,22 @@ describe "Traject::NokogiriReader" do
      assert yielded_records.length > 0

      expected_namespaces = {"xmlns"=>"http://example.org/top", "xmlns:a"=>"http://example.org/a", "xmlns:b"=>"http://example.org/b"}
-
-
+
+      if !Traject::Util.is_jruby?
+        yielded_records.each do |rec|
+          assert_equal expected_namespaces, rec.namespaces
+        end
+      else
+        # jruby nokogiri shuffles things around, all we can really do is test that the namespaces
+        # are somewhere in the doc :( We rely on other tests to test semantic equivalence.
+        yielded_records.each do |rec|
+          assert_equal expected_namespaces, rec.collect_namespaces
+        end
+
+        whole_doc = Nokogiri::XML.parse(File.open(support_file_path("namespace-test.xml")))
+        whole_doc.xpath("//mytop:record", mytop: "http://example.org/top").each_with_index do |original_el, i|
+          assert ns_semantic_equivalent_xml?(original_el, yielded_records[i])
+        end
      end
    end
  end
@@ -114,6 +134,16 @@ describe "Traject::NokogiriReader" do
    end
  end

+  describe "strict_mode" do
+    it "raises on non-well-formed" do
+      # invalid because two sibling root nodes, XML requires one root node
+      reader = Traject::NokogiriReader.new(StringIO.new("<doc></doc><doc></doc>"), {"nokogiri.strict_mode" => "true" })
+      assert_raises(Nokogiri::XML::SyntaxError) {
+        reader.each { |r| }
+      }
+    end
+  end
+

  def shared_tests
    @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
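
For reference, the `nokogiri.strict_mode` setting introduced above makes the reader raise `Nokogiri::XML::SyntaxError` on ill-formed input instead of parsing what it can. A minimal sketch (the file name and xpath are illustrative):

```ruby
require 'traject/nokogiri_reader'

# Hypothetical reader setup: strict_mode turns malformed XML into an
# immediate Nokogiri::XML::SyntaxError rather than best-effort parsing.
reader = Traject::NokogiriReader.new(
  File.open("records.xml"),
  "nokogiri.each_record_xpath" => "//record",
  "nokogiri.strict_mode"       => "true"
)
reader.each { |doc| puts doc.root.name }
```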
@@ -139,7 +169,40 @@ describe "Traject::NokogiriReader" do

    assert_length manually_extracted.size, yielded_records
    assert yielded_records.all? {|r| r.kind_of? Nokogiri::XML::Document }
-
+
+    expected_xml = manually_extracted
+    actual_xml = yielded_records.collect(&:root)
+
+    expected_xml.size.times do |i|
+      if !Traject::Util.is_jruby?
+        assert_equal expected_xml[i-1].to_xml, actual_xml[i-1].to_xml
+      else
+        # jruby shuffles the xmlns declarations around, but they should
+        # be semantically equivalent to a namespace-aware processor
+        assert ns_semantic_equivalent_xml?(expected_xml[i-1], actual_xml[i-1])
+      end
+    end
+  end
+
+  # Jruby nokogiri can shuffle around where the `xmlns:ns` declarations appear, although it
+  # _ought_ not to be semantically different for a namespace-aware parser -- nodes are still in
+  # same namespaces. JRuby may differ from what MRI does with same code, and may differ from
+  # the way an element appeared in input when extracting records from a larger input doc.
+  # There isn't much we can do about this, but we can write a recursive method
+  # that hopefully compares XML to make sure it really is semantically equivalent to
+  # a namespace-aware processor, and hope we got that right.
+  def ns_semantic_equivalent_xml?(noko_a, noko_b)
+    noko_a = noko_a.root if noko_a.kind_of?(Nokogiri::XML::Document)
+    noko_b = noko_b.root if noko_b.kind_of?(Nokogiri::XML::Document)
+
+    noko_a.name == noko_b.name &&
+      noko_a.namespace&.prefix == noko_b.namespace&.prefix &&
+      noko_a.namespace&.href == noko_b.namespace&.href &&
+      noko_a.attributes == noko_b.attributes &&
+      noko_a.children.length == noko_b.children.length &&
+      noko_a.children.each_with_index.all? do |a_child, index|
+        ns_semantic_equivalent_xml?(a_child, noko_b.children[index])
+      end
  end

  describe "without each_record_xpath" do

--- a/data/test/solr_json_writer_test.rb
+++ b/data/test/solr_json_writer_test.rb
@@ -137,6 +137,26 @@ describe "Traject::SolrJsonWriter" do
    assert_length 1, JSON.parse(post_args[1][1]), "second batch posted with last remaining doc"
  end

+  it "retries batch as individual records on failure" do
+    @writer = create_writer("solr_writer.batch_size" => 2, "solr_writer.max_skipped" => 10)
+    @fake_http_client.response_status = 500
+
+    2.times do |i|
+      @writer.put context_with({"id" => "doc_#{i}", "key" => "value"})
+    end
+    @writer.close
+
+    # 1 batch, then 2 for re-trying each individually
+    assert_length 3, @fake_http_client.post_args
+
+    batch_update = @fake_http_client.post_args.first
+    assert_length 2, JSON.parse(batch_update[1])
+
+    individual_update1, individual_update2 = @fake_http_client.post_args[1], @fake_http_client.post_args[2]
+    assert_length 1, JSON.parse(individual_update1[1])
+    assert_length 1, JSON.parse(individual_update2[1])
+  end
+
  it "can #flush" do
    2.times do |i|
      doc = {"id" => "doc_#{i}", "key" => "value"}
@@ -150,15 +170,137 @@ describe "Traject::SolrJsonWriter" do
    assert_length 1, @fake_http_client.post_args, "Has flushed to solr"
  end

-  it "
-
-
-
+  it "defaults to not setting basic authentication" do
+    settings = { "solr.url" => "http://example.com/solr/foo" }
+    writer = Traject::SolrJsonWriter.new(settings)
+    auth = writer.instance_variable_get("@http_client")
+      .www_auth.basic_auth.instance_variable_get("@auth")
+    assert(auth.empty?)
+  end
+
+  it "allows basic authentication setup" do
+    settings = {
+      "solr.url" => "http://example.com/solr/foo",
+      "solr_writer.basic_auth_user" => "foo",
+      "solr_writer.basic_auth_password" => "bar",
+    }
+
+    writer = Traject::SolrJsonWriter.new(settings)
+    auth = writer.instance_variable_get("@http_client")
+      .www_auth.basic_auth.instance_variable_get("@auth")
+    assert(!auth.empty?)
+  end
+
+  describe "commit" do
+    it "commits on close when set" do
+      @writer = create_writer("solr.url" => "http://example.com", "solr_writer.commit_on_close" => "true")
+      @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
+      @writer.close
+
+      last_solr_get = @fake_http_client.get_args.last
+
+      assert_equal "http://example.com/update/json?commit=true", last_solr_get[0]
+    end
+
+    it "commits on close with commit_solr_update_args" do
+      @writer = create_writer(
+        "solr.url" => "http://example.com",
+        "solr_writer.commit_on_close" => "true",
+        "solr_writer.commit_solr_update_args" => { softCommit: true }
+      )
+      @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
+      @writer.close
+
+      last_solr_get = @fake_http_client.get_args.last
+
+      assert_equal "http://example.com/update/json?softCommit=true", last_solr_get[0]
+    end
+
+    it "can manually send commit" do
+      @writer = create_writer("solr.url" => "http://example.com")
+      @writer.commit
+
+      last_solr_get = @fake_http_client.get_args.last
+      assert_equal "http://example.com/update/json?commit=true", last_solr_get[0]
+    end
+
+    it "can manually send commit with specified args" do
+      @writer = create_writer("solr.url" => "http://example.com", "solr_writer.commit_solr_update_args" => { softCommit: true })
+      @writer.commit(commit: true, optimize: true, waitFlush: false)
+      last_solr_get = @fake_http_client.get_args.last
+      assert_equal "http://example.com/update/json?commit=true&optimize=true&waitFlush=false", last_solr_get[0]
+    end
+
+    it "uses commit_solr_update_args settings by default" do
+      @writer = create_writer(
+        "solr.url" => "http://example.com",
+        "solr_writer.commit_solr_update_args" => { softCommit: true }
+      )
+      @writer.commit
+
+      last_solr_get = @fake_http_client.get_args.last
+      assert_equal "http://example.com/update/json?softCommit=true", last_solr_get[0]
    end

-
+    it "overrides commit_solr_update_args with method arg" do
+      @writer = create_writer(
+        "solr.url" => "http://example.com",
+        "solr_writer.commit_solr_update_args" => { softCommit: true, foo: "bar" }
+      )
+      @writer.commit(commit: true)

-
-
+      last_solr_get = @fake_http_client.get_args.last
+      assert_equal "http://example.com/update/json?commit=true", last_solr_get[0]
+    end
+  end
+
+  describe "solr_writer.solr_update_args" do
+    before do
+      @writer = create_writer("solr_writer.solr_update_args" => { softCommit: true })
+    end
+
+    it "sends update args" do
+      @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
+      @writer.close
+
+      assert_equal 1, @fake_http_client.post_args.count
+
+      post_args = @fake_http_client.post_args.first
+
+      assert_equal "http://example.com/solr/update/json?softCommit=true", post_args[0]
+    end
+
+    it "sends update args with delete" do
+      @writer.delete("test-id")
+      @writer.close
+
+      assert_equal 1, @fake_http_client.post_args.count
+
+      post_args = @fake_http_client.post_args.first
+
+      assert_equal "http://example.com/solr/update/json?softCommit=true", post_args[0]
+    end
+
+    it "sends update args on individual-retry after batch failure" do
+      @writer = create_writer(
+        "solr_writer.batch_size" => 2,
+        "solr_writer.max_skipped" => 10,
+        "solr_writer.solr_update_args" => { softCommit: true }
+      )
+      @fake_http_client.response_status = 500
+
+      2.times do |i|
+        @writer.put context_with({"id" => "doc_#{i}", "key" => "value"})
+      end
+      @writer.close
+
+      # 1 batch, then 2 for re-trying each individually
+      assert_length 3, @fake_http_client.post_args
+
+      individual_update1, individual_update2 = @fake_http_client.post_args[1], @fake_http_client.post_args[2]
+      assert_equal "http://example.com/solr/update/json?softCommit=true", individual_update1[0]
+      assert_equal "http://example.com/solr/update/json?softCommit=true", individual_update2[0]
+    end
+  end

  describe "skipped records" do
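
Taken together, the commit behavior tested above is driven entirely by settings, with `#commit` also available for manual use. A sketch under those assumptions (the URL is illustrative):

```ruby
require 'traject'

# Hypothetical writer setup: soft-commit on every update request, and
# send a commit when the writer closes.
writer = Traject::SolrJsonWriter.new(
  "solr.url"                            => "http://localhost:8983/solr/core",
  "solr_writer.solr_update_args"        => { softCommit: true },
  "solr_writer.commit_on_close"         => "true",
  "solr_writer.commit_solr_update_args" => { softCommit: true }
)

# A manual commit; query params passed here override the configured args.
writer.commit(commit: true, optimize: true)
```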
@@ -225,6 +367,32 @@ describe "Traject::SolrJsonWriter" do
      logged = strio.string
      assert_includes logged, 'ArgumentError: bad stuff'
    end
+  end

+  describe "#delete" do
+    it "deletes" do
+      id = "123456"
+      @writer.delete(id)
+
+      post_args = @fake_http_client.post_args.first
+      assert_equal "http://example.com/solr/update/json", post_args[0]
+      assert_equal JSON.generate({"delete" => id}), post_args[1]
+    end
+
+    it "raises on non-200 http response" do
+      @fake_http_client.response_status = 500
+      assert_raises(RuntimeError) do
+        @writer.delete("12345")
+      end
+    end
+  end
+
+  describe "#delete_all!" do
+    it "deletes all" do
+      @writer.delete_all!
+      post_args = @fake_http_client.post_args.first
+      assert_equal "http://example.com/solr/update/json", post_args[0]
+      assert_equal JSON.generate({"delete" => { "query" => "*:*"}}), post_args[1]
+    end
  end
end
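
Finally, a short sketch of the new delete APIs shown in the last hunk (the URL and id are illustrative):

```ruby
require 'traject'

writer = Traject::SolrJsonWriter.new("solr.url" => "http://localhost:8983/solr/core")

writer.delete("doc-id-123")  # posts {"delete" => "doc-id-123"} to update/json
writer.delete_all!           # posts {"delete" => {"query" => "*:*"}}
writer.commit                # then commit, e.g. update/json?commit=true
```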