RubyGems - traject - Versions diffs - 2.3.4 → 3.0.0.alpha.1 - Mend

traject 2.3.4 → 3.0.0.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

checksums.yaml +5 -5
data/.travis.yml +16 -9
data/CHANGES.md +74 -1
data/Gemfile +2 -1
data/README.md +104 -53
data/Rakefile +8 -1
data/doc/indexing_rules.md +79 -63
data/doc/programmatic_use.md +218 -0
data/doc/settings.md +28 -1
data/doc/xml.md +134 -0
data/lib/traject.rb +5 -0
data/lib/traject/array_writer.rb +34 -0
data/lib/traject/command_line.rb +18 -22
data/lib/traject/debug_writer.rb +2 -5
data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
data/lib/traject/indexer.rb +321 -92
data/lib/traject/indexer/context.rb +39 -13
data/lib/traject/indexer/marc_indexer.rb +30 -0
data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
data/lib/traject/indexer/settings.rb +36 -53
data/lib/traject/indexer/step.rb +27 -33
data/lib/traject/macros/marc21.rb +37 -12
data/lib/traject/macros/nokogiri_macros.rb +43 -0
data/lib/traject/macros/transformation.rb +162 -0
data/lib/traject/marc_extractor.rb +2 -0
data/lib/traject/ndj_reader.rb +1 -1
data/lib/traject/nokogiri_reader.rb +179 -0
data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
data/lib/traject/solr_json_writer.rb +19 -12
data/lib/traject/thread_pool.rb +13 -0
data/lib/traject/util.rb +14 -2
data/lib/traject/version.rb +1 -1
data/test/debug_writer_test.rb +3 -3
data/test/delimited_writer_test.rb +3 -3
data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
data/test/indexer/context_test.rb +23 -13
data/test/indexer/error_handler_test.rb +59 -0
data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
data/test/indexer/macros/to_field_test.rb +2 -2
data/test/indexer/macros/transformation_test.rb +177 -0
data/test/indexer/map_record_test.rb +2 -3
data/test/indexer/nokogiri_indexer_test.rb +103 -0
data/test/indexer/process_record_test.rb +55 -0
data/test/indexer/process_with_test.rb +148 -0
data/test/indexer/read_write_test.rb +52 -2
data/test/indexer/settings_test.rb +34 -24
data/test/indexer/to_field_test.rb +27 -2
data/test/marc_extractor_test.rb +7 -7
data/test/marc_reader_test.rb +4 -4
data/test/nokogiri_reader_test.rb +158 -0
data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
data/test/solr_json_writer_test.rb +24 -28
data/test/test_helper.rb +8 -2
data/test/test_support/namespace-test.xml +7 -0
data/test/test_support/nokogiri_demo_config.rb +17 -0
data/test/test_support/oai-pmh-one-record-2.xml +24 -0
data/test/test_support/oai-pmh-one-record-first.xml +24 -0
data/test/test_support/sample-oai-no-namespace.xml +197 -0
data/test/test_support/sample-oai-pmh.xml +197 -0
data/test/thread_pool_test.rb +38 -0
data/test/translation_map_test.rb +3 -3
data/test/translation_maps/ruby_map.rb +2 -1
data/test/translation_maps/yaml_map.yaml +2 -1
data/traject.gemspec +4 -11
metadata +92 -6

data/test/indexer/macros/transformation_test.rb ADDED

@@ -0,0 +1,177 @@
+# Encoding: UTF-8
+require 'test_helper'
+require 'traject/indexer'
+# should be built into every indexer
+describe "Traject::Macros::Transformation" do
+  # Each example gets a fresh default indexer. The source record is nil:
+  # every example below seeds its field via literal(...) or default(...),
+  # so no example depends on the content of a real record.
+  before do
+    @indexer = Traject::Indexer.new
+    @record = nil
+  end
+  # translation_map: looks accumulated values up in one or more named maps
+  # (and optionally an inline hash).
+  describe "translation_map" do
+    it "translates" do
+      @indexer.configure do
+        to_field "cataloging_agency", literal("DLC"), translation_map("marc_040a_translate_test")
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["Library of Congress"], output["cataloging_agency"]
+    end
+    # The expected value comes from the later-listed map, i.e. when the same
+    # key is present in several maps the last one given is expected to win.
+    it "can merge multiple" do
+      @indexer.configure do
+        to_field "result", literal("key_to_be_overridden"), translation_map("ruby_map", "yaml_map")
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["value_from_yaml"], output["result"]
+    end
+    # Same precedence expectation, with an inline hash as the final argument
+    # overriding both named maps.
+    it "can merge multiple with hash" do
+      @indexer.configure do
+        to_field "result", literal("key_to_be_overridden"), translation_map("ruby_map", "yaml_map", {"key_to_be_overridden" => "value_from_inline_hash"})
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["value_from_inline_hash"], output["result"]
+    end
+  end
+  # transform: applies a callable (proc argument and/or block) to each value.
+  describe "transform" do
+    it "transforms with block" do
+      @indexer.configure do
+        to_field "sample_field", literal("one"), literal("two"), transform(&:upcase)
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["ONE", "TWO"], output["sample_field"]
+    end
+    it "transforms with proc arg" do
+      @indexer.configure do
+        to_field "sample_field", literal("one"), literal("two"), transform(->(val) { val.tr('aeiou', '!') })
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["!n!", "tw!"], output["sample_field"]
+    end
+    # The expected "!N!" is only reachable if the proc argument runs before
+    # the block: upcasing first would leave no lowercase vowels for tr to
+    # replace.
+    it "transforms with both, in correct order" do
+      @indexer.configure do
+        to_field "sample_field", literal("one"), literal("two"), transform(->(val) { val.tr('aeiou', '!') }, &:upcase)
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["!N!", "TW!"], output["sample_field"]
+    end
+  end
+  # default: supplies a value only when nothing has been accumulated.
+  describe "default" do
+    it "adds default to empty accumulator" do
+      @indexer.configure do
+        to_field "test", default("default")
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["default"], output["test"]
+    end
+    # The default string here must never appear in the output, so its exact
+    # spelling does not affect the assertion.
+    it "does not add default if value present" do
+      @indexer.configure do
+        to_field "test", literal("value"), default("defaut")
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["value"], output["test"]
+    end
+  end
+  # first_only: keeps at most the first accumulated value.
+  describe "first_only" do
+    it "takes only first in multi-value" do
+      @indexer.configure do
+        to_field "test", literal("one"), literal("two"), literal("three"), first_only
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["one"], output["test"]
+    end
+    # With nothing accumulated the field is expected to be absent from the
+    # output hash (nil), not an empty array.
+    it "no-ops on nil" do
+      @indexer.configure do
+        to_field "test", first_only
+      end
+      output = @indexer.map_record(@record)
+      assert_nil output["test"]
+    end
+    it "no-ops on single value" do
+      @indexer.configure do
+        to_field "test", literal("one"), first_only
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["one"], output["test"]
+    end
+  end
+  # unique: removes duplicate values; the expected output keeps
+  # first-occurrence order.
+  describe "unique" do
+    it "uniqs" do
+      @indexer.configure do
+        to_field "test", literal("one"), literal("two"), literal("one"), literal("three"), unique
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["one", "two", "three"], output["test"]
+    end
+  end
+  # strip: trims leading/trailing whitespace from each value.
+  describe "strip" do
+    it "strips" do
+      @indexer.configure do
+        to_field "test", literal("  one"), literal(" two  "), strip
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["one", "two"], output["test"]
+    end
+    # U+00A0 (no-break space), U+2002 (en space) and U+202F (narrow no-break
+    # space) are not removed by plain ASCII-only trimming; this example
+    # expects them to be stripped as well.
+    it "strips unicode whitespace" do
+      @indexer.configure do
+        to_field "test", literal(" \u00A0 \u2002 one \u202F "), strip
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["one"], output["test"]
+    end
+  end
+  # split: breaks each value on a separator, yielding multiple values.
+  describe "split" do
+    it "splits" do
+      @indexer.configure do
+        to_field "test", literal("one.two"), split(".")
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["one", "two"], output["test"]
+    end
+  end
+  # append: adds a suffix to every value.
+  describe "append" do
+    it "appends suffix" do
+      @indexer.configure do
+        to_field "test", literal("one"), literal("two"), append(".suffix")
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["one.suffix", "two.suffix"], output["test"]
+    end
+  end
+  # prepend: adds a prefix to every value.
+  describe "prepend" do
+    it "prepends prefix" do
+      @indexer.configure do
+        to_field "test", literal("one"), literal("two"), prepend("prefix.")
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["prefix.one", "prefix.two"], output["test"]
+    end
+  end
+  # gsub: pattern replacement on every value; each run of digits becomes a
+  # single space here.
+  describe "gsub" do
+    it "gsubs" do
+      @indexer.configure do
+        to_field "test", literal("one1212two23three"), gsub(/\d+/, ' ')
+      end
+      output = @indexer.map_record(@record)
+      assert_equal ["one two three"], output["test"]
+    end
+  end
+end

data/test/indexer/map_record_test.rb CHANGED

@@ -196,12 +196,11 @@ describe "Traject::Indexer#map_record" do
       end
       @indexer.to_field('afterSkip') do |rec, acc|
-        acc << "After. Should never happen"
+        raise ArgumentError, "intentional, should never happen"
       end
       output = @indexer.map_record(@record)
-      assert_equal ['Before'], output['beforeSkip']
-      assert_nil output['afterSkip']
+      assert_nil output
     end

data/test/indexer/nokogiri_indexer_test.rb ADDED

@@ -0,0 +1,103 @@
+require 'test_helper'
+describe "Traject::NokogiriIndexer" do
+  before do
+    Traject::Indexer.send(:default_settings=, Traject::Indexer.default_settings.merge("solr_writer.thread_pool" => 0, "processing_thread_pool" => 0))
+    @xml_sample_path = support_file_path("sample-oai-pmh.xml")
+    @indexer = Traject::Indexer::NokogiriIndexer.new("writer_class_name" => "Traject::ArrayWriter", "solr_writer.thread_pool" => 0, "processing_thread_pool" => 0)
+    @namespaces = {
+      "oai" => "http://www.openarchives.org/OAI/2.0/",
+      "dc" => "http://purl.org/dc/elements/1.1/",
+      "oai_dc" => "http://www.openarchives.org/OAI/2.0/oai_dc/",
+      "edm" => "http://www.europeana.eu/schemas/edm/"
+    }
+  end
+  it "smoke test" do
+    namespaces = @namespaces
+    @indexer.configure do
+      settings do
+        provide "nokogiri.namespaces", namespaces
+        provide "nokogiri.each_record_xpath", "//oai:record"
+      end
+      to_field "id", extract_xpath("//oai:metadata/oai_dc:dc/dc:identifier"), first_only
+      to_field "title", extract_xpath("//oai:metadata/oai_dc:dc/dc:title")
+    end
+    @indexer.process(File.open(@xml_sample_path))
+    results = @indexer.writer.values
+    source_doc = Nokogiri::XML.parse(File.open(@xml_sample_path))
+    assert_equal source_doc.xpath("//oai:record", @namespaces).count, results.count
+    assert(results.all? { |hash|
+      hash["id"] && hash["id"].length == 1 &&
+      hash["title"] && hash["title"].length >= 1
+    }, "expected results have expected values")
+  end
+  it "namespaces to extract_xpath" do
+    namespaces = @namespaces.merge(edm: "http://this.is.wrong")
+    @indexer.configure do
+      settings do
+        provide "nokogiri.namespaces", namespaces
+        provide "nokogiri.each_record_xpath", "//oai:record"
+      end
+      to_field "rights", extract_xpath("//oai:metadata/oai_dc:dc/edm:rights", ns: { edm: "http://www.europeana.eu/schemas/edm/" })
+    end
+    @indexer.process(File.open(@xml_sample_path))
+    results = @indexer.writer.values
+    refute_empty results.last["rights"]
+  end
+  describe "xpath to non-terminal element" do
+    before do
+      @xml = <<-EOS
+      <record>
+        <name>
+          <first>José</first>
+          <last>Lopez</last>
+        </name>
+        <name>
+          <first>Sue</first>
+          <last>Jones</last>
+        </name>
+      </record>
+      EOS
+      @indexer.configure do
+        settings do
+          provide "nokogiri.each_record_xpath", "//record"
+        end
+      end
+    end
+    it "outputs text" do
+      @indexer.configure { to_field "name", extract_xpath("/record/name") }
+      @indexer.process(StringIO.new(@xml))
+      results = @indexer.writer.values
+      assert_equal( {"name" => ["José Lopez", "Sue Jones"]}, results.first )
+    end
+    it "outputs Nokogiri::XML::Element with to_text: false" do
+      @indexer.configure { to_field "name", extract_xpath("/record/name", to_text: false) }
+      @indexer.process(StringIO.new(@xml))
+      results = @indexer.writer.values
+      values = results.first["name"]
+      assert(values.each { |result|
+        result["name"].kind_of?(Nokogiri::XML::Element) &&
+        result["name"].name == "name"
+      })
+    end
+  end
+end

data/test/indexer/process_record_test.rb ADDED

@@ -0,0 +1,55 @@
+require 'test_helper'
+# Specs for Indexer#process_record (aliased as #<<): map one source record and
+# hand the result to the indexer's writer.
+describe "Traject::Indexer#process_record" do
+  # The indexer has a single "record" field whose value is the source record
+  # itself, and writes into an ArrayWriter so the output can be inspected via
+  # @writer.values.
+  before do
+    @writer = Traject::ArrayWriter.new
+    @indexer = Traject::Indexer.new(writer: @writer) do
+      to_field "record", lambda { |rec, acc| acc << rec }
+    end
+    @record = {key: "value"}
+  end
+  it "sends to writer" do
+    @indexer.process_record(@record)
+    assert_equal [{"record" => [@record] }], @writer.values
+  end
+  # The return value is the Context for the processed record, which exposes
+  # the original source record.
+  it "returns context" do
+    context = @indexer.process_record(@record)
+    assert context.is_a?(Traject::Indexer::Context)
+    assert_equal @record, context.source_record
+  end
+  # A step calling context.skip! must keep the record away from the writer,
+  # even though a value was already accumulated.
+  it "skips if skipped" do
+    @indexer = Traject::Indexer.new(writer: @writer) do
+      to_field "record", lambda { |rec, acc, context| acc << rec; context.skip! }
+    end
+    context = @indexer.process_record(@record)
+    assert context.skip?
+    assert_equal [], @writer.values
+  end
+  # Errors raised inside a mapping step propagate to the caller.
+  it "raises exceptions out" do
+    @indexer = Traject::Indexer.new(writer: @writer) do
+      to_field "record", lambda { |rec, acc, context| acc << rec; raise ArgumentError, "intentional" }
+    end
+    assert_raises(ArgumentError) do
+      @indexer.process_record(@record)
+    end
+  end
+  # Checks both that the two Method objects compare equal and that calling
+  # << does not raise.
+  it "aliases <<" do
+    assert_equal @indexer.method(:process_record), @indexer.method(:<<)
+    @indexer << @record
+  end
+  # Once #complete has been called, further records are rejected.
+  it "raises on completed indexer" do
+    @indexer.complete
+    assert_raises Traject::Indexer::CompletedStateError do
+      @indexer.process_record(@record)
+    end
+  end
+end

data/test/indexer/process_with_test.rb ADDED

@@ -0,0 +1,148 @@
+require 'test_helper'
+describe "Traject::Indexer#process_with" do
+  let(:input_records) { [
+    { one: "one" },
+    { two: "two" },
+    { three: "three" }
+  ] }
+  let(:array_writer) { Traject::ArrayWriter.new }
+  let(:indexer) {
+    Traject::Indexer.new do
+      to_field "records", lambda { |rec, acc|
+        acc << rec
+      }
+    end
+  }
+  it "processes" do
+    writer = indexer.process_with(input_records, array_writer)
+    assert_equal([{"records"=>[{:one=>"one"}]}, {"records"=>[{:two=>"two"}]}, {"records"=>[{:three=>"three"}]}], writer.values)
+  end
+  describe "calls close" do
+    before do
+      array_writer.extend(Module.new do
+        def close
+          @close_called = true
+        end
+        def close_called?
+          @close_called
+        end
+      end)
+    end
+    it "calls by default" do
+      writer = indexer.process_with(input_records, array_writer)
+      assert writer.close_called?
+    end
+    it "does not call if told not to" do
+      writer = indexer.process_with(input_records, array_writer, close_writer: false)
+      assert ! writer.close_called?
+    end
+  end
+  describe "after_processing steps" do
+      let(:indexer) {
+        Traject::Indexer.new do
+          after_processing do
+            raise "Don't call me"
+          end
+        end
+      }
+    it "are not called" do
+      # should not raise
+      indexer.process_with(input_records, array_writer)
+    end
+  end
+  describe "with block as destination" do
+    it "calls block for each record" do
+      received = []
+      indexer.process_with(input_records) do |context|
+        received << context
+      end
+      assert_equal 3, received.length
+      assert received.all? { |o| o.kind_of?(Traject::Indexer::Context)}
+      assert_equal input_records.collect { |r| [r] }, received.collect { |c| c.output_hash["records"] }
+    end
+  end
+  describe "exceptions" do
+    let(:indexer) {
+      Traject::Indexer.new do
+        to_field "foo", lambda { |rec, acc|
+          if rec.keys.include?(:one)
+            raise ArgumentError, "intentional"
+          end
+          acc << rec
+        }
+      end
+    }
+    describe "by default" do
+      it "raises" do
+        assert_raises(ArgumentError) do
+          indexer.process_with(input_records, array_writer)
+        end
+      end
+    end
+    describe "with rescue_with" do
+      it "calls block and keeps processing" do
+        rescued = []
+        rescue_lambda = lambda do |context, exception|
+          rescued << {
+            context: context,
+            exception: exception
+          }
+        end
+        _writer = indexer.process_with(input_records, array_writer, rescue_with: rescue_lambda)
+        # not including the one that raised
+        assert_equal 2, array_writer.contexts.length
+        # and raise was called
+        assert_equal 1, rescued.length
+        assert rescued.first[:context].is_a?(Traject::Indexer::Context)
+        assert_equal ArgumentError, rescued.first[:exception].class
+        assert_equal "intentional", rescued.first[:exception].message
+      end
+      it "can raise from rescue" do
+        rescue_lambda = lambda do |context, exception|
+          raise exception
+        end
+        assert_raises(ArgumentError) do
+          indexer.process_with(input_records, array_writer, rescue: rescue_lambda)
+        end
+      end
+    end
+    describe "skipped records" do
+      let(:indexer) {
+        Traject::Indexer.new do
+          to_field "foo", literal("value")
+          each_record do |record, context|
+            context.skip!
+          end
+        end
+      }
+      it "calls on_skipped, does not send to writer" do
+        skip_calls = []
+        on_skipped = lambda { |*args| skip_calls << args }
+        writer = indexer.process_with(input_records, array_writer, on_skipped: on_skipped)
+        assert_equal writer.values, [], "nothing sent to writer"
+        assert_equal input_records.count, skip_calls.count, "skip proc called"
+        assert skip_calls.all? {|a| a.length == 1 && a[0].kind_of?(Traject::Indexer::Context) }, "skip proc called with single arg"
+      end
+    end
+  end
+end