RubyGems - traject - Versions diffs - 2.3.4 → 3.0.0.alpha.1 - Mend

traject 2.3.4 → 3.0.0.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69)

checksums.yaml +5 -5
data/.travis.yml +16 -9
data/CHANGES.md +74 -1
data/Gemfile +2 -1
data/README.md +104 -53
data/Rakefile +8 -1
data/doc/indexing_rules.md +79 -63
data/doc/programmatic_use.md +218 -0
data/doc/settings.md +28 -1
data/doc/xml.md +134 -0
data/lib/traject.rb +5 -0
data/lib/traject/array_writer.rb +34 -0
data/lib/traject/command_line.rb +18 -22
data/lib/traject/debug_writer.rb +2 -5
data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
data/lib/traject/indexer.rb +321 -92
data/lib/traject/indexer/context.rb +39 -13
data/lib/traject/indexer/marc_indexer.rb +30 -0
data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
data/lib/traject/indexer/settings.rb +36 -53
data/lib/traject/indexer/step.rb +27 -33
data/lib/traject/macros/marc21.rb +37 -12
data/lib/traject/macros/nokogiri_macros.rb +43 -0
data/lib/traject/macros/transformation.rb +162 -0
data/lib/traject/marc_extractor.rb +2 -0
data/lib/traject/ndj_reader.rb +1 -1
data/lib/traject/nokogiri_reader.rb +179 -0
data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
data/lib/traject/solr_json_writer.rb +19 -12
data/lib/traject/thread_pool.rb +13 -0
data/lib/traject/util.rb +14 -2
data/lib/traject/version.rb +1 -1
data/test/debug_writer_test.rb +3 -3
data/test/delimited_writer_test.rb +3 -3
data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
data/test/indexer/context_test.rb +23 -13
data/test/indexer/error_handler_test.rb +59 -0
data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
data/test/indexer/macros/to_field_test.rb +2 -2
data/test/indexer/macros/transformation_test.rb +177 -0
data/test/indexer/map_record_test.rb +2 -3
data/test/indexer/nokogiri_indexer_test.rb +103 -0
data/test/indexer/process_record_test.rb +55 -0
data/test/indexer/process_with_test.rb +148 -0
data/test/indexer/read_write_test.rb +52 -2
data/test/indexer/settings_test.rb +34 -24
data/test/indexer/to_field_test.rb +27 -2
data/test/marc_extractor_test.rb +7 -7
data/test/marc_reader_test.rb +4 -4
data/test/nokogiri_reader_test.rb +158 -0
data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
data/test/solr_json_writer_test.rb +24 -28
data/test/test_helper.rb +8 -2
data/test/test_support/namespace-test.xml +7 -0
data/test/test_support/nokogiri_demo_config.rb +17 -0
data/test/test_support/oai-pmh-one-record-2.xml +24 -0
data/test/test_support/oai-pmh-one-record-first.xml +24 -0
data/test/test_support/sample-oai-no-namespace.xml +197 -0
data/test/test_support/sample-oai-pmh.xml +197 -0
data/test/thread_pool_test.rb +38 -0
data/test/translation_map_test.rb +3 -3
data/test/translation_maps/ruby_map.rb +2 -1
data/test/translation_maps/yaml_map.yaml +2 -1
data/traject.gemspec +4 -11
metadata +92 -6

data/lib/traject/solr_json_writer.rb CHANGED

@@ -47,6 +47,8 @@ require 'concurrent' # for atomic_fixnum
 class Traject::SolrJsonWriter
   include Traject::QualifiedConstGet
+  URI_REGEXP = URI::Parser.new.make_regexp.freeze
   DEFAULT_MAX_SKIPPED = 0
   DEFAULT_BATCH_SIZE  = 100
@@ -105,6 +107,18 @@ class Traject::SolrJsonWriter
     end
   end
+  # Not part of standard writer API.
+  #
+  # If we are batching adds, and have some not-yet-written ones queued up --
+  # flush em all to solr.
+  #
+  # This should be thread-safe to call, but the write does take place in
+  # the caller's thread, no threading is done for you here, regardless of setting
+  # of solr_writer.thread_pool
+  def flush
+    send_batch( Traject::Util.drain_queue(@batched_queue) )
+  end
   # Send the given batch of contexts. If something goes wrong, send
   # them one at a time.
   # @param [Array<Traject::Indexer::Context>] an array of contexts
@@ -147,7 +161,7 @@ class Traject::SolrJsonWriter
       else
         msg = "Solr error response: #{resp.status}: #{resp.body}"
       end
-      logger.error "Could not add record #{c.source_record_id} at source file position #{c.position}: #{msg}"
+      logger.error "Could not add record #{c.record_inspect}: #{msg}"
       logger.debug(c.source_record.to_s)
       @skipped_record_incrementer.increment
@@ -236,7 +250,7 @@ class Traject::SolrJsonWriter
   # If we've got a solr.update_url, make sure it's ok
   def check_solr_update_url(url)
-    unless /^#{URI::regexp}$/.match(url)
+    unless /^#{URI_REGEXP}$/.match(url)
       raise ArgumentError.new("#{self.class.name} setting `solr.update_url` doesn't look like a URL: `#{url}`")
     end
     url
@@ -249,18 +263,11 @@ class Traject::SolrJsonWriter
     end
     # Not a URL? Bail
-    unless  /^#{URI::regexp}$/.match(url)
+    unless  /^#{URI_REGEXP}$/.match(url)
       raise ArgumentError.new("#{self.class.name} setting `solr.url` doesn't look like a URL: `#{url}`")
     end
-    # First, try the /update/json handler
-    candidate = [url.chomp('/'), 'update', 'json'].join('/')
-    resp      = @http_client.get(candidate)
-    if resp.status == 404
-      candidate = [url.chomp('/'), 'update'].join('/')
-    end
-    candidate
+    # Assume the /update/json handler
+    return [url.chomp('/'), 'update', 'json'].join('/')
   end
 end

data/lib/traject/thread_pool.rb CHANGED

@@ -50,11 +50,24 @@ module Traject
   class ThreadPool
     attr_reader :pool_size, :queue_capacity
+    @@disable_concurrency = false
+    # Calling Traject::ThreadPool.disable_concurrency! permanently and irrevocably (for program execution)
+    # forces all ThreadPools to have a pool_size of 0 -- running all work inline -- so should disable all
+    # use of threads in Traject.
+    def self.disable_concurrency! ;  @@disable_concurrency = true ; end
+    def self.concurrency_disabled? ; @@disable_concurrency ; end
     # First arg is pool size, 0 or nil and we'll be a null/no-op pool which executes
     # work in caller thread.
     def initialize(pool_size)
       @thread_pool             = nil # assume we don't have one
       @exceptions_caught_queue = [] # start off without exceptions
+      if self.class.concurrency_disabled?
+        pool_size = 0
+      end
       unless pool_size.nil? || pool_size == 0
         @pool_size      = pool_size.to_i
         @queue_capacity = pool_size * 3

data/lib/traject/util.rb CHANGED

@@ -60,7 +60,6 @@ module Traject
           if line.start_with?(file_path)
             if m = /\A.*\:(\d+)\:in/.match(line)
               return m[1].to_i
-              break
             end
           end
         end
@@ -116,11 +115,24 @@ module Traject
           result << queue.deq(:raise_if_empty)
         end
       rescue ThreadError
-        # Need do nothing, queue was concurrently popped, no biggie
+        # Need do nothing, queue was concurrently popped, no biggie, but let's
+        # stop iterating and return what we've got.
+        return result
       end
       return result
     end
+    def self.is_jruby?
+      unless defined?(@is_jruby)
+        @is_jruby = defined?(JRUBY_VERSION)
+      end
+      @is_jruby
+    end
+    # How can we refer to an io object input in logs? For now, if it's a file-like
+    # object, we can use #path.
+    def self.io_name(io_like_object)
+      io_like_object.path if io_like_object.respond_to?(:path)
+    end
   end
 end

data/lib/traject/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Traject
-  VERSION = "2.3.4"
+  VERSION = "3.0.0.alpha.1"
 end

data/test/debug_writer_test.rb CHANGED

@@ -9,7 +9,7 @@ describe 'Simple output' do
   before do
     @record = MARC::Reader.new(support_file_path  "manufacturing_consent.marc").to_a.first
     @indexer = Traject::Indexer.new
-    @indexer.instance_eval do
+    @indexer.configure do
       to_field "id", extract_marc("001", :first => true)
       to_field "title", extract_marc("245ab")
     end
@@ -46,7 +46,7 @@ describe 'Simple output' do
         "record_num_1 title #{@title}",
     ]
     assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
-    assert_match /At least one record \(\#1\) doesn't define field 'id'/, logger_strio.string
+    assert_match(/At least one record \(<record #1>\) doesn't define field 'id'/, logger_strio.string)
     @writer.close
   end
@@ -68,7 +68,7 @@ describe 'Simple output' do
         "record_num_1 title #{@title}",
     ]
     assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
-    assert_match /At least one record \(\#1\) doesn't define field 'iden'/, logger_strio.string
+    assert_match(/At least one record \(<record #1, output_id:2710183>\) doesn't define field 'iden'/, logger_strio.string)
     writer.close
   end

data/test/delimited_writer_test.rb CHANGED

@@ -39,13 +39,13 @@ describe "Delimited/CSV Writers" do
     end
     it "outputs a header if asked to" do
-      dw = Traject::DelimitedWriter.new(@settings)
+      Traject::DelimitedWriter.new(@settings)
       @out.string.chomp.must_equal %w[four one two].join("\t")
     end
     it "doesn't output a header if asked not to" do
       @settings['delimited_writer.header'] = 'false'
-      dw                                   = Traject::DelimitedWriter.new(@settings)
+      Traject::DelimitedWriter.new(@settings)
       @out.string.must_be_empty
     end
@@ -69,7 +69,7 @@ describe "Delimited/CSV Writers" do
     end
     it "writes the header" do
-      cw = Traject::CSVWriter.new(@settings)
+      Traject::CSVWriter.new(@settings)
       @out.string.chomp.must_equal 'four,one,two'
     end

data/test/experimental_nokogiri_streaming_reader_test.rb ADDED

@@ -0,0 +1,169 @@
+require 'test_helper'
+require 'traject/experimental_nokogiri_streaming_reader'
+# Streaming nokogiri reader is experimental, half-finished, and not supported for real use.
+describe "Traject::ExperimentalNokogiriStreamingReader" do
+  describe "with namespaces" do
+    before do
+      @namespaces = { "oai" => "http://www.openarchives.org/OAI/2.0/" }
+      @xml_sample_path = support_file_path("sample-oai-pmh.xml")
+    end
+    describe "invalid settings" do
+      it "default_namespaces not a hash raises" do
+        error = assert_raises(ArgumentError) {
+          @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {
+            "nokogiri.namespaces" => "i am not a hash",
+          })
+        }
+        assert(error.message =~ /nokogiri.namespaces must be a hash/)
+      end
+      it "each_record_xpath with unregistered prefix raises" do
+        error = assert_raises(ArgumentError) {
+          @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {
+            "nokogiri.namespaces" => @namespaces,
+            "nokogiri.each_record_xpath" => "//foo:bar"
+          })
+        }
+        assert(error.message =~ %r{Can't find namespace prefix 'foo' in '//foo:bar'})
+      end
+      it "raises on some unsupported xpath" do
+        error = assert_raises(ArgumentError) {
+          @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {
+            "nokogiri.namespaces" => @namespaces,
+            "nokogiri.each_record_xpath" => "//oai:record[@id='foo']"
+          })
+        }
+        assert(error.message =~ /Only very simple xpaths supported\./)
+      end
+    end
+    describe "fixed path" do
+      before do
+        @each_record_xpath = "/oai:OAI-PMH/oai:ListRecords/oai:record"
+      end
+      it "reads" do
+        shared_tests
+      end
+    end
+    describe "floating path" do
+      before do
+        @each_record_xpath = "//oai:record"
+      end
+      it "reads" do
+        shared_tests
+      end
+    end
+    describe "extra_xpath_hooks" do
+      it "catches oai-pmh resumption token" do
+        @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {
+          "nokogiri.namespaces" => @namespaces,
+          "nokogiri.each_record_xpath" => "//oai:record",
+          "nokogiri_reader.extra_xpath_hooks" => {
+            "//oai:resumptionToken" => lambda do |node, clipboard|
+              clipboard[:resumptionToken] = node.text
+            end
+          }
+        })
+        _records = @reader.to_a
+        assert_equal "oai_dc.f(2018-05-03T18:09:08Z).u(2018-06-15T19:25:21Z).t(6387):100", @reader.clipboard[:resumptionToken]
+      end
+    end
+    describe "outer namespaces" do
+      it "are preserved" do
+        @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(support_file_path("namespace-test.xml")), {
+          "nokogiri.namespaces" => { mytop: "http://example.org/top" },
+          "nokogiri.each_record_xpath" => "//mytop:record"
+        })
+        yielded_records = []
+        @reader.each { |record|
+          yielded_records << record
+        }
+        assert yielded_records.length > 0
+        expected_namespaces = {"xmlns"=>"http://example.org/top", "xmlns:a"=>"http://example.org/a", "xmlns:b"=>"http://example.org/b"}
+        yielded_records.each do |rec|
+          assert_equal expected_namespaces, rec.namespaces
+        end
+      end
+    end
+  end
+  describe "without namespaces" do
+    before do
+      @namespaces = {}
+      @xml_sample_path = support_file_path("sample-oai-no-namespace.xml")
+    end
+    describe "fixed path" do
+      before do
+        @each_record_xpath = "/OAI-PMH/ListRecords/record"
+      end
+      it "reads" do
+        shared_tests
+      end
+    end
+    describe "floating path" do
+      before do
+        @each_record_xpath = "//record"
+      end
+      it "reads" do
+        shared_tests
+      end
+    end
+  end
+  def shared_tests
+    @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {
+      "nokogiri.namespaces" => @namespaces,
+      "nokogiri.each_record_xpath" => @each_record_xpath
+    })
+    yielded_records = []
+    @reader.each { |record|
+      yielded_records << record
+    }
+    manually_extracted = Nokogiri::XML.parse(File.open(@xml_sample_path)).xpath(@each_record_xpath, @namespaces)
+    manually_extracted.collect do |node|
+      # nokogiri makes it so hard to reliably get an Element to serialize to XML with all
+      # its inherited namespace declarations. :(  We're only doing this for testing purposes
+      # anyway.  This may not handle everything, but handles what we need in the test right now
+      if node.namespace
+        node["xmlns"] = node.namespace.href
+      end
+    end
+    assert_length manually_extracted.size, yielded_records
+    assert yielded_records.all? {|r| r.kind_of? Nokogiri::XML::Document }
+    assert_equal manually_extracted.collect(&:to_xml), yielded_records.collect(&:root).collect(&:to_xml)
+  end
+  describe "without each_record_xpath" do
+    before do
+      @xml_sample_path = support_file_path("namespace-test.xml")
+    end
+    it "yields whole file as one record" do
+      @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {})
+      yielded_records = @reader.to_a
+      assert_length 1, yielded_records
+      assert_equal Nokogiri::XML.parse(File.open(@xml_sample_path)).to_xml, yielded_records.first.to_xml
+    end
+  end
+end

data/test/indexer/context_test.rb CHANGED

@@ -5,7 +5,7 @@ describe "Traject::Indexer::Context" do
   describe "source_record_id" do
     before do
       @record = MARC::Reader.new(support_file_path('test_data.utf8.mrc')).first
-      @context = Traject::Indexer::Context.new
+      @context = Traject::Indexer::Context.new(source_record_id_proc: Traject::Indexer::MarcIndexer.new.source_record_id_proc)
       @record_001 = "   00282214 " # from the mrc file
     end
@@ -13,23 +13,33 @@ describe "Traject::Indexer::Context" do
       @context.source_record = @record
       assert_equal @record_001, @context.source_record_id
     end
+  end
-    it "gets it from the id" do
-      @context.output_hash['id'] = 'the_record_id'
-      assert_equal 'the_record_id', @context.source_record_id
-    end
+  describe "#record_inspect" do
+    before do
+      @record = MARC::Reader.new(support_file_path('test_data.utf8.mrc')).first
+      @source_record_id_proc = Traject::Indexer::MarcIndexer.new.source_record_id_proc
+      @record_001 = "   00282214 " # from the mrc file
-    it "gets from the id with non-MARC source" do
-      @context.source_record = Object.new
-      @context.output_hash['id'] = 'the_record_id'
-      assert_equal 'the_record_id', @context.source_record_id
+      @position = 10
+      @input_name = "some_file.mrc"
+      @position_in_input = 10
     end
-    it "gets it from both 001 and id" do
-      @context.output_hash['id'] = 'the_record_id'
-      @context.source_record = @record
-      assert_equal [@record_001, 'the_record_id'].join('/'), @context.source_record_id
+    it "can print complete inspect label" do
+      @context = Traject::Indexer::Context.new(
+        source_record:  @record,
+        source_record_id_proc: @source_record_id_proc,
+        position: @position,
+        input_name: @input_name,
+        position_in_input: @position_in_input
+      )
+      @context.output_hash["id"] = "output_id"
+      assert_equal "<record ##{@position} (#{@input_name} ##{@position_in_input}), source_id:#{@record_001} output_id:output_id>", @context.record_inspect
     end
   end
 end

data/test/indexer/error_handler_test.rb ADDED

@@ -0,0 +1,59 @@
+require 'test_helper'
+describe 'Custom mapping error handler' do
+  # the exception thrown by the custom handler
+  class CustomFakeException < StandardError; end
+  let(:indexer) { Traject::Indexer.new }
+  it 'invokes the default handler when custom handler is not set' do
+    output = StringIO.new
+    logger =Logger.new(output)
+    indexer.logger = logger
+    indexer.configure do
+      to_field 'id' do |_, _, _|
+        raise CustomFakeException, "I just like raising errors"
+      end
+    end
+    e = assert_raises(CustomFakeException) do
+      indexer.map_record({})
+    end
+    assert_equal "I just like raising errors", e.message
+    assert output.string =~ /while executing \(to_field \"id\" at .*error_handler_test.rb:\d+\)/
+    assert output.string =~ /CustomFakeException: I just like raising errors/
+  end
+  it 'invokes the custom handler when set' do
+    indexer.configure do
+      settings do
+        provide 'mapping_rescue', -> (ctx, e) {
+          raise CustomFakeException, "custom handler called #{ctx.record_inspect}: #{ctx.index_step.inspect}, #{e.inspect}"
+        }
+      end
+      to_field 'id' do |_context , _exception|
+        raise 'this was always going to fail'
+      end
+    end
+    e = assert_raises(CustomFakeException) { indexer.map_record({}) }
+    assert e.message =~ /\(to_field \"id\" at .*error_handler_test.rb:\d+\)/
+  end
+  it "custom handler can skip and continue" do
+    indexer.configure do
+      settings do
+        provide "mapping_rescue", -> (context, exception) {
+          context.skip!
+        }
+      end
+      to_field 'id' do |_context , _exception|
+        raise 'this was always going to fail'
+      end
+    end
+    assert_nil indexer.map_record({})
+  end
+end