traject 2.3.4 → 3.0.0.alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +16 -9
- data/CHANGES.md +74 -1
- data/Gemfile +2 -1
- data/README.md +104 -53
- data/Rakefile +8 -1
- data/doc/indexing_rules.md +79 -63
- data/doc/programmatic_use.md +218 -0
- data/doc/settings.md +28 -1
- data/doc/xml.md +134 -0
- data/lib/traject.rb +5 -0
- data/lib/traject/array_writer.rb +34 -0
- data/lib/traject/command_line.rb +18 -22
- data/lib/traject/debug_writer.rb +2 -5
- data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
- data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
- data/lib/traject/indexer.rb +321 -92
- data/lib/traject/indexer/context.rb +39 -13
- data/lib/traject/indexer/marc_indexer.rb +30 -0
- data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
- data/lib/traject/indexer/settings.rb +36 -53
- data/lib/traject/indexer/step.rb +27 -33
- data/lib/traject/macros/marc21.rb +37 -12
- data/lib/traject/macros/nokogiri_macros.rb +43 -0
- data/lib/traject/macros/transformation.rb +162 -0
- data/lib/traject/marc_extractor.rb +2 -0
- data/lib/traject/ndj_reader.rb +1 -1
- data/lib/traject/nokogiri_reader.rb +179 -0
- data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
- data/lib/traject/solr_json_writer.rb +19 -12
- data/lib/traject/thread_pool.rb +13 -0
- data/lib/traject/util.rb +14 -2
- data/lib/traject/version.rb +1 -1
- data/test/debug_writer_test.rb +3 -3
- data/test/delimited_writer_test.rb +3 -3
- data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
- data/test/indexer/context_test.rb +23 -13
- data/test/indexer/error_handler_test.rb +59 -0
- data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
- data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
- data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
- data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
- data/test/indexer/macros/to_field_test.rb +2 -2
- data/test/indexer/macros/transformation_test.rb +177 -0
- data/test/indexer/map_record_test.rb +2 -3
- data/test/indexer/nokogiri_indexer_test.rb +103 -0
- data/test/indexer/process_record_test.rb +55 -0
- data/test/indexer/process_with_test.rb +148 -0
- data/test/indexer/read_write_test.rb +52 -2
- data/test/indexer/settings_test.rb +34 -24
- data/test/indexer/to_field_test.rb +27 -2
- data/test/marc_extractor_test.rb +7 -7
- data/test/marc_reader_test.rb +4 -4
- data/test/nokogiri_reader_test.rb +158 -0
- data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
- data/test/solr_json_writer_test.rb +24 -28
- data/test/test_helper.rb +8 -2
- data/test/test_support/namespace-test.xml +7 -0
- data/test/test_support/nokogiri_demo_config.rb +17 -0
- data/test/test_support/oai-pmh-one-record-2.xml +24 -0
- data/test/test_support/oai-pmh-one-record-first.xml +24 -0
- data/test/test_support/sample-oai-no-namespace.xml +197 -0
- data/test/test_support/sample-oai-pmh.xml +197 -0
- data/test/thread_pool_test.rb +38 -0
- data/test/translation_map_test.rb +3 -3
- data/test/translation_maps/ruby_map.rb +2 -1
- data/test/translation_maps/yaml_map.yaml +2 -1
- data/traject.gemspec +4 -11
- metadata +92 -6
| @@ -25,7 +25,7 @@ memory_writer_class = Class.new do | |
| 25 25 | 
             
            describe "Traject::Indexer#process" do
         | 
| 26 26 | 
             
              before do
         | 
| 27 27 | 
             
                # no threading for these tests
         | 
| 28 | 
            -
                @indexer = Traject::Indexer.new("processing_thread_pool" => nil)
         | 
| 28 | 
            +
                @indexer = Traject::Indexer::MarcIndexer.new("processing_thread_pool" => nil)
         | 
| 29 29 | 
             
                @indexer.writer_class = memory_writer_class
         | 
| 30 30 | 
             
                @file = File.open(support_file_path "test_data.utf8.mrc")
         | 
| 31 31 | 
             
              end
         | 
| @@ -68,7 +68,7 @@ describe "Traject::Indexer#process" do | |
| 68 68 |  | 
| 69 69 | 
             
              require 'traject/null_writer'
         | 
| 70 70 | 
             
              it "calls after_processing after processing" do
         | 
| 71 | 
            -
                @indexer = Traject::Indexer.new(
         | 
| 71 | 
            +
                @indexer = Traject::Indexer::MarcIndexer.new(
         | 
| 72 72 | 
             
                  "writer_class_name" => "Traject::NullWriter"
         | 
| 73 73 | 
             
                )
         | 
| 74 74 | 
             
                @file = File.open(support_file_path "test_data.utf8.mrc")
         | 
| @@ -87,6 +87,37 @@ describe "Traject::Indexer#process" do | |
| 87 87 | 
             
                assert_equal [:one, :two], called, "Both after_processing hooks called, in order"
         | 
| 88 88 | 
             
              end
         | 
| 89 89 |  | 
| 90 | 
            +
              it "calls after_processing from #run_after_processing_steps" do
         | 
| 91 | 
            +
                @indexer = Traject::Indexer.new(
         | 
| 92 | 
            +
                  "writer_class_name" => "Traject::NullWriter"
         | 
| 93 | 
            +
                )
         | 
| 94 | 
            +
                @file = File.open(support_file_path "test_data.utf8.mrc")
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                called = []
         | 
| 97 | 
            +
             | 
| 98 | 
            +
                @indexer.after_processing do
         | 
| 99 | 
            +
                  called << :one
         | 
| 100 | 
            +
                end
         | 
| 101 | 
            +
                @indexer.after_processing do
         | 
| 102 | 
            +
                  called << :two
         | 
| 103 | 
            +
                end
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                @indexer.run_after_processing_steps
         | 
| 106 | 
            +
                assert_equal [:one, :two], called, "Both after_processing hooks called, in order"
         | 
| 107 | 
            +
              end
         | 
| 108 | 
            +
             | 
| 109 | 
            +
              it "can't be run twice" do
         | 
| 110 | 
            +
                @file = File.open(support_file_path "test_data.utf8.mrc")
         | 
| 111 | 
            +
                @indexer = Traject::Indexer::MarcIndexer.new(
         | 
| 112 | 
            +
                  "writer_class_name" => "Traject::NullWriter"
         | 
| 113 | 
            +
                )
         | 
| 114 | 
            +
                @indexer.process(@file)
         | 
| 115 | 
            +
             | 
| 116 | 
            +
                assert_raises Traject::Indexer::CompletedStateError do
         | 
| 117 | 
            +
                  @indexer.process(@file)
         | 
| 118 | 
            +
                end
         | 
| 119 | 
            +
              end
         | 
| 120 | 
            +
             | 
| 90 121 | 
             
              describe "demo_config.rb" do
         | 
| 91 122 | 
             
                before do
         | 
| 92 123 | 
             
                  @indexer = Traject::Indexer.new(
         | 
| @@ -102,4 +133,23 @@ describe "Traject::Indexer#process" do | |
| 102 133 | 
             
                end
         | 
| 103 134 | 
             
              end
         | 
| 104 135 |  | 
| 136 | 
            +
              describe "multi stream" do
         | 
| 137 | 
            +
                before do
         | 
| 138 | 
            +
                  @file2 = File.open(support_file_path "george_eliot.marc")
         | 
| 139 | 
            +
                  @file1 = File.open(support_file_path "musical_cage.marc")
         | 
| 140 | 
            +
                  @indexer = Traject::Indexer::MarcIndexer.new do
         | 
| 141 | 
            +
                    self.writer_class = memory_writer_class
         | 
| 142 | 
            +
                    to_field "title", extract_marc("245")
         | 
| 143 | 
            +
                  end
         | 
| 144 | 
            +
                end
         | 
| 145 | 
            +
             | 
| 146 | 
            +
                it "parses and loads" do
         | 
| 147 | 
            +
                  @indexer.process([@file1, @file2])
         | 
| 148 | 
            +
                  # kinda ridic, yeah.
         | 
| 149 | 
            +
                  output_hashes = memory_writer_class.class_variable_get("@@last_writer_settings")["memory_writer.added"].collect(&:output_hash)
         | 
| 150 | 
            +
             | 
| 151 | 
            +
                  assert_length 2, output_hashes
         | 
| 152 | 
            +
                  assert output_hashes.all? { |hash| hash["title"].length > 0 }
         | 
| 153 | 
            +
                end
         | 
| 154 | 
            +
              end
         | 
| 105 155 | 
             
            end
         | 
| @@ -5,10 +5,10 @@ describe "Traject::Indexer#settings" do | |
| 5 5 | 
             
                @indexer = Traject::Indexer.new
         | 
| 6 6 | 
             
              end
         | 
| 7 7 |  | 
| 8 | 
            -
              it "starts out a Hash, that  | 
| 8 | 
            +
              it "starts out a Hash, that uses it's defaults" do
         | 
| 9 9 | 
             
                assert_kind_of Hash, @indexer.settings
         | 
| 10 10 |  | 
| 11 | 
            -
                Traject::Indexer | 
| 11 | 
            +
                Traject::Indexer.default_settings.each_pair do |key, value|
         | 
| 12 12 | 
             
                  assert_equal value, @indexer.settings[key]
         | 
| 13 13 | 
             
                end
         | 
| 14 14 | 
             
              end
         | 
| @@ -16,13 +16,15 @@ describe "Traject::Indexer#settings" do | |
| 16 16 | 
             
              it "can fill_in_defaults!" do
         | 
| 17 17 | 
             
                @indexer.settings.fill_in_defaults!
         | 
| 18 18 |  | 
| 19 | 
            -
                assert_equal Traject::Indexer | 
| 19 | 
            +
                assert_equal Traject::Indexer.default_settings, @indexer.settings
         | 
| 20 20 | 
             
              end
         | 
| 21 21 |  | 
| 22 22 | 
             
              it "doesn't overwrite with fill_in_defaults!" do
         | 
| 23 | 
            -
                key = Traject::Indexer | 
| 23 | 
            +
                key = Traject::Indexer.default_settings.keys.first
         | 
| 24 24 | 
             
                @indexer.settings[ key  ] = "MINE KEEP IT"
         | 
| 25 25 |  | 
| 26 | 
            +
                assert_equal "MINE KEEP IT", @indexer.settings[key]
         | 
| 27 | 
            +
             | 
| 26 28 | 
             
                @indexer.settings.fill_in_defaults!
         | 
| 27 29 |  | 
| 28 30 | 
             
                assert_equal "MINE KEEP IT", @indexer.settings[key]
         | 
| @@ -36,7 +38,7 @@ describe "Traject::Indexer#settings" do | |
| 36 38 | 
             
              end
         | 
| 37 39 |  | 
| 38 40 | 
             
              it "has settings DSL to set" do
         | 
| 39 | 
            -
                @indexer. | 
| 41 | 
            +
                @indexer.configure do
         | 
| 40 42 | 
             
                  settings do
         | 
| 41 43 | 
             
                    store "foo", "foo"
         | 
| 42 44 | 
             
                  end
         | 
| @@ -124,28 +126,36 @@ describe "Traject::Indexer#settings" do | |
| 124 126 | 
             
                  assert_equal( {"a" => "a", "password" => "[hidden]", "some_password" => "[hidden]", "some.password" => "[hidden]"}, parsed)
         | 
| 125 127 | 
             
                end
         | 
| 126 128 | 
             
              end
         | 
| 127 | 
            -
             | 
| 128 | 
            -
              describe " | 
| 129 | 
            -
                 | 
| 130 | 
            -
                   | 
| 131 | 
            -
             | 
| 132 | 
            -
             | 
| 133 | 
            -
             | 
| 134 | 
            -
             | 
| 135 | 
            -
                    assert_equal "Traject::Marc4JReader", @indexer.settings['reader_class_name']
         | 
| 136 | 
            -
                  else
         | 
| 137 | 
            -
                    assert_equal "Traject::MarcReader", @indexer.settings['reader_class_name']
         | 
| 129 | 
            +
             | 
| 130 | 
            +
              describe "order of precedence" do
         | 
| 131 | 
            +
                it "args beat 'provides'" do
         | 
| 132 | 
            +
                  # args come from command-line in typical use
         | 
| 133 | 
            +
             | 
| 134 | 
            +
                  @indexer = Traject::Indexer.new(sample: "from args")
         | 
| 135 | 
            +
                  @indexer.settings do
         | 
| 136 | 
            +
                    provide :sample, "from config"
         | 
| 138 137 | 
             
                  end
         | 
| 138 | 
            +
                  @indexer.settings.fill_in_defaults!
         | 
| 139 | 
            +
             | 
| 140 | 
            +
                  assert_equal "from args", @indexer.settings["sample"]
         | 
| 139 141 | 
             
                end
         | 
| 140 | 
            -
             | 
| 141 | 
            -
                 | 
| 142 | 
            -
             | 
| 143 | 
            -
             | 
| 144 | 
            -
                   | 
| 145 | 
            -
             | 
| 146 | 
            -
                   | 
| 147 | 
            -
             | 
| 142 | 
            +
             | 
| 143 | 
            +
                it "args beat defaults" do
         | 
| 144 | 
            +
                  key = Traject::Indexer.default_settings.keys.first
         | 
| 145 | 
            +
                  @indexer = Traject::Indexer.new(key.to_sym => "from args")
         | 
| 146 | 
            +
                  @indexer.settings.fill_in_defaults!
         | 
| 147 | 
            +
             | 
| 148 | 
            +
                  assert_equal "from args", @indexer.settings[key]
         | 
| 149 | 
            +
                end
         | 
| 150 | 
            +
             | 
| 151 | 
            +
                it "provide beats defaults" do
         | 
| 152 | 
            +
                  key = Traject::Indexer.default_settings.keys.first
         | 
| 153 | 
            +
                  @indexer.settings do
         | 
| 154 | 
            +
                    provide key, "from config"
         | 
| 148 155 | 
             
                  end
         | 
| 156 | 
            +
                  @indexer.settings.fill_in_defaults!
         | 
| 157 | 
            +
             | 
| 158 | 
            +
                  assert_equal "from config", @indexer.settings[key]
         | 
| 149 159 | 
             
                end
         | 
| 150 160 | 
             
              end
         | 
| 151 161 |  | 
| @@ -69,9 +69,34 @@ describe "Traject::Indexer.to_field" do | |
| 69 69 | 
             
                assert_equal ['hello'], output['foo']
         | 
| 70 70 | 
             
              end
         | 
| 71 71 |  | 
| 72 | 
            +
              describe "supports multiple procs" do
         | 
| 73 | 
            +
                it "with no block" do
         | 
| 74 | 
            +
                  @indexer.to_field "foo",
         | 
| 75 | 
            +
                    lambda {|record, acc| acc << "one"},
         | 
| 76 | 
            +
                    lambda {|record, acc| acc << "two"},
         | 
| 77 | 
            +
                    lambda {|record, acc| acc << "three"}
         | 
| 72 78 |  | 
| 73 | 
            -
             | 
| 74 | 
            -
             | 
| 79 | 
            +
                  output = @indexer.map_record('never looked at')
         | 
| 80 | 
            +
                  assert_equal ['one', 'two', 'three'], output['foo']
         | 
| 81 | 
            +
                end
         | 
| 75 82 |  | 
| 83 | 
            +
                it "with a block too" do
         | 
| 84 | 
            +
                  @indexer.to_field "foo",
         | 
| 85 | 
            +
                    lambda {|record, acc| acc << "one"},
         | 
| 86 | 
            +
                    lambda {|record, acc| acc << "two"} do |record, acc|
         | 
| 87 | 
            +
                      acc << "three"
         | 
| 88 | 
            +
                  end
         | 
| 76 89 |  | 
| 90 | 
            +
                  output = @indexer.map_record('never looked at')
         | 
| 91 | 
            +
                  assert_equal ['one', 'two', 'three'], output['foo']
         | 
| 92 | 
            +
                end
         | 
| 93 | 
            +
              end
         | 
| 77 94 |  | 
| 95 | 
            +
              describe "with an array argument" do
         | 
| 96 | 
            +
                it "indexes to multiple fields" do
         | 
| 97 | 
            +
                  @indexer.to_field ["field1", "field2", "field3"], lambda {|rec, acc| acc << "value" }
         | 
| 98 | 
            +
                  output = @indexer.map_record('never looked at')
         | 
| 99 | 
            +
                  assert_equal({ "field1" => ["value"], "field2" => ["value"], "field3" => ["value"] }, output)
         | 
| 100 | 
            +
                end
         | 
| 101 | 
            +
              end
         | 
| 102 | 
            +
            end
         | 
    
        data/test/marc_extractor_test.rb
    CHANGED
    
    | @@ -28,7 +28,7 @@ describe "Traject::MarcExtractor" do | |
| 28 28 |  | 
| 29 29 | 
             
                  assert_kind_of Array, spec.subfields
         | 
| 30 30 | 
             
                end
         | 
| 31 | 
            -
             | 
| 31 | 
            +
             | 
| 32 32 | 
             
                it "parses specset from an array" do
         | 
| 33 33 | 
             
                  parsed  = Traject::MarcExtractor::SpecSet.new(%w[245abcde 810 700|*4|bcd])
         | 
| 34 34 | 
             
                  assert_equal parsed.tags, %w[245 810 700]
         | 
| @@ -60,17 +60,17 @@ describe "Traject::MarcExtractor" do | |
| 60 60 | 
             
                  assert_equal "4", spec700.indicator2
         | 
| 61 61 | 
             
                  assert_equal %w{b c d}, spec700.subfields
         | 
| 62 62 | 
             
                end
         | 
| 63 | 
            -
             | 
| 63 | 
            +
             | 
| 64 64 | 
             
                it "parses from an array" do
         | 
| 65 65 | 
             
                  parsed  = Traject::MarcExtractor::Spec.hash_from_string(%w[245abcde 810 700|*4|bcd])
         | 
| 66 | 
            -
                   | 
| 67 | 
            -
                   | 
| 68 | 
            -
                   | 
| 66 | 
            +
                  _spec245 = parsed['245'].first
         | 
| 67 | 
            +
                  _spec810 = parsed['810'].first
         | 
| 68 | 
            +
                  _spec700 = parsed['700'].first
         | 
| 69 69 |  | 
| 70 70 | 
             
                  assert_length 3, parsed
         | 
| 71 71 | 
             
                end
         | 
| 72 | 
            -
             | 
| 73 | 
            -
             | 
| 72 | 
            +
             | 
| 73 | 
            +
             | 
| 74 74 |  | 
| 75 75 | 
             
                it "parses fixed field byte offsets" do
         | 
| 76 76 | 
             
                  parsed = Traject::MarcExtractor::Spec.hash_from_string("005[5]:008[7-10]")
         | 
    
        data/test/marc_reader_test.rb
    CHANGED
    
    | @@ -50,13 +50,13 @@ describe "Traject::MarcReader" do | |
| 50 50 | 
             
                  a245a = array.first['245']['a']
         | 
| 51 51 |  | 
| 52 52 | 
             
                  assert a245a.encoding.name, "UTF-8"
         | 
| 53 | 
            -
                  assert a245a.valid_encoding? | 
| 53 | 
            +
                  assert a245a.valid_encoding?
         | 
| 54 54 | 
             
                  assert_equal "Por uma outra globalização :", a245a
         | 
| 55 55 | 
             
                end
         | 
| 56 56 |  | 
| 57 57 | 
             
                it "replaces unicode character reference in Marc8 transcode" do
         | 
| 58 58 | 
             
                  file = File.new(support_file_path("escaped_character_reference.marc8.marc"))
         | 
| 59 | 
            -
             | 
| 59 | 
            +
             | 
| 60 60 | 
             
                  settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC-8") # binary type is default
         | 
| 61 61 | 
             
                  record = Traject::MarcReader.new(file, settings).to_a.first
         | 
| 62 62 |  | 
| @@ -67,7 +67,7 @@ describe "Traject::MarcReader" do | |
| 67 67 | 
             
                  file = File.new(support_file_path "one-marc8.mrc")
         | 
| 68 68 | 
             
                  settings = Traject::Indexer::Settings.new("marc_source.encoding" => "ADFADFADF")
         | 
| 69 69 | 
             
                  assert_raises(ArgumentError) do
         | 
| 70 | 
            -
                     | 
| 70 | 
            +
                    _record = Traject::MarcReader.new(file, settings).to_a.first
         | 
| 71 71 | 
             
                  end
         | 
| 72 72 | 
             
                end
         | 
| 73 73 |  | 
| @@ -78,7 +78,7 @@ describe "Traject::MarcReader" do | |
| 78 78 | 
             
                  reader = Traject::MarcReader.new(file, settings)
         | 
| 79 79 |  | 
| 80 80 | 
             
                  record = reader.to_a.first
         | 
| 81 | 
            -
             | 
| 81 | 
            +
             | 
| 82 82 | 
             
                  value = record['300']['a']
         | 
| 83 83 |  | 
| 84 84 | 
             
                  assert_equal value.encoding.name, "UTF-8"
         | 
| @@ -0,0 +1,158 @@ | |
| 1 | 
            +
            require 'test_helper'
         | 
| 2 | 
            +
            require 'traject/nokogiri_reader'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            describe "Traject::NokogiriReader" do
         | 
| 5 | 
            +
              describe "with namespaces" do
         | 
| 6 | 
            +
                before do
         | 
| 7 | 
            +
                  @namespaces = { "oai" => "http://www.openarchives.org/OAI/2.0/" }
         | 
| 8 | 
            +
                  @xml_sample_path = support_file_path("sample-oai-pmh.xml")
         | 
| 9 | 
            +
                end
         | 
| 10 | 
            +
             | 
| 11 | 
            +
                describe "invalid settings" do
         | 
| 12 | 
            +
                  it "default_namespaces not a hash raises" do
         | 
| 13 | 
            +
                    error = assert_raises(ArgumentError) {
         | 
| 14 | 
            +
                      @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
         | 
| 15 | 
            +
                        "nokogiri.namespaces" => "i am not a hash",
         | 
| 16 | 
            +
                      })
         | 
| 17 | 
            +
                    }
         | 
| 18 | 
            +
                    assert(error.message =~ /nokogiri.namespaces must be a hash/)
         | 
| 19 | 
            +
                  end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                  it "each_record_xpath with unregistered prefix raises" do
         | 
| 22 | 
            +
                    error = assert_raises(ArgumentError) {
         | 
| 23 | 
            +
                      @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
         | 
| 24 | 
            +
                        "nokogiri.namespaces" => @namespaces,
         | 
| 25 | 
            +
                        "nokogiri.each_record_xpath" => "//foo:bar"
         | 
| 26 | 
            +
                      })
         | 
| 27 | 
            +
                    }
         | 
| 28 | 
            +
                    assert(error.message =~ %r{Can't find namespace prefix 'foo' in '//foo:bar'})
         | 
| 29 | 
            +
                  end
         | 
| 30 | 
            +
                end
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                describe "fixed path" do
         | 
| 33 | 
            +
                  before do
         | 
| 34 | 
            +
                    @each_record_xpath = "/oai:OAI-PMH/oai:ListRecords/oai:record"
         | 
| 35 | 
            +
                  end
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                  it "reads" do
         | 
| 38 | 
            +
                    shared_tests
         | 
| 39 | 
            +
                  end
         | 
| 40 | 
            +
                end
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                describe "floating path" do
         | 
| 43 | 
            +
                  before do
         | 
| 44 | 
            +
                    @each_record_xpath = "//oai:record"
         | 
| 45 | 
            +
                  end
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                  it "reads" do
         | 
| 48 | 
            +
                    shared_tests
         | 
| 49 | 
            +
                  end
         | 
| 50 | 
            +
                end
         | 
| 51 | 
            +
             | 
| 52 | 
            +
             | 
| 53 | 
            +
                describe "extra_xpath_hooks" do
         | 
| 54 | 
            +
                  it "catches oai-pmh resumption token" do
         | 
| 55 | 
            +
                    @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
         | 
| 56 | 
            +
                      "nokogiri.namespaces" => @namespaces,
         | 
| 57 | 
            +
                      "nokogiri.each_record_xpath" => "//oai:record",
         | 
| 58 | 
            +
                      "nokogiri_reader.extra_xpath_hooks" => {
         | 
| 59 | 
            +
                        "//oai:resumptionToken" => lambda do |node, clipboard|
         | 
| 60 | 
            +
                          clipboard[:resumptionToken] = node.text
         | 
| 61 | 
            +
                        end
         | 
| 62 | 
            +
                      }
         | 
| 63 | 
            +
                    })
         | 
| 64 | 
            +
                    _records = @reader.to_a
         | 
| 65 | 
            +
                    assert_equal "oai_dc.f(2018-05-03T18:09:08Z).u(2018-06-15T19:25:21Z).t(6387):100", @reader.clipboard[:resumptionToken]
         | 
| 66 | 
            +
                  end
         | 
| 67 | 
            +
                end
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                describe "outer namespaces" do
         | 
| 70 | 
            +
                  it "are preserved" do
         | 
| 71 | 
            +
                    @reader = Traject::NokogiriReader.new(File.open(support_file_path("namespace-test.xml")), {
         | 
| 72 | 
            +
                      "nokogiri.namespaces" => { mytop: "http://example.org/top" },
         | 
| 73 | 
            +
                      "nokogiri.each_record_xpath" => "//mytop:record"
         | 
| 74 | 
            +
                    })
         | 
| 75 | 
            +
                    yielded_records = []
         | 
| 76 | 
            +
                    @reader.each { |record|
         | 
| 77 | 
            +
                      yielded_records << record
         | 
| 78 | 
            +
                    }
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                    assert yielded_records.length > 0
         | 
| 81 | 
            +
             | 
| 82 | 
            +
                    expected_namespaces = {"xmlns"=>"http://example.org/top", "xmlns:a"=>"http://example.org/a", "xmlns:b"=>"http://example.org/b"}
         | 
| 83 | 
            +
                    yielded_records.each do |rec|
         | 
| 84 | 
            +
                      assert_equal expected_namespaces, rec.namespaces
         | 
| 85 | 
            +
                    end
         | 
| 86 | 
            +
                  end
         | 
| 87 | 
            +
                end
         | 
| 88 | 
            +
              end
         | 
| 89 | 
            +
             | 
| 90 | 
            +
              describe "without namespaces" do
         | 
| 91 | 
            +
                before do
         | 
| 92 | 
            +
                  @namespaces = {}
         | 
| 93 | 
            +
                  @xml_sample_path = support_file_path("sample-oai-no-namespace.xml")
         | 
| 94 | 
            +
                end
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                describe "fixed path" do
         | 
| 97 | 
            +
                  before do
         | 
| 98 | 
            +
                    @each_record_xpath = "/OAI-PMH/ListRecords/record"
         | 
| 99 | 
            +
                  end
         | 
| 100 | 
            +
             | 
| 101 | 
            +
                  it "reads" do
         | 
| 102 | 
            +
                    shared_tests
         | 
| 103 | 
            +
                  end
         | 
| 104 | 
            +
                end
         | 
| 105 | 
            +
             | 
| 106 | 
            +
                describe "floating path" do
         | 
| 107 | 
            +
                  before do
         | 
| 108 | 
            +
                    @each_record_xpath = "//record"
         | 
| 109 | 
            +
                  end
         | 
| 110 | 
            +
             | 
| 111 | 
            +
                  it "reads" do
         | 
| 112 | 
            +
                    shared_tests
         | 
| 113 | 
            +
                  end
         | 
| 114 | 
            +
                end
         | 
| 115 | 
            +
              end
         | 
| 116 | 
            +
             | 
| 117 | 
            +
             | 
| 118 | 
            +
              def shared_tests
         | 
| 119 | 
            +
                @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
         | 
| 120 | 
            +
                  "nokogiri.namespaces" => @namespaces,
         | 
| 121 | 
            +
                  "nokogiri.each_record_xpath" => @each_record_xpath
         | 
| 122 | 
            +
                })
         | 
| 123 | 
            +
             | 
| 124 | 
            +
                yielded_records = []
         | 
| 125 | 
            +
                @reader.each { |record|
         | 
| 126 | 
            +
                  yielded_records << record
         | 
| 127 | 
            +
                }
         | 
| 128 | 
            +
             | 
| 129 | 
            +
             | 
| 130 | 
            +
                manually_extracted = Nokogiri::XML.parse(File.open(@xml_sample_path)).xpath(@each_record_xpath, @namespaces)
         | 
| 131 | 
            +
                manually_extracted.collect do |node|
         | 
| 132 | 
            +
                  # nokogiri makes it so hard to reliably get an Element to serialize to XML with all
         | 
| 133 | 
            +
                  # its inherited namespace declarations. :(  We're only doing this for testing purposes
         | 
| 134 | 
            +
                  # anyway.  This may not handle everything, but handles what we need in the test right now
         | 
| 135 | 
            +
                  if node.namespace
         | 
| 136 | 
            +
                    node["xmlns"] = node.namespace.href
         | 
| 137 | 
            +
                  end
         | 
| 138 | 
            +
                end
         | 
| 139 | 
            +
             | 
| 140 | 
            +
                assert_length manually_extracted.size, yielded_records
         | 
| 141 | 
            +
                assert yielded_records.all? {|r| r.kind_of? Nokogiri::XML::Document }
         | 
| 142 | 
            +
                assert_equal manually_extracted.collect(&:to_xml), yielded_records.collect(&:root).collect(&:to_xml)
         | 
| 143 | 
            +
              end
         | 
| 144 | 
            +
             | 
| 145 | 
            +
              describe "without each_record_xpath" do
         | 
| 146 | 
            +
                before do
         | 
| 147 | 
            +
                  @xml_sample_path = support_file_path("namespace-test.xml")
         | 
| 148 | 
            +
                end
         | 
| 149 | 
            +
                it "yields whole file as one record" do
         | 
| 150 | 
            +
                  @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {})
         | 
| 151 | 
            +
             | 
| 152 | 
            +
                  yielded_records = @reader.to_a
         | 
| 153 | 
            +
             | 
| 154 | 
            +
                  assert_length 1, yielded_records
         | 
| 155 | 
            +
                  assert_equal Nokogiri::XML.parse(File.open(@xml_sample_path)).to_xml, yielded_records.first.to_xml
         | 
| 156 | 
            +
                end
         | 
| 157 | 
            +
              end
         | 
| 158 | 
            +
            end
         | 
| @@ -0,0 +1,23 @@ | |
| 1 | 
            +
            require 'test_helper'
         | 
| 2 | 
            +
            require 'traject/oai_pmh_nokogiri_reader'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            describe "Traject::OaiPmhNokogiriReader" do
         | 
| 5 | 
            +
             | 
| 6 | 
            +
              it "smoke test" do
         | 
| 7 | 
            +
                @reader = Traject::OaiPmhNokogiriReader.new(nil,
         | 
| 8 | 
            +
                  "oai_pmh.start_url" => "http://example.com/oai?verb=ListRecords&metadataPrefix=oai_dc"
         | 
| 9 | 
            +
                )
         | 
| 10 | 
            +
             | 
| 11 | 
            +
                fetched = @reader.to_a
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                assert_length 2, fetched
         | 
| 14 | 
            +
              end
         | 
| 15 | 
            +
             | 
| 16 | 
            +
              before do
         | 
| 17 | 
            +
                stub_request(:get, "http://example.com/oai?metadataPrefix=oai_dc&verb=ListRecords").
         | 
| 18 | 
            +
                  to_return(status: 200, body: File.read(support_file_path("oai-pmh-one-record-first.xml")))
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                stub_request(:get, "http://example.com/oai?resumptionToken=dummy_resumption&verb=ListRecords").
         | 
| 21 | 
            +
                  to_return(status: 200, body: File.read(support_file_path("oai-pmh-one-record-2.xml")))
         | 
| 22 | 
            +
              end
         | 
| 23 | 
            +
            end
         |