traject 2.3.4 → 3.0.0.alpha.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +16 -9
  3. data/CHANGES.md +74 -1
  4. data/Gemfile +2 -1
  5. data/README.md +104 -53
  6. data/Rakefile +8 -1
  7. data/doc/indexing_rules.md +79 -63
  8. data/doc/programmatic_use.md +218 -0
  9. data/doc/settings.md +28 -1
  10. data/doc/xml.md +134 -0
  11. data/lib/traject.rb +5 -0
  12. data/lib/traject/array_writer.rb +34 -0
  13. data/lib/traject/command_line.rb +18 -22
  14. data/lib/traject/debug_writer.rb +2 -5
  15. data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
  16. data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
  17. data/lib/traject/indexer.rb +321 -92
  18. data/lib/traject/indexer/context.rb +39 -13
  19. data/lib/traject/indexer/marc_indexer.rb +30 -0
  20. data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
  21. data/lib/traject/indexer/settings.rb +36 -53
  22. data/lib/traject/indexer/step.rb +27 -33
  23. data/lib/traject/macros/marc21.rb +37 -12
  24. data/lib/traject/macros/nokogiri_macros.rb +43 -0
  25. data/lib/traject/macros/transformation.rb +162 -0
  26. data/lib/traject/marc_extractor.rb +2 -0
  27. data/lib/traject/ndj_reader.rb +1 -1
  28. data/lib/traject/nokogiri_reader.rb +179 -0
  29. data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
  30. data/lib/traject/solr_json_writer.rb +19 -12
  31. data/lib/traject/thread_pool.rb +13 -0
  32. data/lib/traject/util.rb +14 -2
  33. data/lib/traject/version.rb +1 -1
  34. data/test/debug_writer_test.rb +3 -3
  35. data/test/delimited_writer_test.rb +3 -3
  36. data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
  37. data/test/indexer/context_test.rb +23 -13
  38. data/test/indexer/error_handler_test.rb +59 -0
  39. data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
  40. data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
  41. data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
  42. data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
  43. data/test/indexer/macros/to_field_test.rb +2 -2
  44. data/test/indexer/macros/transformation_test.rb +177 -0
  45. data/test/indexer/map_record_test.rb +2 -3
  46. data/test/indexer/nokogiri_indexer_test.rb +103 -0
  47. data/test/indexer/process_record_test.rb +55 -0
  48. data/test/indexer/process_with_test.rb +148 -0
  49. data/test/indexer/read_write_test.rb +52 -2
  50. data/test/indexer/settings_test.rb +34 -24
  51. data/test/indexer/to_field_test.rb +27 -2
  52. data/test/marc_extractor_test.rb +7 -7
  53. data/test/marc_reader_test.rb +4 -4
  54. data/test/nokogiri_reader_test.rb +158 -0
  55. data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
  56. data/test/solr_json_writer_test.rb +24 -28
  57. data/test/test_helper.rb +8 -2
  58. data/test/test_support/namespace-test.xml +7 -0
  59. data/test/test_support/nokogiri_demo_config.rb +17 -0
  60. data/test/test_support/oai-pmh-one-record-2.xml +24 -0
  61. data/test/test_support/oai-pmh-one-record-first.xml +24 -0
  62. data/test/test_support/sample-oai-no-namespace.xml +197 -0
  63. data/test/test_support/sample-oai-pmh.xml +197 -0
  64. data/test/thread_pool_test.rb +38 -0
  65. data/test/translation_map_test.rb +3 -3
  66. data/test/translation_maps/ruby_map.rb +2 -1
  67. data/test/translation_maps/yaml_map.yaml +2 -1
  68. data/traject.gemspec +4 -11
  69. metadata +92 -6
@@ -25,7 +25,7 @@ memory_writer_class = Class.new do
25
25
  describe "Traject::Indexer#process" do
26
26
  before do
27
27
  # no threading for these tests
28
- @indexer = Traject::Indexer.new("processing_thread_pool" => nil)
28
+ @indexer = Traject::Indexer::MarcIndexer.new("processing_thread_pool" => nil)
29
29
  @indexer.writer_class = memory_writer_class
30
30
  @file = File.open(support_file_path "test_data.utf8.mrc")
31
31
  end
@@ -68,7 +68,7 @@ describe "Traject::Indexer#process" do
68
68
 
69
69
  require 'traject/null_writer'
70
70
  it "calls after_processing after processing" do
71
- @indexer = Traject::Indexer.new(
71
+ @indexer = Traject::Indexer::MarcIndexer.new(
72
72
  "writer_class_name" => "Traject::NullWriter"
73
73
  )
74
74
  @file = File.open(support_file_path "test_data.utf8.mrc")
@@ -87,6 +87,37 @@ describe "Traject::Indexer#process" do
87
87
  assert_equal [:one, :two], called, "Both after_processing hooks called, in order"
88
88
  end
89
89
 
90
+ it "calls after_processing from #run_after_processing_steps" do
91
+ @indexer = Traject::Indexer.new(
92
+ "writer_class_name" => "Traject::NullWriter"
93
+ )
94
+ @file = File.open(support_file_path "test_data.utf8.mrc")
95
+
96
+ called = []
97
+
98
+ @indexer.after_processing do
99
+ called << :one
100
+ end
101
+ @indexer.after_processing do
102
+ called << :two
103
+ end
104
+
105
+ @indexer.run_after_processing_steps
106
+ assert_equal [:one, :two], called, "Both after_processing hooks called, in order"
107
+ end
108
+
109
+ it "can't be run twice" do
110
+ @file = File.open(support_file_path "test_data.utf8.mrc")
111
+ @indexer = Traject::Indexer::MarcIndexer.new(
112
+ "writer_class_name" => "Traject::NullWriter"
113
+ )
114
+ @indexer.process(@file)
115
+
116
+ assert_raises Traject::Indexer::CompletedStateError do
117
+ @indexer.process(@file)
118
+ end
119
+ end
120
+
90
121
  describe "demo_config.rb" do
91
122
  before do
92
123
  @indexer = Traject::Indexer.new(
@@ -102,4 +133,23 @@ describe "Traject::Indexer#process" do
102
133
  end
103
134
  end
104
135
 
136
+ describe "multi stream" do
137
+ before do
138
+ @file2 = File.open(support_file_path "george_eliot.marc")
139
+ @file1 = File.open(support_file_path "musical_cage.marc")
140
+ @indexer = Traject::Indexer::MarcIndexer.new do
141
+ self.writer_class = memory_writer_class
142
+ to_field "title", extract_marc("245")
143
+ end
144
+ end
145
+
146
+ it "parses and loads" do
147
+ @indexer.process([@file1, @file2])
148
+ # kinda ridic, yeah.
149
+ output_hashes = memory_writer_class.class_variable_get("@@last_writer_settings")["memory_writer.added"].collect(&:output_hash)
150
+
151
+ assert_length 2, output_hashes
152
+ assert output_hashes.all? { |hash| hash["title"].length > 0 }
153
+ end
154
+ end
105
155
  end
@@ -5,10 +5,10 @@ describe "Traject::Indexer#settings" do
5
5
  @indexer = Traject::Indexer.new
6
6
  end
7
7
 
8
- it "starts out a Hash, that can fill in it's defaults" do
8
+ it "starts out a Hash, that uses it's defaults" do
9
9
  assert_kind_of Hash, @indexer.settings
10
10
 
11
- Traject::Indexer::Settings.defaults.each_pair do |key, value|
11
+ Traject::Indexer.default_settings.each_pair do |key, value|
12
12
  assert_equal value, @indexer.settings[key]
13
13
  end
14
14
  end
@@ -16,13 +16,15 @@ describe "Traject::Indexer#settings" do
16
16
  it "can fill_in_defaults!" do
17
17
  @indexer.settings.fill_in_defaults!
18
18
 
19
- assert_equal Traject::Indexer::Settings.defaults, @indexer.settings
19
+ assert_equal Traject::Indexer.default_settings, @indexer.settings
20
20
  end
21
21
 
22
22
  it "doesn't overwrite with fill_in_defaults!" do
23
- key = Traject::Indexer::Settings.defaults.keys.first
23
+ key = Traject::Indexer.default_settings.keys.first
24
24
  @indexer.settings[ key ] = "MINE KEEP IT"
25
25
 
26
+ assert_equal "MINE KEEP IT", @indexer.settings[key]
27
+
26
28
  @indexer.settings.fill_in_defaults!
27
29
 
28
30
  assert_equal "MINE KEEP IT", @indexer.settings[key]
@@ -36,7 +38,7 @@ describe "Traject::Indexer#settings" do
36
38
  end
37
39
 
38
40
  it "has settings DSL to set" do
39
- @indexer.instance_eval do
41
+ @indexer.configure do
40
42
  settings do
41
43
  store "foo", "foo"
42
44
  end
@@ -124,28 +126,36 @@ describe "Traject::Indexer#settings" do
124
126
  assert_equal( {"a" => "a", "password" => "[hidden]", "some_password" => "[hidden]", "some.password" => "[hidden]"}, parsed)
125
127
  end
126
128
  end
127
-
128
- describe "JRuby / MRI" do
129
- before do
130
- @indexer = Traject::Indexer.new
131
- end
132
-
133
- it "has the right indexer name" do
134
- if defined? JRUBY_VERSION
135
- assert_equal "Traject::Marc4JReader", @indexer.settings['reader_class_name']
136
- else
137
- assert_equal "Traject::MarcReader", @indexer.settings['reader_class_name']
129
+
130
+ describe "order of precedence" do
131
+ it "args beat 'provides'" do
132
+ # args come from command-line in typical use
133
+
134
+ @indexer = Traject::Indexer.new(sample: "from args")
135
+ @indexer.settings do
136
+ provide :sample, "from config"
138
137
  end
138
+ @indexer.settings.fill_in_defaults!
139
+
140
+ assert_equal "from args", @indexer.settings["sample"]
139
141
  end
140
-
141
- # This next one has the added effect of making sure the correct class
142
- # has actually been loaded -- otherwise the constant wouldn't be available
143
- it "has the correct default indexer class based on platform" do
144
- if defined? JRUBY_VERSION
145
- assert_equal Traject::Marc4JReader, @indexer.reader_class
146
- else
147
- assert_equal Traject::MarcReader, @indexer.reader_class
142
+
143
+ it "args beat defaults" do
144
+ key = Traject::Indexer.default_settings.keys.first
145
+ @indexer = Traject::Indexer.new(key.to_sym => "from args")
146
+ @indexer.settings.fill_in_defaults!
147
+
148
+ assert_equal "from args", @indexer.settings[key]
149
+ end
150
+
151
+ it "provide beats defaults" do
152
+ key = Traject::Indexer.default_settings.keys.first
153
+ @indexer.settings do
154
+ provide key, "from config"
148
155
  end
156
+ @indexer.settings.fill_in_defaults!
157
+
158
+ assert_equal "from config", @indexer.settings[key]
149
159
  end
150
160
  end
151
161
 
@@ -69,9 +69,34 @@ describe "Traject::Indexer.to_field" do
69
69
  assert_equal ['hello'], output['foo']
70
70
  end
71
71
 
72
+ describe "supports multiple procs" do
73
+ it "with no block" do
74
+ @indexer.to_field "foo",
75
+ lambda {|record, acc| acc << "one"},
76
+ lambda {|record, acc| acc << "two"},
77
+ lambda {|record, acc| acc << "three"}
72
78
 
73
- end
74
-
79
+ output = @indexer.map_record('never looked at')
80
+ assert_equal ['one', 'two', 'three'], output['foo']
81
+ end
75
82
 
83
+ it "with a block too" do
84
+ @indexer.to_field "foo",
85
+ lambda {|record, acc| acc << "one"},
86
+ lambda {|record, acc| acc << "two"} do |record, acc|
87
+ acc << "three"
88
+ end
76
89
 
90
+ output = @indexer.map_record('never looked at')
91
+ assert_equal ['one', 'two', 'three'], output['foo']
92
+ end
93
+ end
77
94
 
95
+ describe "with an array argument" do
96
+ it "indexes to multiple fields" do
97
+ @indexer.to_field ["field1", "field2", "field3"], lambda {|rec, acc| acc << "value" }
98
+ output = @indexer.map_record('never looked at')
99
+ assert_equal({ "field1" => ["value"], "field2" => ["value"], "field3" => ["value"] }, output)
100
+ end
101
+ end
102
+ end
@@ -28,7 +28,7 @@ describe "Traject::MarcExtractor" do
28
28
 
29
29
  assert_kind_of Array, spec.subfields
30
30
  end
31
-
31
+
32
32
  it "parses specset from an array" do
33
33
  parsed = Traject::MarcExtractor::SpecSet.new(%w[245abcde 810 700|*4|bcd])
34
34
  assert_equal parsed.tags, %w[245 810 700]
@@ -60,17 +60,17 @@ describe "Traject::MarcExtractor" do
60
60
  assert_equal "4", spec700.indicator2
61
61
  assert_equal %w{b c d}, spec700.subfields
62
62
  end
63
-
63
+
64
64
  it "parses from an array" do
65
65
  parsed = Traject::MarcExtractor::Spec.hash_from_string(%w[245abcde 810 700|*4|bcd])
66
- spec245 = parsed['245'].first
67
- spec810 = parsed['810'].first
68
- spec700 = parsed['700'].first
66
+ _spec245 = parsed['245'].first
67
+ _spec810 = parsed['810'].first
68
+ _spec700 = parsed['700'].first
69
69
 
70
70
  assert_length 3, parsed
71
71
  end
72
-
73
-
72
+
73
+
74
74
 
75
75
  it "parses fixed field byte offsets" do
76
76
  parsed = Traject::MarcExtractor::Spec.hash_from_string("005[5]:008[7-10]")
@@ -50,13 +50,13 @@ describe "Traject::MarcReader" do
50
50
  a245a = array.first['245']['a']
51
51
 
52
52
  assert a245a.encoding.name, "UTF-8"
53
- assert a245a.valid_encoding?
53
+ assert a245a.valid_encoding?
54
54
  assert_equal "Por uma outra globalização :", a245a
55
55
  end
56
56
 
57
57
  it "replaces unicode character reference in Marc8 transcode" do
58
58
  file = File.new(support_file_path("escaped_character_reference.marc8.marc"))
59
-
59
+
60
60
  settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC-8") # binary type is default
61
61
  record = Traject::MarcReader.new(file, settings).to_a.first
62
62
 
@@ -67,7 +67,7 @@ describe "Traject::MarcReader" do
67
67
  file = File.new(support_file_path "one-marc8.mrc")
68
68
  settings = Traject::Indexer::Settings.new("marc_source.encoding" => "ADFADFADF")
69
69
  assert_raises(ArgumentError) do
70
- record = Traject::MarcReader.new(file, settings).to_a.first
70
+ _record = Traject::MarcReader.new(file, settings).to_a.first
71
71
  end
72
72
  end
73
73
 
@@ -78,7 +78,7 @@ describe "Traject::MarcReader" do
78
78
  reader = Traject::MarcReader.new(file, settings)
79
79
 
80
80
  record = reader.to_a.first
81
-
81
+
82
82
  value = record['300']['a']
83
83
 
84
84
  assert_equal value.encoding.name, "UTF-8"
@@ -0,0 +1,158 @@
1
+ require 'test_helper'
2
+ require 'traject/nokogiri_reader'
3
+
4
+ describe "Traject::NokogiriReader" do
5
+ describe "with namespaces" do
6
+ before do
7
+ @namespaces = { "oai" => "http://www.openarchives.org/OAI/2.0/" }
8
+ @xml_sample_path = support_file_path("sample-oai-pmh.xml")
9
+ end
10
+
11
+ describe "invalid settings" do
12
+ it "default_namespaces not a hash raises" do
13
+ error = assert_raises(ArgumentError) {
14
+ @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
15
+ "nokogiri.namespaces" => "i am not a hash",
16
+ })
17
+ }
18
+ assert(error.message =~ /nokogiri.namespaces must be a hash/)
19
+ end
20
+
21
+ it "each_record_xpath with unregistered prefix raises" do
22
+ error = assert_raises(ArgumentError) {
23
+ @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
24
+ "nokogiri.namespaces" => @namespaces,
25
+ "nokogiri.each_record_xpath" => "//foo:bar"
26
+ })
27
+ }
28
+ assert(error.message =~ %r{Can't find namespace prefix 'foo' in '//foo:bar'})
29
+ end
30
+ end
31
+
32
+ describe "fixed path" do
33
+ before do
34
+ @each_record_xpath = "/oai:OAI-PMH/oai:ListRecords/oai:record"
35
+ end
36
+
37
+ it "reads" do
38
+ shared_tests
39
+ end
40
+ end
41
+
42
+ describe "floating path" do
43
+ before do
44
+ @each_record_xpath = "//oai:record"
45
+ end
46
+
47
+ it "reads" do
48
+ shared_tests
49
+ end
50
+ end
51
+
52
+
53
+ describe "extra_xpath_hooks" do
54
+ it "catches oai-pmh resumption token" do
55
+ @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
56
+ "nokogiri.namespaces" => @namespaces,
57
+ "nokogiri.each_record_xpath" => "//oai:record",
58
+ "nokogiri_reader.extra_xpath_hooks" => {
59
+ "//oai:resumptionToken" => lambda do |node, clipboard|
60
+ clipboard[:resumptionToken] = node.text
61
+ end
62
+ }
63
+ })
64
+ _records = @reader.to_a
65
+ assert_equal "oai_dc.f(2018-05-03T18:09:08Z).u(2018-06-15T19:25:21Z).t(6387):100", @reader.clipboard[:resumptionToken]
66
+ end
67
+ end
68
+
69
+ describe "outer namespaces" do
70
+ it "are preserved" do
71
+ @reader = Traject::NokogiriReader.new(File.open(support_file_path("namespace-test.xml")), {
72
+ "nokogiri.namespaces" => { mytop: "http://example.org/top" },
73
+ "nokogiri.each_record_xpath" => "//mytop:record"
74
+ })
75
+ yielded_records = []
76
+ @reader.each { |record|
77
+ yielded_records << record
78
+ }
79
+
80
+ assert yielded_records.length > 0
81
+
82
+ expected_namespaces = {"xmlns"=>"http://example.org/top", "xmlns:a"=>"http://example.org/a", "xmlns:b"=>"http://example.org/b"}
83
+ yielded_records.each do |rec|
84
+ assert_equal expected_namespaces, rec.namespaces
85
+ end
86
+ end
87
+ end
88
+ end
89
+
90
+ describe "without namespaces" do
91
+ before do
92
+ @namespaces = {}
93
+ @xml_sample_path = support_file_path("sample-oai-no-namespace.xml")
94
+ end
95
+
96
+ describe "fixed path" do
97
+ before do
98
+ @each_record_xpath = "/OAI-PMH/ListRecords/record"
99
+ end
100
+
101
+ it "reads" do
102
+ shared_tests
103
+ end
104
+ end
105
+
106
+ describe "floating path" do
107
+ before do
108
+ @each_record_xpath = "//record"
109
+ end
110
+
111
+ it "reads" do
112
+ shared_tests
113
+ end
114
+ end
115
+ end
116
+
117
+
118
+ def shared_tests
119
+ @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
120
+ "nokogiri.namespaces" => @namespaces,
121
+ "nokogiri.each_record_xpath" => @each_record_xpath
122
+ })
123
+
124
+ yielded_records = []
125
+ @reader.each { |record|
126
+ yielded_records << record
127
+ }
128
+
129
+
130
+ manually_extracted = Nokogiri::XML.parse(File.open(@xml_sample_path)).xpath(@each_record_xpath, @namespaces)
131
+ manually_extracted.collect do |node|
132
+ # nokogiri makes it so hard to reliably get an Element to serialize to XML with all
133
+ # its inherited namespace declarations. :( We're only doing this for testing purposes
134
+ # anyway. This may not handle everything, but handles what we need in the test right now
135
+ if node.namespace
136
+ node["xmlns"] = node.namespace.href
137
+ end
138
+ end
139
+
140
+ assert_length manually_extracted.size, yielded_records
141
+ assert yielded_records.all? {|r| r.kind_of? Nokogiri::XML::Document }
142
+ assert_equal manually_extracted.collect(&:to_xml), yielded_records.collect(&:root).collect(&:to_xml)
143
+ end
144
+
145
+ describe "without each_record_xpath" do
146
+ before do
147
+ @xml_sample_path = support_file_path("namespace-test.xml")
148
+ end
149
+ it "yields whole file as one record" do
150
+ @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {})
151
+
152
+ yielded_records = @reader.to_a
153
+
154
+ assert_length 1, yielded_records
155
+ assert_equal Nokogiri::XML.parse(File.open(@xml_sample_path)).to_xml, yielded_records.first.to_xml
156
+ end
157
+ end
158
+ end
@@ -0,0 +1,23 @@
1
+ require 'test_helper'
2
+ require 'traject/oai_pmh_nokogiri_reader'
3
+
4
+ describe "Traject::OaiPmhNokogiriReader" do
5
+
6
+ it "smoke test" do
7
+ @reader = Traject::OaiPmhNokogiriReader.new(nil,
8
+ "oai_pmh.start_url" => "http://example.com/oai?verb=ListRecords&metadataPrefix=oai_dc"
9
+ )
10
+
11
+ fetched = @reader.to_a
12
+
13
+ assert_length 2, fetched
14
+ end
15
+
16
+ before do
17
+ stub_request(:get, "http://example.com/oai?metadataPrefix=oai_dc&verb=ListRecords").
18
+ to_return(status: 200, body: File.read(support_file_path("oai-pmh-one-record-first.xml")))
19
+
20
+ stub_request(:get, "http://example.com/oai?resumptionToken=dummy_resumption&verb=ListRecords").
21
+ to_return(status: 200, body: File.read(support_file_path("oai-pmh-one-record-2.xml")))
22
+ end
23
+ end