traject 2.3.4 → 3.0.0.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. checksums.yaml +5 -5
  2. data/.travis.yml +16 -9
  3. data/CHANGES.md +74 -1
  4. data/Gemfile +2 -1
  5. data/README.md +104 -53
  6. data/Rakefile +8 -1
  7. data/doc/indexing_rules.md +79 -63
  8. data/doc/programmatic_use.md +218 -0
  9. data/doc/settings.md +28 -1
  10. data/doc/xml.md +134 -0
  11. data/lib/traject.rb +5 -0
  12. data/lib/traject/array_writer.rb +34 -0
  13. data/lib/traject/command_line.rb +18 -22
  14. data/lib/traject/debug_writer.rb +2 -5
  15. data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
  16. data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
  17. data/lib/traject/indexer.rb +321 -92
  18. data/lib/traject/indexer/context.rb +39 -13
  19. data/lib/traject/indexer/marc_indexer.rb +30 -0
  20. data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
  21. data/lib/traject/indexer/settings.rb +36 -53
  22. data/lib/traject/indexer/step.rb +27 -33
  23. data/lib/traject/macros/marc21.rb +37 -12
  24. data/lib/traject/macros/nokogiri_macros.rb +43 -0
  25. data/lib/traject/macros/transformation.rb +162 -0
  26. data/lib/traject/marc_extractor.rb +2 -0
  27. data/lib/traject/ndj_reader.rb +1 -1
  28. data/lib/traject/nokogiri_reader.rb +179 -0
  29. data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
  30. data/lib/traject/solr_json_writer.rb +19 -12
  31. data/lib/traject/thread_pool.rb +13 -0
  32. data/lib/traject/util.rb +14 -2
  33. data/lib/traject/version.rb +1 -1
  34. data/test/debug_writer_test.rb +3 -3
  35. data/test/delimited_writer_test.rb +3 -3
  36. data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
  37. data/test/indexer/context_test.rb +23 -13
  38. data/test/indexer/error_handler_test.rb +59 -0
  39. data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
  40. data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
  41. data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
  42. data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
  43. data/test/indexer/macros/to_field_test.rb +2 -2
  44. data/test/indexer/macros/transformation_test.rb +177 -0
  45. data/test/indexer/map_record_test.rb +2 -3
  46. data/test/indexer/nokogiri_indexer_test.rb +103 -0
  47. data/test/indexer/process_record_test.rb +55 -0
  48. data/test/indexer/process_with_test.rb +148 -0
  49. data/test/indexer/read_write_test.rb +52 -2
  50. data/test/indexer/settings_test.rb +34 -24
  51. data/test/indexer/to_field_test.rb +27 -2
  52. data/test/marc_extractor_test.rb +7 -7
  53. data/test/marc_reader_test.rb +4 -4
  54. data/test/nokogiri_reader_test.rb +158 -0
  55. data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
  56. data/test/solr_json_writer_test.rb +24 -28
  57. data/test/test_helper.rb +8 -2
  58. data/test/test_support/namespace-test.xml +7 -0
  59. data/test/test_support/nokogiri_demo_config.rb +17 -0
  60. data/test/test_support/oai-pmh-one-record-2.xml +24 -0
  61. data/test/test_support/oai-pmh-one-record-first.xml +24 -0
  62. data/test/test_support/sample-oai-no-namespace.xml +197 -0
  63. data/test/test_support/sample-oai-pmh.xml +197 -0
  64. data/test/thread_pool_test.rb +38 -0
  65. data/test/translation_map_test.rb +3 -3
  66. data/test/translation_maps/ruby_map.rb +2 -1
  67. data/test/translation_maps/yaml_map.yaml +2 -1
  68. data/traject.gemspec +4 -11
  69. metadata +92 -6
@@ -25,7 +25,7 @@ memory_writer_class = Class.new do
25
25
  describe "Traject::Indexer#process" do
26
26
  before do
27
27
  # no threading for these tests
28
- @indexer = Traject::Indexer.new("processing_thread_pool" => nil)
28
+ @indexer = Traject::Indexer::MarcIndexer.new("processing_thread_pool" => nil)
29
29
  @indexer.writer_class = memory_writer_class
30
30
  @file = File.open(support_file_path "test_data.utf8.mrc")
31
31
  end
@@ -68,7 +68,7 @@ describe "Traject::Indexer#process" do
68
68
 
69
69
  require 'traject/null_writer'
70
70
  it "calls after_processing after processing" do
71
- @indexer = Traject::Indexer.new(
71
+ @indexer = Traject::Indexer::MarcIndexer.new(
72
72
  "writer_class_name" => "Traject::NullWriter"
73
73
  )
74
74
  @file = File.open(support_file_path "test_data.utf8.mrc")
@@ -87,6 +87,37 @@ describe "Traject::Indexer#process" do
87
87
  assert_equal [:one, :two], called, "Both after_processing hooks called, in order"
88
88
  end
89
89
 
90
+ it "calls after_processing from #run_after_processing_steps" do
91
+ @indexer = Traject::Indexer.new(
92
+ "writer_class_name" => "Traject::NullWriter"
93
+ )
94
+ @file = File.open(support_file_path "test_data.utf8.mrc")
95
+
96
+ called = []
97
+
98
+ @indexer.after_processing do
99
+ called << :one
100
+ end
101
+ @indexer.after_processing do
102
+ called << :two
103
+ end
104
+
105
+ @indexer.run_after_processing_steps
106
+ assert_equal [:one, :two], called, "Both after_processing hooks called, in order"
107
+ end
108
+
109
+ it "can't be run twice" do
110
+ @file = File.open(support_file_path "test_data.utf8.mrc")
111
+ @indexer = Traject::Indexer::MarcIndexer.new(
112
+ "writer_class_name" => "Traject::NullWriter"
113
+ )
114
+ @indexer.process(@file)
115
+
116
+ assert_raises Traject::Indexer::CompletedStateError do
117
+ @indexer.process(@file)
118
+ end
119
+ end
120
+
90
121
  describe "demo_config.rb" do
91
122
  before do
92
123
  @indexer = Traject::Indexer.new(
@@ -102,4 +133,23 @@ describe "Traject::Indexer#process" do
102
133
  end
103
134
  end
104
135
 
136
+ describe "multi stream" do
137
+ before do
138
+ @file2 = File.open(support_file_path "george_eliot.marc")
139
+ @file1 = File.open(support_file_path "musical_cage.marc")
140
+ @indexer = Traject::Indexer::MarcIndexer.new do
141
+ self.writer_class = memory_writer_class
142
+ to_field "title", extract_marc("245")
143
+ end
144
+ end
145
+
146
+ it "parses and loads" do
147
+ @indexer.process([@file1, @file2])
148
+ # kinda ridic, yeah.
149
+ output_hashes = memory_writer_class.class_variable_get("@@last_writer_settings")["memory_writer.added"].collect(&:output_hash)
150
+
151
+ assert_length 2, output_hashes
152
+ assert output_hashes.all? { |hash| hash["title"].length > 0 }
153
+ end
154
+ end
105
155
  end
@@ -5,10 +5,10 @@ describe "Traject::Indexer#settings" do
5
5
  @indexer = Traject::Indexer.new
6
6
  end
7
7
 
8
- it "starts out a Hash, that can fill in it's defaults" do
8
+ it "starts out a Hash, that uses it's defaults" do
9
9
  assert_kind_of Hash, @indexer.settings
10
10
 
11
- Traject::Indexer::Settings.defaults.each_pair do |key, value|
11
+ Traject::Indexer.default_settings.each_pair do |key, value|
12
12
  assert_equal value, @indexer.settings[key]
13
13
  end
14
14
  end
@@ -16,13 +16,15 @@ describe "Traject::Indexer#settings" do
16
16
  it "can fill_in_defaults!" do
17
17
  @indexer.settings.fill_in_defaults!
18
18
 
19
- assert_equal Traject::Indexer::Settings.defaults, @indexer.settings
19
+ assert_equal Traject::Indexer.default_settings, @indexer.settings
20
20
  end
21
21
 
22
22
  it "doesn't overwrite with fill_in_defaults!" do
23
- key = Traject::Indexer::Settings.defaults.keys.first
23
+ key = Traject::Indexer.default_settings.keys.first
24
24
  @indexer.settings[ key ] = "MINE KEEP IT"
25
25
 
26
+ assert_equal "MINE KEEP IT", @indexer.settings[key]
27
+
26
28
  @indexer.settings.fill_in_defaults!
27
29
 
28
30
  assert_equal "MINE KEEP IT", @indexer.settings[key]
@@ -36,7 +38,7 @@ describe "Traject::Indexer#settings" do
36
38
  end
37
39
 
38
40
  it "has settings DSL to set" do
39
- @indexer.instance_eval do
41
+ @indexer.configure do
40
42
  settings do
41
43
  store "foo", "foo"
42
44
  end
@@ -124,28 +126,36 @@ describe "Traject::Indexer#settings" do
124
126
  assert_equal( {"a" => "a", "password" => "[hidden]", "some_password" => "[hidden]", "some.password" => "[hidden]"}, parsed)
125
127
  end
126
128
  end
127
-
128
- describe "JRuby / MRI" do
129
- before do
130
- @indexer = Traject::Indexer.new
131
- end
132
-
133
- it "has the right indexer name" do
134
- if defined? JRUBY_VERSION
135
- assert_equal "Traject::Marc4JReader", @indexer.settings['reader_class_name']
136
- else
137
- assert_equal "Traject::MarcReader", @indexer.settings['reader_class_name']
129
+
130
+ describe "order of precedence" do
131
+ it "args beat 'provides'" do
132
+ # args come from command-line in typical use
133
+
134
+ @indexer = Traject::Indexer.new(sample: "from args")
135
+ @indexer.settings do
136
+ provide :sample, "from config"
138
137
  end
138
+ @indexer.settings.fill_in_defaults!
139
+
140
+ assert_equal "from args", @indexer.settings["sample"]
139
141
  end
140
-
141
- # This next one has the added effect of making sure the correct class
142
- # has actually been loaded -- otherwise the constant wouldn't be available
143
- it "has the correct default indexer class based on platform" do
144
- if defined? JRUBY_VERSION
145
- assert_equal Traject::Marc4JReader, @indexer.reader_class
146
- else
147
- assert_equal Traject::MarcReader, @indexer.reader_class
142
+
143
+ it "args beat defaults" do
144
+ key = Traject::Indexer.default_settings.keys.first
145
+ @indexer = Traject::Indexer.new(key.to_sym => "from args")
146
+ @indexer.settings.fill_in_defaults!
147
+
148
+ assert_equal "from args", @indexer.settings[key]
149
+ end
150
+
151
+ it "provide beats defaults" do
152
+ key = Traject::Indexer.default_settings.keys.first
153
+ @indexer.settings do
154
+ provide key, "from config"
148
155
  end
156
+ @indexer.settings.fill_in_defaults!
157
+
158
+ assert_equal "from config", @indexer.settings[key]
149
159
  end
150
160
  end
151
161
 
@@ -69,9 +69,34 @@ describe "Traject::Indexer.to_field" do
69
69
  assert_equal ['hello'], output['foo']
70
70
  end
71
71
 
72
+ describe "supports multiple procs" do
73
+ it "with no block" do
74
+ @indexer.to_field "foo",
75
+ lambda {|record, acc| acc << "one"},
76
+ lambda {|record, acc| acc << "two"},
77
+ lambda {|record, acc| acc << "three"}
72
78
 
73
- end
74
-
79
+ output = @indexer.map_record('never looked at')
80
+ assert_equal ['one', 'two', 'three'], output['foo']
81
+ end
75
82
 
83
+ it "with a block too" do
84
+ @indexer.to_field "foo",
85
+ lambda {|record, acc| acc << "one"},
86
+ lambda {|record, acc| acc << "two"} do |record, acc|
87
+ acc << "three"
88
+ end
76
89
 
90
+ output = @indexer.map_record('never looked at')
91
+ assert_equal ['one', 'two', 'three'], output['foo']
92
+ end
93
+ end
77
94
 
95
+ describe "with an array argument" do
96
+ it "indexes to multiple fields" do
97
+ @indexer.to_field ["field1", "field2", "field3"], lambda {|rec, acc| acc << "value" }
98
+ output = @indexer.map_record('never looked at')
99
+ assert_equal({ "field1" => ["value"], "field2" => ["value"], "field3" => ["value"] }, output)
100
+ end
101
+ end
102
+ end
@@ -28,7 +28,7 @@ describe "Traject::MarcExtractor" do
28
28
 
29
29
  assert_kind_of Array, spec.subfields
30
30
  end
31
-
31
+
32
32
  it "parses specset from an array" do
33
33
  parsed = Traject::MarcExtractor::SpecSet.new(%w[245abcde 810 700|*4|bcd])
34
34
  assert_equal parsed.tags, %w[245 810 700]
@@ -60,17 +60,17 @@ describe "Traject::MarcExtractor" do
60
60
  assert_equal "4", spec700.indicator2
61
61
  assert_equal %w{b c d}, spec700.subfields
62
62
  end
63
-
63
+
64
64
  it "parses from an array" do
65
65
  parsed = Traject::MarcExtractor::Spec.hash_from_string(%w[245abcde 810 700|*4|bcd])
66
- spec245 = parsed['245'].first
67
- spec810 = parsed['810'].first
68
- spec700 = parsed['700'].first
66
+ _spec245 = parsed['245'].first
67
+ _spec810 = parsed['810'].first
68
+ _spec700 = parsed['700'].first
69
69
 
70
70
  assert_length 3, parsed
71
71
  end
72
-
73
-
72
+
73
+
74
74
 
75
75
  it "parses fixed field byte offsets" do
76
76
  parsed = Traject::MarcExtractor::Spec.hash_from_string("005[5]:008[7-10]")
@@ -50,13 +50,13 @@ describe "Traject::MarcReader" do
50
50
  a245a = array.first['245']['a']
51
51
 
52
52
  assert a245a.encoding.name, "UTF-8"
53
- assert a245a.valid_encoding?
53
+ assert a245a.valid_encoding?
54
54
  assert_equal "Por uma outra globalização :", a245a
55
55
  end
56
56
 
57
57
  it "replaces unicode character reference in Marc8 transcode" do
58
58
  file = File.new(support_file_path("escaped_character_reference.marc8.marc"))
59
-
59
+
60
60
  settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC-8") # binary type is default
61
61
  record = Traject::MarcReader.new(file, settings).to_a.first
62
62
 
@@ -67,7 +67,7 @@ describe "Traject::MarcReader" do
67
67
  file = File.new(support_file_path "one-marc8.mrc")
68
68
  settings = Traject::Indexer::Settings.new("marc_source.encoding" => "ADFADFADF")
69
69
  assert_raises(ArgumentError) do
70
- record = Traject::MarcReader.new(file, settings).to_a.first
70
+ _record = Traject::MarcReader.new(file, settings).to_a.first
71
71
  end
72
72
  end
73
73
 
@@ -78,7 +78,7 @@ describe "Traject::MarcReader" do
78
78
  reader = Traject::MarcReader.new(file, settings)
79
79
 
80
80
  record = reader.to_a.first
81
-
81
+
82
82
  value = record['300']['a']
83
83
 
84
84
  assert_equal value.encoding.name, "UTF-8"
@@ -0,0 +1,158 @@
1
+ require 'test_helper'
2
+ require 'traject/nokogiri_reader'
3
+
4
+ describe "Traject::NokogiriReader" do
5
+ describe "with namespaces" do
6
+ before do
7
+ @namespaces = { "oai" => "http://www.openarchives.org/OAI/2.0/" }
8
+ @xml_sample_path = support_file_path("sample-oai-pmh.xml")
9
+ end
10
+
11
+ describe "invalid settings" do
12
+ it "default_namespaces not a hash raises" do
13
+ error = assert_raises(ArgumentError) {
14
+ @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
15
+ "nokogiri.namespaces" => "i am not a hash",
16
+ })
17
+ }
18
+ assert(error.message =~ /nokogiri.namespaces must be a hash/)
19
+ end
20
+
21
+ it "each_record_xpath with unregistered prefix raises" do
22
+ error = assert_raises(ArgumentError) {
23
+ @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
24
+ "nokogiri.namespaces" => @namespaces,
25
+ "nokogiri.each_record_xpath" => "//foo:bar"
26
+ })
27
+ }
28
+ assert(error.message =~ %r{Can't find namespace prefix 'foo' in '//foo:bar'})
29
+ end
30
+ end
31
+
32
+ describe "fixed path" do
33
+ before do
34
+ @each_record_xpath = "/oai:OAI-PMH/oai:ListRecords/oai:record"
35
+ end
36
+
37
+ it "reads" do
38
+ shared_tests
39
+ end
40
+ end
41
+
42
+ describe "floating path" do
43
+ before do
44
+ @each_record_xpath = "//oai:record"
45
+ end
46
+
47
+ it "reads" do
48
+ shared_tests
49
+ end
50
+ end
51
+
52
+
53
+ describe "extra_xpath_hooks" do
54
+ it "catches oai-pmh resumption token" do
55
+ @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
56
+ "nokogiri.namespaces" => @namespaces,
57
+ "nokogiri.each_record_xpath" => "//oai:record",
58
+ "nokogiri_reader.extra_xpath_hooks" => {
59
+ "//oai:resumptionToken" => lambda do |node, clipboard|
60
+ clipboard[:resumptionToken] = node.text
61
+ end
62
+ }
63
+ })
64
+ _records = @reader.to_a
65
+ assert_equal "oai_dc.f(2018-05-03T18:09:08Z).u(2018-06-15T19:25:21Z).t(6387):100", @reader.clipboard[:resumptionToken]
66
+ end
67
+ end
68
+
69
+ describe "outer namespaces" do
70
+ it "are preserved" do
71
+ @reader = Traject::NokogiriReader.new(File.open(support_file_path("namespace-test.xml")), {
72
+ "nokogiri.namespaces" => { mytop: "http://example.org/top" },
73
+ "nokogiri.each_record_xpath" => "//mytop:record"
74
+ })
75
+ yielded_records = []
76
+ @reader.each { |record|
77
+ yielded_records << record
78
+ }
79
+
80
+ assert yielded_records.length > 0
81
+
82
+ expected_namespaces = {"xmlns"=>"http://example.org/top", "xmlns:a"=>"http://example.org/a", "xmlns:b"=>"http://example.org/b"}
83
+ yielded_records.each do |rec|
84
+ assert_equal expected_namespaces, rec.namespaces
85
+ end
86
+ end
87
+ end
88
+ end
89
+
90
+ describe "without namespaces" do
91
+ before do
92
+ @namespaces = {}
93
+ @xml_sample_path = support_file_path("sample-oai-no-namespace.xml")
94
+ end
95
+
96
+ describe "fixed path" do
97
+ before do
98
+ @each_record_xpath = "/OAI-PMH/ListRecords/record"
99
+ end
100
+
101
+ it "reads" do
102
+ shared_tests
103
+ end
104
+ end
105
+
106
+ describe "floating path" do
107
+ before do
108
+ @each_record_xpath = "//record"
109
+ end
110
+
111
+ it "reads" do
112
+ shared_tests
113
+ end
114
+ end
115
+ end
116
+
117
+
118
+ def shared_tests
119
+ @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
120
+ "nokogiri.namespaces" => @namespaces,
121
+ "nokogiri.each_record_xpath" => @each_record_xpath
122
+ })
123
+
124
+ yielded_records = []
125
+ @reader.each { |record|
126
+ yielded_records << record
127
+ }
128
+
129
+
130
+ manually_extracted = Nokogiri::XML.parse(File.open(@xml_sample_path)).xpath(@each_record_xpath, @namespaces)
131
+ manually_extracted.collect do |node|
132
+ # nokogiri makes it so hard to reliably get an Element to serialize to XML with all
133
+ # it's inherited namespace declerations. :( We're only doing this for testing purposes
134
+ # anyway. This may not handle everything, but handles what we need in the test right now
135
+ if node.namespace
136
+ node["xmlns"] = node.namespace.href
137
+ end
138
+ end
139
+
140
+ assert_length manually_extracted.size, yielded_records
141
+ assert yielded_records.all? {|r| r.kind_of? Nokogiri::XML::Document }
142
+ assert_equal manually_extracted.collect(&:to_xml), yielded_records.collect(&:root).collect(&:to_xml)
143
+ end
144
+
145
+ describe "without each_record_xpath" do
146
+ before do
147
+ @xml_sample_path = support_file_path("namespace-test.xml")
148
+ end
149
+ it "yields whole file as one record" do
150
+ @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {})
151
+
152
+ yielded_records = @reader.to_a
153
+
154
+ assert_length 1, yielded_records
155
+ assert_equal Nokogiri::XML.parse(File.open(@xml_sample_path)).to_xml, yielded_records.first.to_xml
156
+ end
157
+ end
158
+ end
@@ -0,0 +1,23 @@
1
+ require 'test_helper'
2
+ require 'traject/oai_pmh_nokogiri_reader'
3
+
4
+ describe "Traject::OaiPmhNokogiriReader" do
5
+
6
+ it "smoke test" do
7
+ @reader = Traject::OaiPmhNokogiriReader.new(nil,
8
+ "oai_pmh.start_url" => "http://example.com/oai?verb=ListRecords&metadataPrefix=oai_dc"
9
+ )
10
+
11
+ fetched = @reader.to_a
12
+
13
+ assert_length 2, fetched
14
+ end
15
+
16
+ before do
17
+ stub_request(:get, "http://example.com/oai?metadataPrefix=oai_dc&verb=ListRecords").
18
+ to_return(status: 200, body: File.read(support_file_path("oai-pmh-one-record-first.xml")))
19
+
20
+ stub_request(:get, "http://example.com/oai?resumptionToken=dummy_resumption&verb=ListRecords").
21
+ to_return(status: 200, body: File.read(support_file_path("oai-pmh-one-record-2.xml")))
22
+ end
23
+ end