traject 2.3.4 → 3.0.0.alpha.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +16 -9
  3. data/CHANGES.md +74 -1
  4. data/Gemfile +2 -1
  5. data/README.md +104 -53
  6. data/Rakefile +8 -1
  7. data/doc/indexing_rules.md +79 -63
  8. data/doc/programmatic_use.md +218 -0
  9. data/doc/settings.md +28 -1
  10. data/doc/xml.md +134 -0
  11. data/lib/traject.rb +5 -0
  12. data/lib/traject/array_writer.rb +34 -0
  13. data/lib/traject/command_line.rb +18 -22
  14. data/lib/traject/debug_writer.rb +2 -5
  15. data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
  16. data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
  17. data/lib/traject/indexer.rb +321 -92
  18. data/lib/traject/indexer/context.rb +39 -13
  19. data/lib/traject/indexer/marc_indexer.rb +30 -0
  20. data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
  21. data/lib/traject/indexer/settings.rb +36 -53
  22. data/lib/traject/indexer/step.rb +27 -33
  23. data/lib/traject/macros/marc21.rb +37 -12
  24. data/lib/traject/macros/nokogiri_macros.rb +43 -0
  25. data/lib/traject/macros/transformation.rb +162 -0
  26. data/lib/traject/marc_extractor.rb +2 -0
  27. data/lib/traject/ndj_reader.rb +1 -1
  28. data/lib/traject/nokogiri_reader.rb +179 -0
  29. data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
  30. data/lib/traject/solr_json_writer.rb +19 -12
  31. data/lib/traject/thread_pool.rb +13 -0
  32. data/lib/traject/util.rb +14 -2
  33. data/lib/traject/version.rb +1 -1
  34. data/test/debug_writer_test.rb +3 -3
  35. data/test/delimited_writer_test.rb +3 -3
  36. data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
  37. data/test/indexer/context_test.rb +23 -13
  38. data/test/indexer/error_handler_test.rb +59 -0
  39. data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
  40. data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
  41. data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
  42. data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
  43. data/test/indexer/macros/to_field_test.rb +2 -2
  44. data/test/indexer/macros/transformation_test.rb +177 -0
  45. data/test/indexer/map_record_test.rb +2 -3
  46. data/test/indexer/nokogiri_indexer_test.rb +103 -0
  47. data/test/indexer/process_record_test.rb +55 -0
  48. data/test/indexer/process_with_test.rb +148 -0
  49. data/test/indexer/read_write_test.rb +52 -2
  50. data/test/indexer/settings_test.rb +34 -24
  51. data/test/indexer/to_field_test.rb +27 -2
  52. data/test/marc_extractor_test.rb +7 -7
  53. data/test/marc_reader_test.rb +4 -4
  54. data/test/nokogiri_reader_test.rb +158 -0
  55. data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
  56. data/test/solr_json_writer_test.rb +24 -28
  57. data/test/test_helper.rb +8 -2
  58. data/test/test_support/namespace-test.xml +7 -0
  59. data/test/test_support/nokogiri_demo_config.rb +17 -0
  60. data/test/test_support/oai-pmh-one-record-2.xml +24 -0
  61. data/test/test_support/oai-pmh-one-record-first.xml +24 -0
  62. data/test/test_support/sample-oai-no-namespace.xml +197 -0
  63. data/test/test_support/sample-oai-pmh.xml +197 -0
  64. data/test/thread_pool_test.rb +38 -0
  65. data/test/translation_map_test.rb +3 -3
  66. data/test/translation_maps/ruby_map.rb +2 -1
  67. data/test/translation_maps/yaml_map.yaml +2 -1
  68. data/traject.gemspec +4 -11
  69. metadata +92 -6
@@ -0,0 +1,177 @@
1
+ # Encoding: UTF-8
2
+
3
+ require 'test_helper'
4
+ require 'traject/indexer'
5
+
6
+ # should be built into every indexer
7
+ describe "Traject::Macros::Transformation" do
8
+ before do
9
+ @indexer = Traject::Indexer.new
10
+ @record = nil
11
+ end
12
+
13
+ describe "translation_map" do
14
+ it "translates" do
15
+ @indexer.configure do
16
+ to_field "cataloging_agency", literal("DLC"), translation_map("marc_040a_translate_test")
17
+ end
18
+ output = @indexer.map_record(@record)
19
+ assert_equal ["Library of Congress"], output["cataloging_agency"]
20
+ end
21
+
22
+ it "can merge multiple" do
23
+ @indexer.configure do
24
+ to_field "result", literal("key_to_be_overridden"), translation_map("ruby_map", "yaml_map")
25
+ end
26
+ output = @indexer.map_record(@record)
27
+ assert_equal ["value_from_yaml"], output["result"]
28
+ end
29
+
30
+ it "can merge multiple with hash" do
31
+ @indexer.configure do
32
+ to_field "result", literal("key_to_be_overridden"), translation_map("ruby_map", "yaml_map", {"key_to_be_overridden" => "value_from_inline_hash"})
33
+ end
34
+ output = @indexer.map_record(@record)
35
+ assert_equal ["value_from_inline_hash"], output["result"]
36
+ end
37
+ end
38
+
39
+ describe "transform" do
40
+ it "transforms with block" do
41
+ @indexer.configure do
42
+ to_field "sample_field", literal("one"), literal("two"), transform(&:upcase)
43
+ end
44
+ output = @indexer.map_record(@record)
45
+ assert_equal ["ONE", "TWO"], output["sample_field"]
46
+ end
47
+
48
+ it "transforms with proc arg" do
49
+ @indexer.configure do
50
+ to_field "sample_field", literal("one"), literal("two"), transform(->(val) { val.tr('aeiou', '!') })
51
+ end
52
+ output = @indexer.map_record(@record)
53
+ assert_equal ["!n!", "tw!"], output["sample_field"]
54
+ end
55
+
56
+ it "transforms with both, in correct order" do
57
+ @indexer.configure do
58
+ to_field "sample_field", literal("one"), literal("two"), transform(->(val) { val.tr('aeiou', '!') }, &:upcase)
59
+ end
60
+ output = @indexer.map_record(@record)
61
+ assert_equal ["!N!", "TW!"], output["sample_field"]
62
+ end
63
+ end
64
+
65
+ describe "default" do
66
+ it "adds default to empty accumulator" do
67
+ @indexer.configure do
68
+ to_field "test", default("default")
69
+ end
70
+ output = @indexer.map_record(@record)
71
+ assert_equal ["default"], output["test"]
72
+ end
73
+
74
+ it "does not add default if value present" do
75
+ @indexer.configure do
76
+ to_field "test", literal("value"), default("defaut")
77
+ end
78
+ output = @indexer.map_record(@record)
79
+ assert_equal ["value"], output["test"]
80
+ end
81
+ end
82
+
83
+ describe "first_only" do
84
+ it "takes only first in multi-value" do
85
+ @indexer.configure do
86
+ to_field "test", literal("one"), literal("two"), literal("three"), first_only
87
+ end
88
+ output = @indexer.map_record(@record)
89
+ assert_equal ["one"], output["test"]
90
+ end
91
+
92
+ it "no-ops on nil" do
93
+ @indexer.configure do
94
+ to_field "test", first_only
95
+ end
96
+ output = @indexer.map_record(@record)
97
+ assert_nil output["test"]
98
+ end
99
+
100
+ it "no-ops on single value" do
101
+ @indexer.configure do
102
+ to_field "test", literal("one"), first_only
103
+ end
104
+ output = @indexer.map_record(@record)
105
+ assert_equal ["one"], output["test"]
106
+ end
107
+ end
108
+
109
+ describe "unique" do
110
+ it "uniqs" do
111
+ @indexer.configure do
112
+ to_field "test", literal("one"), literal("two"), literal("one"), literal("three"), unique
113
+ end
114
+ output = @indexer.map_record(@record)
115
+ assert_equal ["one", "two", "three"], output["test"]
116
+ end
117
+ end
118
+
119
+ describe "strip" do
120
+ it "strips" do
121
+ @indexer.configure do
122
+ to_field "test", literal(" one"), literal(" two "), strip
123
+ end
124
+ output = @indexer.map_record(@record)
125
+ assert_equal ["one", "two"], output["test"]
126
+ end
127
+
128
+ it "strips unicode whitespace" do
129
+ @indexer.configure do
130
+ to_field "test", literal(" \u00A0 \u2002 one \u202F "), strip
131
+ end
132
+ output = @indexer.map_record(@record)
133
+ assert_equal ["one"], output["test"]
134
+ end
135
+ end
136
+
137
+ describe "split" do
138
+ it "splits" do
139
+ @indexer.configure do
140
+ to_field "test", literal("one.two"), split(".")
141
+ end
142
+ output = @indexer.map_record(@record)
143
+ assert_equal ["one", "two"], output["test"]
144
+ end
145
+ end
146
+
147
+ describe "append" do
148
+ it "appends suffix" do
149
+ @indexer.configure do
150
+ to_field "test", literal("one"), literal("two"), append(".suffix")
151
+ end
152
+ output = @indexer.map_record(@record)
153
+ assert_equal ["one.suffix", "two.suffix"], output["test"]
154
+ end
155
+ end
156
+
157
+ describe "prepend" do
158
+ it "prepends prefix" do
159
+ @indexer.configure do
160
+ to_field "test", literal("one"), literal("two"), prepend("prefix.")
161
+ end
162
+ output = @indexer.map_record(@record)
163
+ assert_equal ["prefix.one", "prefix.two"], output["test"]
164
+ end
165
+ end
166
+
167
+ describe "gsub" do
168
+ it "gsubs" do
169
+ @indexer.configure do
170
+ to_field "test", literal("one1212two23three"), gsub(/\d+/, ' ')
171
+ end
172
+ output = @indexer.map_record(@record)
173
+ assert_equal ["one two three"], output["test"]
174
+ end
175
+ end
176
+
177
+ end
@@ -196,12 +196,11 @@ describe "Traject::Indexer#map_record" do
196
196
  end
197
197
 
198
198
  @indexer.to_field('afterSkip') do |rec, acc|
199
- acc << "After. Should never happen"
199
+ raise ArgumentError, "intentional, should never happen"
200
200
  end
201
201
 
202
202
  output = @indexer.map_record(@record)
203
- assert_equal ['Before'], output['beforeSkip']
204
- assert_nil output['afterSkip']
203
+ assert_nil output
205
204
  end
206
205
 
207
206
 
@@ -0,0 +1,103 @@
1
+ require 'test_helper'
2
+
3
+ describe "Traject::NokogiriIndexer" do
4
+ before do
5
+ Traject::Indexer.send(:default_settings=, Traject::Indexer.default_settings.merge("solr_writer.thread_pool" => 0, "processing_thread_pool" => 0))
6
+
7
+
8
+ @xml_sample_path = support_file_path("sample-oai-pmh.xml")
9
+ @indexer = Traject::Indexer::NokogiriIndexer.new("writer_class_name" => "Traject::ArrayWriter", "solr_writer.thread_pool" => 0, "processing_thread_pool" => 0)
10
+ @namespaces = {
11
+ "oai" => "http://www.openarchives.org/OAI/2.0/",
12
+ "dc" => "http://purl.org/dc/elements/1.1/",
13
+ "oai_dc" => "http://www.openarchives.org/OAI/2.0/oai_dc/",
14
+ "edm" => "http://www.europeana.eu/schemas/edm/"
15
+ }
16
+ end
17
+
18
+ it "smoke test" do
19
+ namespaces = @namespaces
20
+ @indexer.configure do
21
+ settings do
22
+ provide "nokogiri.namespaces", namespaces
23
+ provide "nokogiri.each_record_xpath", "//oai:record"
24
+ end
25
+ to_field "id", extract_xpath("//oai:metadata/oai_dc:dc/dc:identifier"), first_only
26
+ to_field "title", extract_xpath("//oai:metadata/oai_dc:dc/dc:title")
27
+ end
28
+
29
+ @indexer.process(File.open(@xml_sample_path))
30
+
31
+ results = @indexer.writer.values
32
+
33
+ source_doc = Nokogiri::XML.parse(File.open(@xml_sample_path))
34
+
35
+ assert_equal source_doc.xpath("//oai:record", @namespaces).count, results.count
36
+ assert(results.all? { |hash|
37
+ hash["id"] && hash["id"].length == 1 &&
38
+ hash["title"] && hash["title"].length >= 1
39
+ }, "expected results have expected values")
40
+ end
41
+
42
+ it "namespaces to extract_xpath" do
43
+ namespaces = @namespaces.merge(edm: "http://this.is.wrong")
44
+ @indexer.configure do
45
+ settings do
46
+ provide "nokogiri.namespaces", namespaces
47
+ provide "nokogiri.each_record_xpath", "//oai:record"
48
+ end
49
+ to_field "rights", extract_xpath("//oai:metadata/oai_dc:dc/edm:rights", ns: { edm: "http://www.europeana.eu/schemas/edm/" })
50
+ end
51
+
52
+ @indexer.process(File.open(@xml_sample_path))
53
+
54
+ results = @indexer.writer.values
55
+
56
+ refute_empty results.last["rights"]
57
+ end
58
+
59
+ describe "xpath to non-terminal element" do
60
+ before do
61
+ @xml = <<-EOS
62
+ <record>
63
+ <name>
64
+ <first>José</first>
65
+ <last>Lopez</last>
66
+ </name>
67
+ <name>
68
+ <first>Sue</first>
69
+ <last>Jones</last>
70
+ </name>
71
+ </record>
72
+ EOS
73
+
74
+ @indexer.configure do
75
+ settings do
76
+ provide "nokogiri.each_record_xpath", "//record"
77
+ end
78
+ end
79
+ end
80
+
81
+ it "outputs text" do
82
+ @indexer.configure { to_field "name", extract_xpath("/record/name") }
83
+ @indexer.process(StringIO.new(@xml))
84
+ results = @indexer.writer.values
85
+
86
+ assert_equal( {"name" => ["José Lopez", "Sue Jones"]}, results.first )
87
+ end
88
+
89
+ it "outputs Nokogiri::XML::Element with to_text: false" do
90
+ @indexer.configure { to_field "name", extract_xpath("/record/name", to_text: false) }
91
+ @indexer.process(StringIO.new(@xml))
92
+ results = @indexer.writer.values
93
+
94
+ values = results.first["name"]
95
+
96
+ assert(values.each { |result|
97
+ result["name"].kind_of?(Nokogiri::XML::Element) &&
98
+ result["name"].name == "name"
99
+ })
100
+ end
101
+
102
+ end
103
+ end
@@ -0,0 +1,55 @@
1
+ require 'test_helper'
2
+
3
+ describe "Traject::Indexer#process_record" do
4
+ before do
5
+ @writer = Traject::ArrayWriter.new
6
+ @indexer = Traject::Indexer.new(writer: @writer) do
7
+ to_field "record", lambda { |rec, acc| acc << rec }
8
+ end
9
+ @record = {key: "value"}
10
+ end
11
+
12
+ it "sends to writer" do
13
+ @indexer.process_record(@record)
14
+ assert_equal [{"record" => [@record] }], @writer.values
15
+ end
16
+
17
+ it "returns context" do
18
+ context = @indexer.process_record(@record)
19
+ assert context.is_a?(Traject::Indexer::Context)
20
+ assert_equal @record, context.source_record
21
+ end
22
+
23
+ it "skips if skipped" do
24
+ @indexer = Traject::Indexer.new(writer: @writer) do
25
+ to_field "record", lambda { |rec, acc, context| acc << rec; context.skip! }
26
+ end
27
+ context = @indexer.process_record(@record)
28
+
29
+ assert context.skip?
30
+ assert_equal [], @writer.values
31
+ end
32
+
33
+ it "raises exceptions out" do
34
+ @indexer = Traject::Indexer.new(writer: @writer) do
35
+ to_field "record", lambda { |rec, acc, context| acc << rec; raise ArgumentError, "intentional" }
36
+ end
37
+ assert_raises(ArgumentError) do
38
+ @indexer.process_record(@record)
39
+ end
40
+ end
41
+
42
+ it "aliases <<" do
43
+ assert_equal @indexer.method(:process_record), @indexer.method(:<<)
44
+
45
+ @indexer << @record
46
+ end
47
+
48
+ it "raises on completed indexer" do
49
+ @indexer.complete
50
+ assert_raises Traject::Indexer::CompletedStateError do
51
+ @indexer.process_record(@record)
52
+ end
53
+ end
54
+
55
+ end
@@ -0,0 +1,148 @@
1
+ require 'test_helper'
2
+
3
+ describe "Traject::Indexer#process_with" do
4
+ let(:input_records) { [
5
+ { one: "one" },
6
+ { two: "two" },
7
+ { three: "three" }
8
+ ] }
9
+ let(:array_writer) { Traject::ArrayWriter.new }
10
+ let(:indexer) {
11
+ Traject::Indexer.new do
12
+ to_field "records", lambda { |rec, acc|
13
+ acc << rec
14
+ }
15
+ end
16
+ }
17
+
18
+ it "processes" do
19
+ writer = indexer.process_with(input_records, array_writer)
20
+ assert_equal([{"records"=>[{:one=>"one"}]}, {"records"=>[{:two=>"two"}]}, {"records"=>[{:three=>"three"}]}], writer.values)
21
+ end
22
+
23
+ describe "calls close" do
24
+ before do
25
+ array_writer.extend(Module.new do
26
+ def close
27
+ @close_called = true
28
+ end
29
+ def close_called?
30
+ @close_called
31
+ end
32
+ end)
33
+ end
34
+
35
+ it "calls by default" do
36
+ writer = indexer.process_with(input_records, array_writer)
37
+ assert writer.close_called?
38
+ end
39
+
40
+ it "does not call if told not to" do
41
+ writer = indexer.process_with(input_records, array_writer, close_writer: false)
42
+ assert ! writer.close_called?
43
+ end
44
+ end
45
+
46
+ describe "after_processing steps" do
47
+ let(:indexer) {
48
+ Traject::Indexer.new do
49
+ after_processing do
50
+ raise "Don't call me"
51
+ end
52
+ end
53
+ }
54
+ it "are not called" do
55
+ # should not raise
56
+ indexer.process_with(input_records, array_writer)
57
+ end
58
+ end
59
+
60
+ describe "with block as destination" do
61
+ it "calls block for each record" do
62
+ received = []
63
+ indexer.process_with(input_records) do |context|
64
+ received << context
65
+ end
66
+
67
+ assert_equal 3, received.length
68
+ assert received.all? { |o| o.kind_of?(Traject::Indexer::Context)}
69
+ assert_equal input_records.collect { |r| [r] }, received.collect { |c| c.output_hash["records"] }
70
+ end
71
+ end
72
+
73
+ describe "exceptions" do
74
+ let(:indexer) {
75
+ Traject::Indexer.new do
76
+ to_field "foo", lambda { |rec, acc|
77
+ if rec.keys.include?(:one)
78
+ raise ArgumentError, "intentional"
79
+ end
80
+
81
+ acc << rec
82
+ }
83
+ end
84
+ }
85
+
86
+ describe "by default" do
87
+ it "raises" do
88
+ assert_raises(ArgumentError) do
89
+ indexer.process_with(input_records, array_writer)
90
+ end
91
+ end
92
+ end
93
+
94
+ describe "with rescue_with" do
95
+ it "calls block and keeps processing" do
96
+ rescued = []
97
+ rescue_lambda = lambda do |context, exception|
98
+ rescued << {
99
+ context: context,
100
+ exception: exception
101
+ }
102
+ end
103
+
104
+ _writer = indexer.process_with(input_records, array_writer, rescue_with: rescue_lambda)
105
+
106
+ # not including the one that raised
107
+ assert_equal 2, array_writer.contexts.length
108
+ # and the rescue_with lambda was called once, for the record that raised
109
+
110
+ assert_equal 1, rescued.length
111
+ assert rescued.first[:context].is_a?(Traject::Indexer::Context)
112
+ assert_equal ArgumentError, rescued.first[:exception].class
113
+ assert_equal "intentional", rescued.first[:exception].message
114
+ end
115
+
116
+ it "can raise from rescue" do
117
+ rescue_lambda = lambda do |context, exception|
118
+ raise exception
119
+ end
120
+
121
+ assert_raises(ArgumentError) do
122
+ indexer.process_with(input_records, array_writer, rescue: rescue_lambda)
123
+ end
124
+ end
125
+ end
126
+
127
+ describe "skipped records" do
128
+ let(:indexer) {
129
+ Traject::Indexer.new do
130
+ to_field "foo", literal("value")
131
+ each_record do |record, context|
132
+ context.skip!
133
+ end
134
+ end
135
+ }
136
+ it "calls on_skipped, does not send to writer" do
137
+ skip_calls = []
138
+ on_skipped = lambda { |*args| skip_calls << args }
139
+
140
+ writer = indexer.process_with(input_records, array_writer, on_skipped: on_skipped)
141
+
142
+ assert_equal writer.values, [], "nothing sent to writer"
143
+ assert_equal input_records.count, skip_calls.count, "skip proc called"
144
+ assert skip_calls.all? {|a| a.length == 1 && a[0].kind_of?(Traject::Indexer::Context) }, "skip proc called with single arg"
145
+ end
146
+ end
147
+ end
148
+ end