traject 2.3.4 → 3.0.0.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +16 -9
  3. data/CHANGES.md +74 -1
  4. data/Gemfile +2 -1
  5. data/README.md +104 -53
  6. data/Rakefile +8 -1
  7. data/doc/indexing_rules.md +79 -63
  8. data/doc/programmatic_use.md +218 -0
  9. data/doc/settings.md +28 -1
  10. data/doc/xml.md +134 -0
  11. data/lib/traject.rb +5 -0
  12. data/lib/traject/array_writer.rb +34 -0
  13. data/lib/traject/command_line.rb +18 -22
  14. data/lib/traject/debug_writer.rb +2 -5
  15. data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
  16. data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
  17. data/lib/traject/indexer.rb +321 -92
  18. data/lib/traject/indexer/context.rb +39 -13
  19. data/lib/traject/indexer/marc_indexer.rb +30 -0
  20. data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
  21. data/lib/traject/indexer/settings.rb +36 -53
  22. data/lib/traject/indexer/step.rb +27 -33
  23. data/lib/traject/macros/marc21.rb +37 -12
  24. data/lib/traject/macros/nokogiri_macros.rb +43 -0
  25. data/lib/traject/macros/transformation.rb +162 -0
  26. data/lib/traject/marc_extractor.rb +2 -0
  27. data/lib/traject/ndj_reader.rb +1 -1
  28. data/lib/traject/nokogiri_reader.rb +179 -0
  29. data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
  30. data/lib/traject/solr_json_writer.rb +19 -12
  31. data/lib/traject/thread_pool.rb +13 -0
  32. data/lib/traject/util.rb +14 -2
  33. data/lib/traject/version.rb +1 -1
  34. data/test/debug_writer_test.rb +3 -3
  35. data/test/delimited_writer_test.rb +3 -3
  36. data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
  37. data/test/indexer/context_test.rb +23 -13
  38. data/test/indexer/error_handler_test.rb +59 -0
  39. data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
  40. data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
  41. data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
  42. data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
  43. data/test/indexer/macros/to_field_test.rb +2 -2
  44. data/test/indexer/macros/transformation_test.rb +177 -0
  45. data/test/indexer/map_record_test.rb +2 -3
  46. data/test/indexer/nokogiri_indexer_test.rb +103 -0
  47. data/test/indexer/process_record_test.rb +55 -0
  48. data/test/indexer/process_with_test.rb +148 -0
  49. data/test/indexer/read_write_test.rb +52 -2
  50. data/test/indexer/settings_test.rb +34 -24
  51. data/test/indexer/to_field_test.rb +27 -2
  52. data/test/marc_extractor_test.rb +7 -7
  53. data/test/marc_reader_test.rb +4 -4
  54. data/test/nokogiri_reader_test.rb +158 -0
  55. data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
  56. data/test/solr_json_writer_test.rb +24 -28
  57. data/test/test_helper.rb +8 -2
  58. data/test/test_support/namespace-test.xml +7 -0
  59. data/test/test_support/nokogiri_demo_config.rb +17 -0
  60. data/test/test_support/oai-pmh-one-record-2.xml +24 -0
  61. data/test/test_support/oai-pmh-one-record-first.xml +24 -0
  62. data/test/test_support/sample-oai-no-namespace.xml +197 -0
  63. data/test/test_support/sample-oai-pmh.xml +197 -0
  64. data/test/thread_pool_test.rb +38 -0
  65. data/test/translation_map_test.rb +3 -3
  66. data/test/translation_maps/ruby_map.rb +2 -1
  67. data/test/translation_maps/yaml_map.yaml +2 -1
  68. data/traject.gemspec +4 -11
  69. metadata +92 -6
@@ -0,0 +1,177 @@
1
+ # Encoding: UTF-8
2
+
3
+ require 'test_helper'
4
+ require 'traject/indexer'
5
+
6
+ # should be built into every indexer
7
+ describe "Traject::Macros::Transformation" do
8
+ before do
9
+ @indexer = Traject::Indexer.new
10
+ @record = nil
11
+ end
12
+
13
+ describe "translation_map" do
14
+ it "translates" do
15
+ @indexer.configure do
16
+ to_field "cataloging_agency", literal("DLC"), translation_map("marc_040a_translate_test")
17
+ end
18
+ output = @indexer.map_record(@record)
19
+ assert_equal ["Library of Congress"], output["cataloging_agency"]
20
+ end
21
+
22
+ it "can merge multiple" do
23
+ @indexer.configure do
24
+ to_field "result", literal("key_to_be_overridden"), translation_map("ruby_map", "yaml_map")
25
+ end
26
+ output = @indexer.map_record(@record)
27
+ assert_equal ["value_from_yaml"], output["result"]
28
+ end
29
+
30
+ it "can merge multiple with hash" do
31
+ @indexer.configure do
32
+ to_field "result", literal("key_to_be_overridden"), translation_map("ruby_map", "yaml_map", {"key_to_be_overridden" => "value_from_inline_hash"})
33
+ end
34
+ output = @indexer.map_record(@record)
35
+ assert_equal ["value_from_inline_hash"], output["result"]
36
+ end
37
+ end
38
+
39
+ describe "transform" do
40
+ it "transforms with block" do
41
+ @indexer.configure do
42
+ to_field "sample_field", literal("one"), literal("two"), transform(&:upcase)
43
+ end
44
+ output = @indexer.map_record(@record)
45
+ assert_equal ["ONE", "TWO"], output["sample_field"]
46
+ end
47
+
48
+ it "transforms with proc arg" do
49
+ @indexer.configure do
50
+ to_field "sample_field", literal("one"), literal("two"), transform(->(val) { val.tr('aeiou', '!') })
51
+ end
52
+ output = @indexer.map_record(@record)
53
+ assert_equal ["!n!", "tw!"], output["sample_field"]
54
+ end
55
+
56
+ it "transforms with both, in correct order" do
57
+ @indexer.configure do
58
+ to_field "sample_field", literal("one"), literal("two"), transform(->(val) { val.tr('aeiou', '!') }, &:upcase)
59
+ end
60
+ output = @indexer.map_record(@record)
61
+ assert_equal ["!N!", "TW!"], output["sample_field"]
62
+ end
63
+ end
64
+
65
+ describe "default" do
66
+ it "adds default to empty accumulator" do
67
+ @indexer.configure do
68
+ to_field "test", default("default")
69
+ end
70
+ output = @indexer.map_record(@record)
71
+ assert_equal ["default"], output["test"]
72
+ end
73
+
74
+ it "does not add default if value present" do
75
+ @indexer.configure do
76
+ to_field "test", literal("value"), default("defaut")
77
+ end
78
+ output = @indexer.map_record(@record)
79
+ assert_equal ["value"], output["test"]
80
+ end
81
+ end
82
+
83
+ describe "first_only" do
84
+ it "takes only first in multi-value" do
85
+ @indexer.configure do
86
+ to_field "test", literal("one"), literal("two"), literal("three"), first_only
87
+ end
88
+ output = @indexer.map_record(@record)
89
+ assert_equal ["one"], output["test"]
90
+ end
91
+
92
+ it "no-ops on nil" do
93
+ @indexer.configure do
94
+ to_field "test", first_only
95
+ end
96
+ output = @indexer.map_record(@record)
97
+ assert_nil output["test"]
98
+ end
99
+
100
+ it "no-ops on single value" do
101
+ @indexer.configure do
102
+ to_field "test", literal("one"), first_only
103
+ end
104
+ output = @indexer.map_record(@record)
105
+ assert_equal ["one"], output["test"]
106
+ end
107
+ end
108
+
109
+ describe "unique" do
110
+ it "uniqs" do
111
+ @indexer.configure do
112
+ to_field "test", literal("one"), literal("two"), literal("one"), literal("three"), unique
113
+ end
114
+ output = @indexer.map_record(@record)
115
+ assert_equal ["one", "two", "three"], output["test"]
116
+ end
117
+ end
118
+
119
+ describe "strip" do
120
+ it "strips" do
121
+ @indexer.configure do
122
+ to_field "test", literal(" one"), literal(" two "), strip
123
+ end
124
+ output = @indexer.map_record(@record)
125
+ assert_equal ["one", "two"], output["test"]
126
+ end
127
+
128
+ it "strips unicode whitespace" do
129
+ @indexer.configure do
130
+ to_field "test", literal(" \u00A0 \u2002 one \u202F "), strip
131
+ end
132
+ output = @indexer.map_record(@record)
133
+ assert_equal ["one"], output["test"]
134
+ end
135
+ end
136
+
137
+ describe "split" do
138
+ it "splits" do
139
+ @indexer.configure do
140
+ to_field "test", literal("one.two"), split(".")
141
+ end
142
+ output = @indexer.map_record(@record)
143
+ assert_equal ["one", "two"], output["test"]
144
+ end
145
+ end
146
+
147
+ describe "append" do
148
+ it "appends suffix" do
149
+ @indexer.configure do
150
+ to_field "test", literal("one"), literal("two"), append(".suffix")
151
+ end
152
+ output = @indexer.map_record(@record)
153
+ assert_equal ["one.suffix", "two.suffix"], output["test"]
154
+ end
155
+ end
156
+
157
+ describe "prepend" do
158
+ it "prepends prefix" do
159
+ @indexer.configure do
160
+ to_field "test", literal("one"), literal("two"), prepend("prefix.")
161
+ end
162
+ output = @indexer.map_record(@record)
163
+ assert_equal ["prefix.one", "prefix.two"], output["test"]
164
+ end
165
+ end
166
+
167
+ describe "gsub" do
168
+ it "gsubs" do
169
+ @indexer.configure do
170
+ to_field "test", literal("one1212two23three"), gsub(/\d+/, ' ')
171
+ end
172
+ output = @indexer.map_record(@record)
173
+ assert_equal ["one two three"], output["test"]
174
+ end
175
+ end
176
+
177
+ end
@@ -196,12 +196,11 @@ describe "Traject::Indexer#map_record" do
196
196
  end
197
197
 
198
198
  @indexer.to_field('afterSkip') do |rec, acc|
199
- acc << "After. Should never happen"
199
+ raise ArgumentError, "intentional, should never happen"
200
200
  end
201
201
 
202
202
  output = @indexer.map_record(@record)
203
- assert_equal ['Before'], output['beforeSkip']
204
- assert_nil output['afterSkip']
203
+ assert_nil output
205
204
  end
206
205
 
207
206
 
@@ -0,0 +1,103 @@
1
+ require 'test_helper'
2
+
3
+ describe "Traject::NokogiriIndexer" do
4
+ before do
5
+ Traject::Indexer.send(:default_settings=, Traject::Indexer.default_settings.merge("solr_writer.thread_pool" => 0, "processing_thread_pool" => 0))
6
+
7
+
8
+ @xml_sample_path = support_file_path("sample-oai-pmh.xml")
9
+ @indexer = Traject::Indexer::NokogiriIndexer.new("writer_class_name" => "Traject::ArrayWriter", "solr_writer.thread_pool" => 0, "processing_thread_pool" => 0)
10
+ @namespaces = {
11
+ "oai" => "http://www.openarchives.org/OAI/2.0/",
12
+ "dc" => "http://purl.org/dc/elements/1.1/",
13
+ "oai_dc" => "http://www.openarchives.org/OAI/2.0/oai_dc/",
14
+ "edm" => "http://www.europeana.eu/schemas/edm/"
15
+ }
16
+ end
17
+
18
+ it "smoke test" do
19
+ namespaces = @namespaces
20
+ @indexer.configure do
21
+ settings do
22
+ provide "nokogiri.namespaces", namespaces
23
+ provide "nokogiri.each_record_xpath", "//oai:record"
24
+ end
25
+ to_field "id", extract_xpath("//oai:metadata/oai_dc:dc/dc:identifier"), first_only
26
+ to_field "title", extract_xpath("//oai:metadata/oai_dc:dc/dc:title")
27
+ end
28
+
29
+ @indexer.process(File.open(@xml_sample_path))
30
+
31
+ results = @indexer.writer.values
32
+
33
+ source_doc = Nokogiri::XML.parse(File.open(@xml_sample_path))
34
+
35
+ assert_equal source_doc.xpath("//oai:record", @namespaces).count, results.count
36
+ assert(results.all? { |hash|
37
+ hash["id"] && hash["id"].length == 1 &&
38
+ hash["title"] && hash["title"].length >= 1
39
+ }, "expected results have expected values")
40
+ end
41
+
42
+ it "namespaces to extract_xpath" do
43
+ namespaces = @namespaces.merge(edm: "http://this.is.wrong")
44
+ @indexer.configure do
45
+ settings do
46
+ provide "nokogiri.namespaces", namespaces
47
+ provide "nokogiri.each_record_xpath", "//oai:record"
48
+ end
49
+ to_field "rights", extract_xpath("//oai:metadata/oai_dc:dc/edm:rights", ns: { edm: "http://www.europeana.eu/schemas/edm/" })
50
+ end
51
+
52
+ @indexer.process(File.open(@xml_sample_path))
53
+
54
+ results = @indexer.writer.values
55
+
56
+ refute_empty results.last["rights"]
57
+ end
58
+
59
+ describe "xpath to non-terminal element" do
60
+ before do
61
+ @xml = <<-EOS
62
+ <record>
63
+ <name>
64
+ <first>José</first>
65
+ <last>Lopez</last>
66
+ </name>
67
+ <name>
68
+ <first>Sue</first>
69
+ <last>Jones</last>
70
+ </name>
71
+ </record>
72
+ EOS
73
+
74
+ @indexer.configure do
75
+ settings do
76
+ provide "nokogiri.each_record_xpath", "//record"
77
+ end
78
+ end
79
+ end
80
+
81
+ it "outputs text" do
82
+ @indexer.configure { to_field "name", extract_xpath("/record/name") }
83
+ @indexer.process(StringIO.new(@xml))
84
+ results = @indexer.writer.values
85
+
86
+ assert_equal( {"name" => ["José Lopez", "Sue Jones"]}, results.first )
87
+ end
88
+
89
+ it "outputs Nokogiri::XML::Element with to_text: false" do
90
+ @indexer.configure { to_field "name", extract_xpath("/record/name", to_text: false) }
91
+ @indexer.process(StringIO.new(@xml))
92
+ results = @indexer.writer.values
93
+
94
+ values = results.first["name"]
95
+
96
+ assert(values.each { |result|
97
+ result["name"].kind_of?(Nokogiri::XML::Element) &&
98
+ result["name"].name == "name"
99
+ })
100
+ end
101
+
102
+ end
103
+ end
@@ -0,0 +1,55 @@
1
+ require 'test_helper'
2
+
3
+ describe "Traject::Indexer#process_record" do
4
+ before do
5
+ @writer = Traject::ArrayWriter.new
6
+ @indexer = Traject::Indexer.new(writer: @writer) do
7
+ to_field "record", lambda { |rec, acc| acc << rec }
8
+ end
9
+ @record = {key: "value"}
10
+ end
11
+
12
+ it "sends to writer" do
13
+ @indexer.process_record(@record)
14
+ assert_equal [{"record" => [@record] }], @writer.values
15
+ end
16
+
17
+ it "returns context" do
18
+ context = @indexer.process_record(@record)
19
+ assert context.is_a?(Traject::Indexer::Context)
20
+ assert_equal @record, context.source_record
21
+ end
22
+
23
+ it "skips if skipped" do
24
+ @indexer = Traject::Indexer.new(writer: @writer) do
25
+ to_field "record", lambda { |rec, acc, context| acc << rec; context.skip! }
26
+ end
27
+ context = @indexer.process_record(@record)
28
+
29
+ assert context.skip?
30
+ assert_equal [], @writer.values
31
+ end
32
+
33
+ it "raises exceptions out" do
34
+ @indexer = Traject::Indexer.new(writer: @writer) do
35
+ to_field "record", lambda { |rec, acc, context| acc << rec; raise ArgumentError, "intentional" }
36
+ end
37
+ assert_raises(ArgumentError) do
38
+ @indexer.process_record(@record)
39
+ end
40
+ end
41
+
42
+ it "aliases <<" do
43
+ assert_equal @indexer.method(:process_record), @indexer.method(:<<)
44
+
45
+ @indexer << @record
46
+ end
47
+
48
+ it "raises on completed indexer" do
49
+ @indexer.complete
50
+ assert_raises Traject::Indexer::CompletedStateError do
51
+ @indexer.process_record(@record)
52
+ end
53
+ end
54
+
55
+ end
@@ -0,0 +1,148 @@
1
+ require 'test_helper'
2
+
3
+ describe "Traject::Indexer#process_with" do
4
+ let(:input_records) { [
5
+ { one: "one" },
6
+ { two: "two" },
7
+ { three: "three" }
8
+ ] }
9
+ let(:array_writer) { Traject::ArrayWriter.new }
10
+ let(:indexer) {
11
+ Traject::Indexer.new do
12
+ to_field "records", lambda { |rec, acc|
13
+ acc << rec
14
+ }
15
+ end
16
+ }
17
+
18
+ it "processes" do
19
+ writer = indexer.process_with(input_records, array_writer)
20
+ assert_equal([{"records"=>[{:one=>"one"}]}, {"records"=>[{:two=>"two"}]}, {"records"=>[{:three=>"three"}]}], writer.values)
21
+ end
22
+
23
+ describe "calls close" do
24
+ before do
25
+ array_writer.extend(Module.new do
26
+ def close
27
+ @close_called = true
28
+ end
29
+ def close_called?
30
+ @close_called
31
+ end
32
+ end)
33
+ end
34
+
35
+ it "calls by default" do
36
+ writer = indexer.process_with(input_records, array_writer)
37
+ assert writer.close_called?
38
+ end
39
+
40
+ it "does not call if told not to" do
41
+ writer = indexer.process_with(input_records, array_writer, close_writer: false)
42
+ assert ! writer.close_called?
43
+ end
44
+ end
45
+
46
+ describe "after_processing steps" do
47
+ let(:indexer) {
48
+ Traject::Indexer.new do
49
+ after_processing do
50
+ raise "Don't call me"
51
+ end
52
+ end
53
+ }
54
+ it "are not called" do
55
+ # should not raise
56
+ indexer.process_with(input_records, array_writer)
57
+ end
58
+ end
59
+
60
+ describe "with block as destination" do
61
+ it "calls block for each record" do
62
+ received = []
63
+ indexer.process_with(input_records) do |context|
64
+ received << context
65
+ end
66
+
67
+ assert_equal 3, received.length
68
+ assert received.all? { |o| o.kind_of?(Traject::Indexer::Context)}
69
+ assert_equal input_records.collect { |r| [r] }, received.collect { |c| c.output_hash["records"] }
70
+ end
71
+ end
72
+
73
+ describe "exceptions" do
74
+ let(:indexer) {
75
+ Traject::Indexer.new do
76
+ to_field "foo", lambda { |rec, acc|
77
+ if rec.keys.include?(:one)
78
+ raise ArgumentError, "intentional"
79
+ end
80
+
81
+ acc << rec
82
+ }
83
+ end
84
+ }
85
+
86
+ describe "by default" do
87
+ it "raises" do
88
+ assert_raises(ArgumentError) do
89
+ indexer.process_with(input_records, array_writer)
90
+ end
91
+ end
92
+ end
93
+
94
+ describe "with rescue_with" do
95
+ it "calls block and keeps processing" do
96
+ rescued = []
97
+ rescue_lambda = lambda do |context, exception|
98
+ rescued << {
99
+ context: context,
100
+ exception: exception
101
+ }
102
+ end
103
+
104
+ _writer = indexer.process_with(input_records, array_writer, rescue_with: rescue_lambda)
105
+
106
+ # not including the one that raised
107
+ assert_equal 2, array_writer.contexts.length
108
+ # and raise was called
109
+
110
+ assert_equal 1, rescued.length
111
+ assert rescued.first[:context].is_a?(Traject::Indexer::Context)
112
+ assert_equal ArgumentError, rescued.first[:exception].class
113
+ assert_equal "intentional", rescued.first[:exception].message
114
+ end
115
+
116
+ it "can raise from rescue" do
117
+ rescue_lambda = lambda do |context, exception|
118
+ raise exception
119
+ end
120
+
121
+ assert_raises(ArgumentError) do
122
+ indexer.process_with(input_records, array_writer, rescue: rescue_lambda)
123
+ end
124
+ end
125
+ end
126
+
127
+ describe "skipped records" do
128
+ let(:indexer) {
129
+ Traject::Indexer.new do
130
+ to_field "foo", literal("value")
131
+ each_record do |record, context|
132
+ context.skip!
133
+ end
134
+ end
135
+ }
136
+ it "calls on_skipped, does not send to writer" do
137
+ skip_calls = []
138
+ on_skipped = lambda { |*args| skip_calls << args }
139
+
140
+ writer = indexer.process_with(input_records, array_writer, on_skipped: on_skipped)
141
+
142
+ assert_equal writer.values, [], "nothing sent to writer"
143
+ assert_equal input_records.count, skip_calls.count, "skip proc called"
144
+ assert skip_calls.all? {|a| a.length == 1 && a[0].kind_of?(Traject::Indexer::Context) }, "skip proc called with single arg"
145
+ end
146
+ end
147
+ end
148
+ end