traject 2.3.4 → 3.0.0.alpha.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +16 -9
  3. data/CHANGES.md +74 -1
  4. data/Gemfile +2 -1
  5. data/README.md +104 -53
  6. data/Rakefile +8 -1
  7. data/doc/indexing_rules.md +79 -63
  8. data/doc/programmatic_use.md +218 -0
  9. data/doc/settings.md +28 -1
  10. data/doc/xml.md +134 -0
  11. data/lib/traject.rb +5 -0
  12. data/lib/traject/array_writer.rb +34 -0
  13. data/lib/traject/command_line.rb +18 -22
  14. data/lib/traject/debug_writer.rb +2 -5
  15. data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
  16. data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
  17. data/lib/traject/indexer.rb +321 -92
  18. data/lib/traject/indexer/context.rb +39 -13
  19. data/lib/traject/indexer/marc_indexer.rb +30 -0
  20. data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
  21. data/lib/traject/indexer/settings.rb +36 -53
  22. data/lib/traject/indexer/step.rb +27 -33
  23. data/lib/traject/macros/marc21.rb +37 -12
  24. data/lib/traject/macros/nokogiri_macros.rb +43 -0
  25. data/lib/traject/macros/transformation.rb +162 -0
  26. data/lib/traject/marc_extractor.rb +2 -0
  27. data/lib/traject/ndj_reader.rb +1 -1
  28. data/lib/traject/nokogiri_reader.rb +179 -0
  29. data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
  30. data/lib/traject/solr_json_writer.rb +19 -12
  31. data/lib/traject/thread_pool.rb +13 -0
  32. data/lib/traject/util.rb +14 -2
  33. data/lib/traject/version.rb +1 -1
  34. data/test/debug_writer_test.rb +3 -3
  35. data/test/delimited_writer_test.rb +3 -3
  36. data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
  37. data/test/indexer/context_test.rb +23 -13
  38. data/test/indexer/error_handler_test.rb +59 -0
  39. data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
  40. data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
  41. data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
  42. data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
  43. data/test/indexer/macros/to_field_test.rb +2 -2
  44. data/test/indexer/macros/transformation_test.rb +177 -0
  45. data/test/indexer/map_record_test.rb +2 -3
  46. data/test/indexer/nokogiri_indexer_test.rb +103 -0
  47. data/test/indexer/process_record_test.rb +55 -0
  48. data/test/indexer/process_with_test.rb +148 -0
  49. data/test/indexer/read_write_test.rb +52 -2
  50. data/test/indexer/settings_test.rb +34 -24
  51. data/test/indexer/to_field_test.rb +27 -2
  52. data/test/marc_extractor_test.rb +7 -7
  53. data/test/marc_reader_test.rb +4 -4
  54. data/test/nokogiri_reader_test.rb +158 -0
  55. data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
  56. data/test/solr_json_writer_test.rb +24 -28
  57. data/test/test_helper.rb +8 -2
  58. data/test/test_support/namespace-test.xml +7 -0
  59. data/test/test_support/nokogiri_demo_config.rb +17 -0
  60. data/test/test_support/oai-pmh-one-record-2.xml +24 -0
  61. data/test/test_support/oai-pmh-one-record-first.xml +24 -0
  62. data/test/test_support/sample-oai-no-namespace.xml +197 -0
  63. data/test/test_support/sample-oai-pmh.xml +197 -0
  64. data/test/thread_pool_test.rb +38 -0
  65. data/test/translation_map_test.rb +3 -3
  66. data/test/translation_maps/ruby_map.rb +2 -1
  67. data/test/translation_maps/yaml_map.yaml +2 -1
  68. data/traject.gemspec +4 -11
  69. metadata +92 -6
@@ -47,6 +47,8 @@ require 'concurrent' # for atomic_fixnum
47
47
  class Traject::SolrJsonWriter
48
48
  include Traject::QualifiedConstGet
49
49
 
50
+ URI_REGEXP = URI::Parser.new.make_regexp.freeze
51
+
50
52
  DEFAULT_MAX_SKIPPED = 0
51
53
  DEFAULT_BATCH_SIZE = 100
52
54
 
@@ -105,6 +107,18 @@ class Traject::SolrJsonWriter
105
107
  end
106
108
  end
107
109
 
110
+ # Not part of standard writer API.
111
+ #
112
+ # If we are batching adds, and have some not-yet-written ones queued up --
113
+ # flush em all to solr.
114
+ #
115
+ # This should be thread-safe to call, but the write does take place in
116
+ # the caller's thread, no threading is done for you here, regardless of setting
117
+ # of solr_writer.thread_pool
118
+ def flush
119
+ send_batch( Traject::Util.drain_queue(@batched_queue) )
120
+ end
121
+
108
122
  # Send the given batch of contexts. If something goes wrong, send
109
123
  # them one at a time.
110
124
  # @param [Array<Traject::Indexer::Context>] an array of contexts
@@ -147,7 +161,7 @@ class Traject::SolrJsonWriter
147
161
  else
148
162
  msg = "Solr error response: #{resp.status}: #{resp.body}"
149
163
  end
150
- logger.error "Could not add record #{c.source_record_id} at source file position #{c.position}: #{msg}"
164
+ logger.error "Could not add record #{c.record_inspect}: #{msg}"
151
165
  logger.debug(c.source_record.to_s)
152
166
 
153
167
  @skipped_record_incrementer.increment
@@ -236,7 +250,7 @@ class Traject::SolrJsonWriter
236
250
 
237
251
  # If we've got a solr.update_url, make sure it's ok
238
252
  def check_solr_update_url(url)
239
- unless /^#{URI::regexp}$/.match(url)
253
+ unless /^#{URI_REGEXP}$/.match(url)
240
254
  raise ArgumentError.new("#{self.class.name} setting `solr.update_url` doesn't look like a URL: `#{url}`")
241
255
  end
242
256
  url
@@ -249,18 +263,11 @@ class Traject::SolrJsonWriter
249
263
  end
250
264
 
251
265
  # Not a URL? Bail
252
- unless /^#{URI::regexp}$/.match(url)
266
+ unless /^#{URI_REGEXP}$/.match(url)
253
267
  raise ArgumentError.new("#{self.class.name} setting `solr.url` doesn't look like a URL: `#{url}`")
254
268
  end
255
269
 
256
- # First, try the /update/json handler
257
- candidate = [url.chomp('/'), 'update', 'json'].join('/')
258
- resp = @http_client.get(candidate)
259
- if resp.status == 404
260
- candidate = [url.chomp('/'), 'update'].join('/')
261
- end
262
- candidate
270
+ # Assume the /update/json handler
271
+ return [url.chomp('/'), 'update', 'json'].join('/')
263
272
  end
264
-
265
-
266
273
  end
@@ -50,11 +50,24 @@ module Traject
50
50
  class ThreadPool
51
51
  attr_reader :pool_size, :queue_capacity
52
52
 
53
+ @@disable_concurrency = false
54
+
55
+ # Calling Traject::ThreadPool.disable_concurrency! permanently and irrevocably (for program execution)
56
+ # forces all ThreadPools to have a pool_size of 0 -- running all work inline -- so should disable all
57
+ # use of threads in Traject.
58
+ def self.disable_concurrency! ; @@disable_concurrency = true ; end
59
+ def self.concurrency_disabled? ; @@disable_concurrency ; end
60
+
53
61
  # First arg is pool size, 0 or nil and we'll be a null/no-op pool which executes
54
62
  # work in caller thread.
55
63
  def initialize(pool_size)
56
64
  @thread_pool = nil # assume we don't have one
57
65
  @exceptions_caught_queue = [] # start off without exceptions
66
+
67
+ if self.class.concurrency_disabled?
68
+ pool_size = 0
69
+ end
70
+
58
71
  unless pool_size.nil? || pool_size == 0
59
72
  @pool_size = pool_size.to_i
60
73
  @queue_capacity = pool_size * 3
@@ -60,7 +60,6 @@ module Traject
60
60
  if line.start_with?(file_path)
61
61
  if m = /\A.*\:(\d+)\:in/.match(line)
62
62
  return m[1].to_i
63
- break
64
63
  end
65
64
  end
66
65
  end
@@ -116,11 +115,24 @@ module Traject
116
115
  result << queue.deq(:raise_if_empty)
117
116
  end
118
117
  rescue ThreadError
119
- # Need do nothing, queue was concurrently popped, no biggie
118
+ # Need do nothing, queue was concurrently popped, no biggie, but let's
119
+ # stop iterating and return what we've got.
120
+ return result
120
121
  end
121
122
 
122
123
  return result
123
124
  end
124
125
 
126
+ def self.is_jruby?
127
+ unless defined?(@is_jruby)
128
+ @is_jruby = defined?(JRUBY_VERSION)
129
+ end
130
+ @is_jruby
131
+ end
132
+ # How can we refer to an io object input in logs? For now, if it's a file-like
133
+ # object, we can use #path.
134
+ def self.io_name(io_like_object)
135
+ io_like_object.path if io_like_object.respond_to?(:path)
136
+ end
125
137
  end
126
138
  end
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "2.3.4"
2
+ VERSION = "3.0.0.alpha.1"
3
3
  end
@@ -9,7 +9,7 @@ describe 'Simple output' do
9
9
  before do
10
10
  @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
11
11
  @indexer = Traject::Indexer.new
12
- @indexer.instance_eval do
12
+ @indexer.configure do
13
13
  to_field "id", extract_marc("001", :first => true)
14
14
  to_field "title", extract_marc("245ab")
15
15
  end
@@ -46,7 +46,7 @@ describe 'Simple output' do
46
46
  "record_num_1 title #{@title}",
47
47
  ]
48
48
  assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
49
- assert_match /At least one record \(\#1\) doesn't define field 'id'/, logger_strio.string
49
+ assert_match(/At least one record \(<record #1>\) doesn't define field 'id'/, logger_strio.string)
50
50
  @writer.close
51
51
 
52
52
  end
@@ -68,7 +68,7 @@ describe 'Simple output' do
68
68
  "record_num_1 title #{@title}",
69
69
  ]
70
70
  assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
71
- assert_match /At least one record \(\#1\) doesn't define field 'iden'/, logger_strio.string
71
+ assert_match(/At least one record \(<record #1, output_id:2710183>\) doesn't define field 'iden'/, logger_strio.string)
72
72
  writer.close
73
73
 
74
74
  end
@@ -39,13 +39,13 @@ describe "Delimited/CSV Writers" do
39
39
  end
40
40
 
41
41
  it "outputs a header if asked to" do
42
- dw = Traject::DelimitedWriter.new(@settings)
42
+ Traject::DelimitedWriter.new(@settings)
43
43
  @out.string.chomp.must_equal %w[four one two].join("\t")
44
44
  end
45
45
 
46
46
  it "doesn't output a header if asked not to" do
47
47
  @settings['delimited_writer.header'] = 'false'
48
- dw = Traject::DelimitedWriter.new(@settings)
48
+ Traject::DelimitedWriter.new(@settings)
49
49
  @out.string.must_be_empty
50
50
  end
51
51
 
@@ -69,7 +69,7 @@ describe "Delimited/CSV Writers" do
69
69
  end
70
70
 
71
71
  it "writes the header" do
72
- cw = Traject::CSVWriter.new(@settings)
72
+ Traject::CSVWriter.new(@settings)
73
73
  @out.string.chomp.must_equal 'four,one,two'
74
74
  end
75
75
 
@@ -0,0 +1,169 @@
1
+ require 'test_helper'
2
+ require 'traject/experimental_nokogiri_streaming_reader'
3
+
4
+ # Streaming nokogiri reader is experimental, half-finished, and not supported for real use.
5
+ describe "Traject::ExperimentalNokogiriStreamingReader" do
6
+ describe "with namespaces" do
7
+ before do
8
+ @namespaces = { "oai" => "http://www.openarchives.org/OAI/2.0/" }
9
+ @xml_sample_path = support_file_path("sample-oai-pmh.xml")
10
+ end
11
+
12
+ describe "invalid settings" do
13
+ it "default_namespaces not a hash raises" do
14
+ error = assert_raises(ArgumentError) {
15
+ @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {
16
+ "nokogiri.namespaces" => "i am not a hash",
17
+ })
18
+ }
19
+ assert(error.message =~ /nokogiri.namespaces must be a hash/)
20
+ end
21
+
22
+ it "each_record_xpath with unregistered prefix raises" do
23
+ error = assert_raises(ArgumentError) {
24
+ @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {
25
+ "nokogiri.namespaces" => @namespaces,
26
+ "nokogiri.each_record_xpath" => "//foo:bar"
27
+ })
28
+ }
29
+ assert(error.message =~ %r{Can't find namespace prefix 'foo' in '//foo:bar'})
30
+ end
31
+
32
+ it "raises on some unsupported xpath" do
33
+ error = assert_raises(ArgumentError) {
34
+ @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {
35
+ "nokogiri.namespaces" => @namespaces,
36
+ "nokogiri.each_record_xpath" => "//oai:record[@id='foo']"
37
+ })
38
+ }
39
+ assert(error.message =~ /Only very simple xpaths supported\./)
40
+ end
41
+ end
42
+
43
+ describe "fixed path" do
44
+ before do
45
+ @each_record_xpath = "/oai:OAI-PMH/oai:ListRecords/oai:record"
46
+ end
47
+
48
+ it "reads" do
49
+ shared_tests
50
+ end
51
+ end
52
+
53
+ describe "floating path" do
54
+ before do
55
+ @each_record_xpath = "//oai:record"
56
+ end
57
+
58
+ it "reads" do
59
+ shared_tests
60
+ end
61
+ end
62
+
63
+
64
+ describe "extra_xpath_hooks" do
65
+ it "catches oai-pmh resumption token" do
66
+ @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {
67
+ "nokogiri.namespaces" => @namespaces,
68
+ "nokogiri.each_record_xpath" => "//oai:record",
69
+ "nokogiri_reader.extra_xpath_hooks" => {
70
+ "//oai:resumptionToken" => lambda do |node, clipboard|
71
+ clipboard[:resumptionToken] = node.text
72
+ end
73
+ }
74
+ })
75
+ _records = @reader.to_a
76
+ assert_equal "oai_dc.f(2018-05-03T18:09:08Z).u(2018-06-15T19:25:21Z).t(6387):100", @reader.clipboard[:resumptionToken]
77
+ end
78
+ end
79
+
80
+ describe "outer namespaces" do
81
+ it "are preserved" do
82
+ @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(support_file_path("namespace-test.xml")), {
83
+ "nokogiri.namespaces" => { mytop: "http://example.org/top" },
84
+ "nokogiri.each_record_xpath" => "//mytop:record"
85
+ })
86
+ yielded_records = []
87
+ @reader.each { |record|
88
+ yielded_records << record
89
+ }
90
+
91
+ assert yielded_records.length > 0
92
+
93
+ expected_namespaces = {"xmlns"=>"http://example.org/top", "xmlns:a"=>"http://example.org/a", "xmlns:b"=>"http://example.org/b"}
94
+ yielded_records.each do |rec|
95
+ assert_equal expected_namespaces, rec.namespaces
96
+ end
97
+ end
98
+ end
99
+ end
100
+
101
+ describe "without namespaces" do
102
+ before do
103
+ @namespaces = {}
104
+ @xml_sample_path = support_file_path("sample-oai-no-namespace.xml")
105
+ end
106
+
107
+ describe "fixed path" do
108
+ before do
109
+ @each_record_xpath = "/OAI-PMH/ListRecords/record"
110
+ end
111
+
112
+ it "reads" do
113
+ shared_tests
114
+ end
115
+ end
116
+
117
+ describe "floating path" do
118
+ before do
119
+ @each_record_xpath = "//record"
120
+ end
121
+
122
+ it "reads" do
123
+ shared_tests
124
+ end
125
+ end
126
+ end
127
+
128
+
129
+ def shared_tests
130
+ @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {
131
+ "nokogiri.namespaces" => @namespaces,
132
+ "nokogiri.each_record_xpath" => @each_record_xpath
133
+ })
134
+
135
+ yielded_records = []
136
+ @reader.each { |record|
137
+ yielded_records << record
138
+ }
139
+
140
+
141
+ manually_extracted = Nokogiri::XML.parse(File.open(@xml_sample_path)).xpath(@each_record_xpath, @namespaces)
142
+ manually_extracted.collect do |node|
143
+ # nokogiri makes it so hard to reliably get an Element to serialize to XML with all
144
+ # it's inherited namespace declerations. :( We're only doing this for testing purposes
145
+ # anyway. This may not handle everything, but handles what we need in the test right now
146
+ if node.namespace
147
+ node["xmlns"] = node.namespace.href
148
+ end
149
+ end
150
+
151
+ assert_length manually_extracted.size, yielded_records
152
+ assert yielded_records.all? {|r| r.kind_of? Nokogiri::XML::Document }
153
+ assert_equal manually_extracted.collect(&:to_xml), yielded_records.collect(&:root).collect(&:to_xml)
154
+ end
155
+
156
+ describe "without each_record_xpath" do
157
+ before do
158
+ @xml_sample_path = support_file_path("namespace-test.xml")
159
+ end
160
+ it "yields whole file as one record" do
161
+ @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {})
162
+
163
+ yielded_records = @reader.to_a
164
+
165
+ assert_length 1, yielded_records
166
+ assert_equal Nokogiri::XML.parse(File.open(@xml_sample_path)).to_xml, yielded_records.first.to_xml
167
+ end
168
+ end
169
+ end
@@ -5,7 +5,7 @@ describe "Traject::Indexer::Context" do
5
5
  describe "source_record_id" do
6
6
  before do
7
7
  @record = MARC::Reader.new(support_file_path('test_data.utf8.mrc')).first
8
- @context = Traject::Indexer::Context.new
8
+ @context = Traject::Indexer::Context.new(source_record_id_proc: Traject::Indexer::MarcIndexer.new.source_record_id_proc)
9
9
  @record_001 = " 00282214 " # from the mrc file
10
10
  end
11
11
 
@@ -13,23 +13,33 @@ describe "Traject::Indexer::Context" do
13
13
  @context.source_record = @record
14
14
  assert_equal @record_001, @context.source_record_id
15
15
  end
16
+ end
16
17
 
17
- it "gets it from the id" do
18
- @context.output_hash['id'] = 'the_record_id'
19
- assert_equal 'the_record_id', @context.source_record_id
20
- end
18
+ describe "#record_inspect" do
19
+ before do
20
+ @record = MARC::Reader.new(support_file_path('test_data.utf8.mrc')).first
21
+ @source_record_id_proc = Traject::Indexer::MarcIndexer.new.source_record_id_proc
22
+ @record_001 = " 00282214 " # from the mrc file
21
23
 
22
- it "gets from the id with non-MARC source" do
23
- @context.source_record = Object.new
24
- @context.output_hash['id'] = 'the_record_id'
25
- assert_equal 'the_record_id', @context.source_record_id
24
+ @position = 10
25
+ @input_name = "some_file.mrc"
26
+ @position_in_input = 10
26
27
  end
27
28
 
28
- it "gets it from both 001 and id" do
29
- @context.output_hash['id'] = 'the_record_id'
30
- @context.source_record = @record
31
- assert_equal [@record_001, 'the_record_id'].join('/'), @context.source_record_id
29
+ it "can print complete inspect label" do
30
+ @context = Traject::Indexer::Context.new(
31
+ source_record: @record,
32
+ source_record_id_proc: @source_record_id_proc,
33
+ position: @position,
34
+ input_name: @input_name,
35
+ position_in_input: @position_in_input
36
+ )
37
+ @context.output_hash["id"] = "output_id"
38
+
39
+ assert_equal "<record ##{@position} (#{@input_name} ##{@position_in_input}), source_id:#{@record_001} output_id:output_id>", @context.record_inspect
32
40
  end
41
+
33
42
  end
34
43
 
44
+
35
45
  end
@@ -0,0 +1,59 @@
1
+ require 'test_helper'
2
+
3
+ describe 'Custom mapping error handler' do
4
+ # the exception thrown by the custom handler
5
+ class CustomFakeException < StandardError; end
6
+
7
+ let(:indexer) { Traject::Indexer.new }
8
+
9
+ it 'invokes the default handler when custom handler is not set' do
10
+ output = StringIO.new
11
+ logger =Logger.new(output)
12
+ indexer.logger = logger
13
+ indexer.configure do
14
+ to_field 'id' do |_, _, _|
15
+ raise CustomFakeException, "I just like raising errors"
16
+ end
17
+ end
18
+
19
+ e = assert_raises(CustomFakeException) do
20
+ indexer.map_record({})
21
+ end
22
+
23
+ assert_equal "I just like raising errors", e.message
24
+ assert output.string =~ /while executing \(to_field \"id\" at .*error_handler_test.rb:\d+\)/
25
+ assert output.string =~ /CustomFakeException: I just like raising errors/
26
+ end
27
+
28
+ it 'invokes the custom handler when set' do
29
+ indexer.configure do
30
+ settings do
31
+ provide 'mapping_rescue', -> (ctx, e) {
32
+ raise CustomFakeException, "custom handler called #{ctx.record_inspect}: #{ctx.index_step.inspect}, #{e.inspect}"
33
+ }
34
+ end
35
+
36
+ to_field 'id' do |_context , _exception|
37
+ raise 'this was always going to fail'
38
+ end
39
+ end
40
+ e = assert_raises(CustomFakeException) { indexer.map_record({}) }
41
+ assert e.message =~ /\(to_field \"id\" at .*error_handler_test.rb:\d+\)/
42
+ end
43
+
44
+ it "custom handler can skip and continue" do
45
+ indexer.configure do
46
+ settings do
47
+ provide "mapping_rescue", -> (context, exception) {
48
+ context.skip!
49
+ }
50
+ end
51
+
52
+ to_field 'id' do |_context , _exception|
53
+ raise 'this was always going to fail'
54
+ end
55
+ end
56
+
57
+ assert_nil indexer.map_record({})
58
+ end
59
+ end