traject 2.3.4 → 3.0.0.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. checksums.yaml +5 -5
  2. data/.travis.yml +16 -9
  3. data/CHANGES.md +74 -1
  4. data/Gemfile +2 -1
  5. data/README.md +104 -53
  6. data/Rakefile +8 -1
  7. data/doc/indexing_rules.md +79 -63
  8. data/doc/programmatic_use.md +218 -0
  9. data/doc/settings.md +28 -1
  10. data/doc/xml.md +134 -0
  11. data/lib/traject.rb +5 -0
  12. data/lib/traject/array_writer.rb +34 -0
  13. data/lib/traject/command_line.rb +18 -22
  14. data/lib/traject/debug_writer.rb +2 -5
  15. data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
  16. data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
  17. data/lib/traject/indexer.rb +321 -92
  18. data/lib/traject/indexer/context.rb +39 -13
  19. data/lib/traject/indexer/marc_indexer.rb +30 -0
  20. data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
  21. data/lib/traject/indexer/settings.rb +36 -53
  22. data/lib/traject/indexer/step.rb +27 -33
  23. data/lib/traject/macros/marc21.rb +37 -12
  24. data/lib/traject/macros/nokogiri_macros.rb +43 -0
  25. data/lib/traject/macros/transformation.rb +162 -0
  26. data/lib/traject/marc_extractor.rb +2 -0
  27. data/lib/traject/ndj_reader.rb +1 -1
  28. data/lib/traject/nokogiri_reader.rb +179 -0
  29. data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
  30. data/lib/traject/solr_json_writer.rb +19 -12
  31. data/lib/traject/thread_pool.rb +13 -0
  32. data/lib/traject/util.rb +14 -2
  33. data/lib/traject/version.rb +1 -1
  34. data/test/debug_writer_test.rb +3 -3
  35. data/test/delimited_writer_test.rb +3 -3
  36. data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
  37. data/test/indexer/context_test.rb +23 -13
  38. data/test/indexer/error_handler_test.rb +59 -0
  39. data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
  40. data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
  41. data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
  42. data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
  43. data/test/indexer/macros/to_field_test.rb +2 -2
  44. data/test/indexer/macros/transformation_test.rb +177 -0
  45. data/test/indexer/map_record_test.rb +2 -3
  46. data/test/indexer/nokogiri_indexer_test.rb +103 -0
  47. data/test/indexer/process_record_test.rb +55 -0
  48. data/test/indexer/process_with_test.rb +148 -0
  49. data/test/indexer/read_write_test.rb +52 -2
  50. data/test/indexer/settings_test.rb +34 -24
  51. data/test/indexer/to_field_test.rb +27 -2
  52. data/test/marc_extractor_test.rb +7 -7
  53. data/test/marc_reader_test.rb +4 -4
  54. data/test/nokogiri_reader_test.rb +158 -0
  55. data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
  56. data/test/solr_json_writer_test.rb +24 -28
  57. data/test/test_helper.rb +8 -2
  58. data/test/test_support/namespace-test.xml +7 -0
  59. data/test/test_support/nokogiri_demo_config.rb +17 -0
  60. data/test/test_support/oai-pmh-one-record-2.xml +24 -0
  61. data/test/test_support/oai-pmh-one-record-first.xml +24 -0
  62. data/test/test_support/sample-oai-no-namespace.xml +197 -0
  63. data/test/test_support/sample-oai-pmh.xml +197 -0
  64. data/test/thread_pool_test.rb +38 -0
  65. data/test/translation_map_test.rb +3 -3
  66. data/test/translation_maps/ruby_map.rb +2 -1
  67. data/test/translation_maps/yaml_map.yaml +2 -1
  68. data/traject.gemspec +4 -11
  69. metadata +92 -6
data/lib/traject/solr_json_writer.rb
@@ -47,6 +47,8 @@ require 'concurrent' # for atomic_fixnum
 class Traject::SolrJsonWriter
   include Traject::QualifiedConstGet
 
+  URI_REGEXP = URI::Parser.new.make_regexp.freeze
+
   DEFAULT_MAX_SKIPPED = 0
   DEFAULT_BATCH_SIZE = 100
 
@@ -105,6 +107,18 @@ class Traject::SolrJsonWriter
     end
   end
 
+  # Not part of standard writer API.
+  #
+  # If we are batching adds, and have some not-yet-written ones queued up --
+  # flush em all to solr.
+  #
+  # This should be thread-safe to call, but the write does take place in
+  # the caller's thread, no threading is done for you here, regardless of setting
+  # of solr_writer.thread_pool
+  def flush
+    send_batch( Traject::Util.drain_queue(@batched_queue) )
+  end
+
   # Send the given batch of contexts. If something goes wrong, send
   # them one at a time.
   # @param [Array<Traject::Indexer::Context>] an array of contexts
@@ -147,7 +161,7 @@ class Traject::SolrJsonWriter
       else
         msg = "Solr error response: #{resp.status}: #{resp.body}"
       end
-      logger.error "Could not add record #{c.source_record_id} at source file position #{c.position}: #{msg}"
+      logger.error "Could not add record #{c.record_inspect}: #{msg}"
       logger.debug(c.source_record.to_s)
 
       @skipped_record_incrementer.increment
@@ -236,7 +250,7 @@ class Traject::SolrJsonWriter
 
   # If we've got a solr.update_url, make sure it's ok
   def check_solr_update_url(url)
-    unless /^#{URI::regexp}$/.match(url)
+    unless /^#{URI_REGEXP}$/.match(url)
      raise ArgumentError.new("#{self.class.name} setting `solr.update_url` doesn't look like a URL: `#{url}`")
     end
     url
@@ -249,18 +263,11 @@ class Traject::SolrJsonWriter
     end
 
     # Not a URL? Bail
-    unless /^#{URI::regexp}$/.match(url)
+    unless /^#{URI_REGEXP}$/.match(url)
      raise ArgumentError.new("#{self.class.name} setting `solr.url` doesn't look like a URL: `#{url}`")
     end
 
-    # First, try the /update/json handler
-    candidate = [url.chomp('/'), 'update', 'json'].join('/')
-    resp = @http_client.get(candidate)
-    if resp.status == 404
-      candidate = [url.chomp('/'), 'update'].join('/')
-    end
-    candidate
+    # Assume the /update/json handler
+    return [url.chomp('/'), 'update', 'json'].join('/')
   end
-
-
 end
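
The new flush method above is aimed at programmatic use of the writer outside a full traject run: it sends any queued adds to Solr immediately, in the calling thread. A minimal usage sketch, assuming the writer can be constructed directly with a settings hash as described in the new doc/programmatic_use.md; the Solr URL and field values are placeholders, not taken from this diff.

    require 'traject'
    require 'traject/solr_json_writer'

    # Placeholder core URL; with batch_size 100, puts are queued rather than sent at once.
    writer = Traject::SolrJsonWriter.new(
      "solr.url"               => "http://localhost:8983/solr/my_core",
      "solr_writer.batch_size" => 100
    )

    context = Traject::Indexer::Context.new
    context.output_hash["id"]    = ["doc-1"]
    context.output_hash["title"] = ["Example title"]

    writer.put(context)   # queued in the writer's batch queue
    writer.flush          # push the queued batch to Solr now, in this thread
    writer.close          # flush anything remaining and shut down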
data/lib/traject/thread_pool.rb
@@ -50,11 +50,24 @@ module Traject
   class ThreadPool
     attr_reader :pool_size, :queue_capacity
 
+    @@disable_concurrency = false
+
+    # Calling Traject::ThreadPool.disable_concurrency! permanently and irrevocably (for program execution)
+    # forces all ThreadPools to have a pool_size of 0 -- running all work inline -- so should disable all
+    # use of threads in Traject.
+    def self.disable_concurrency! ; @@disable_concurrency = true ; end
+    def self.concurrency_disabled? ; @@disable_concurrency ; end
+
     # First arg is pool size, 0 or nil and we'll be a null/no-op pool which executes
     # work in caller thread.
     def initialize(pool_size)
       @thread_pool = nil # assume we don't have one
       @exceptions_caught_queue = [] # start off without exceptions
+
+      if self.class.concurrency_disabled?
+        pool_size = 0
+      end
+
       unless pool_size.nil? || pool_size == 0
         @pool_size = pool_size.to_i
         @queue_capacity = pool_size * 3
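
Traject::ThreadPool.disable_concurrency! is chiefly a debugging aid: called once, before any pools are created, it turns every pool into a no-op pool that runs work inline in the caller's thread. A short sketch; the indexer setup shown is illustrative rather than taken from this diff.

    require 'traject'

    # Irreversible for the rest of the process, so do it before indexing starts.
    Traject::ThreadPool.disable_concurrency!

    indexer = Traject::Indexer.new("writer_class_name" => "Traject::DebugWriter")
    # ...configure and run as usual; with concurrency disabled, exceptions and
    # backtraces surface directly in the calling thread.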
data/lib/traject/util.rb
@@ -60,7 +60,6 @@ module Traject
       if line.start_with?(file_path)
         if m = /\A.*\:(\d+)\:in/.match(line)
           return m[1].to_i
-          break
         end
       end
     end
@@ -116,11 +115,24 @@ module Traject
          result << queue.deq(:raise_if_empty)
        end
      rescue ThreadError
-        # Need do nothing, queue was concurrently popped, no biggie
+        # Need do nothing, queue was concurrently popped, no biggie, but let's
+        # stop iterating and return what we've got.
+        return result
      end
 
      return result
    end
 
+    def self.is_jruby?
+      unless defined?(@is_jruby)
+        @is_jruby = defined?(JRUBY_VERSION)
+      end
+      @is_jruby
+    end
+    # How can we refer to an io object input in logs? For now, if it's a file-like
+    # object, we can use #path.
+    def self.io_name(io_like_object)
+      io_like_object.path if io_like_object.respond_to?(:path)
+    end
   end
 end
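
The two new Traject::Util helpers are small conveniences for the readers and logging: is_jruby? memoizes the platform check, and io_name returns a printable name for file-like inputs, or nil when the object has no #path. Illustrative calls with placeholder file names:

    require 'traject'
    require 'stringio'

    Traject::Util.is_jruby?                          # => true only under JRuby
    Traject::Util.io_name(File.open("records.xml"))  # => "records.xml"
    Traject::Util.io_name(StringIO.new("<a/>"))      # => nil, nothing to report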
data/lib/traject/version.rb
@@ -1,3 +1,3 @@
 module Traject
-  VERSION = "2.3.4"
+  VERSION = "3.0.0.alpha.1"
 end
data/test/debug_writer_test.rb
@@ -9,7 +9,7 @@ describe 'Simple output' do
   before do
     @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
     @indexer = Traject::Indexer.new
-    @indexer.instance_eval do
+    @indexer.configure do
       to_field "id", extract_marc("001", :first => true)
       to_field "title", extract_marc("245ab")
     end
@@ -46,7 +46,7 @@ describe 'Simple output' do
      "record_num_1 title #{@title}",
    ]
    assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
-    assert_match /At least one record \(\#1\) doesn't define field 'id'/, logger_strio.string
+    assert_match(/At least one record \(<record #1>\) doesn't define field 'id'/, logger_strio.string)
    @writer.close
 
   end
@@ -68,7 +68,7 @@ describe 'Simple output' do
      "record_num_1 title #{@title}",
    ]
    assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
-    assert_match /At least one record \(\#1\) doesn't define field 'iden'/, logger_strio.string
+    assert_match(/At least one record \(<record #1, output_id:2710183>\) doesn't define field 'iden'/, logger_strio.string)
    writer.close
 
   end
data/test/delimited_writer_test.rb
@@ -39,13 +39,13 @@ describe "Delimited/CSV Writers" do
   end
 
   it "outputs a header if asked to" do
-    dw = Traject::DelimitedWriter.new(@settings)
+    Traject::DelimitedWriter.new(@settings)
     @out.string.chomp.must_equal %w[four one two].join("\t")
   end
 
   it "doesn't output a header if asked not to" do
     @settings['delimited_writer.header'] = 'false'
-    dw = Traject::DelimitedWriter.new(@settings)
+    Traject::DelimitedWriter.new(@settings)
     @out.string.must_be_empty
   end
 
@@ -69,7 +69,7 @@ describe "Delimited/CSV Writers" do
   end
 
   it "writes the header" do
-    cw = Traject::CSVWriter.new(@settings)
+    Traject::CSVWriter.new(@settings)
     @out.string.chomp.must_equal 'four,one,two'
   end
 
data/test/experimental_nokogiri_streaming_reader_test.rb
@@ -0,0 +1,169 @@
+require 'test_helper'
+require 'traject/experimental_nokogiri_streaming_reader'
+
+# Streaming nokogiri reader is experimental, half-finished, and not supported for real use.
+describe "Traject::ExperimentalNokogiriStreamingReader" do
+  describe "with namespaces" do
+    before do
+      @namespaces = { "oai" => "http://www.openarchives.org/OAI/2.0/" }
+      @xml_sample_path = support_file_path("sample-oai-pmh.xml")
+    end
+
+    describe "invalid settings" do
+      it "default_namespaces not a hash raises" do
+        error = assert_raises(ArgumentError) {
+          @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {
+            "nokogiri.namespaces" => "i am not a hash",
+          })
+        }
+        assert(error.message =~ /nokogiri.namespaces must be a hash/)
+      end
+
+      it "each_record_xpath with unregistered prefix raises" do
+        error = assert_raises(ArgumentError) {
+          @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {
+            "nokogiri.namespaces" => @namespaces,
+            "nokogiri.each_record_xpath" => "//foo:bar"
+          })
+        }
+        assert(error.message =~ %r{Can't find namespace prefix 'foo' in '//foo:bar'})
+      end
+
+      it "raises on some unsupported xpath" do
+        error = assert_raises(ArgumentError) {
+          @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {
+            "nokogiri.namespaces" => @namespaces,
+            "nokogiri.each_record_xpath" => "//oai:record[@id='foo']"
+          })
+        }
+        assert(error.message =~ /Only very simple xpaths supported\./)
+      end
+    end
+
+    describe "fixed path" do
+      before do
+        @each_record_xpath = "/oai:OAI-PMH/oai:ListRecords/oai:record"
+      end
+
+      it "reads" do
+        shared_tests
+      end
+    end
+
+    describe "floating path" do
+      before do
+        @each_record_xpath = "//oai:record"
+      end
+
+      it "reads" do
+        shared_tests
+      end
+    end
+
+
+    describe "extra_xpath_hooks" do
+      it "catches oai-pmh resumption token" do
+        @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {
+          "nokogiri.namespaces" => @namespaces,
+          "nokogiri.each_record_xpath" => "//oai:record",
+          "nokogiri_reader.extra_xpath_hooks" => {
+            "//oai:resumptionToken" => lambda do |node, clipboard|
+              clipboard[:resumptionToken] = node.text
+            end
+          }
+        })
+        _records = @reader.to_a
+        assert_equal "oai_dc.f(2018-05-03T18:09:08Z).u(2018-06-15T19:25:21Z).t(6387):100", @reader.clipboard[:resumptionToken]
+      end
+    end
+
+    describe "outer namespaces" do
+      it "are preserved" do
+        @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(support_file_path("namespace-test.xml")), {
+          "nokogiri.namespaces" => { mytop: "http://example.org/top" },
+          "nokogiri.each_record_xpath" => "//mytop:record"
+        })
+        yielded_records = []
+        @reader.each { |record|
+          yielded_records << record
+        }
+
+        assert yielded_records.length > 0
+
+        expected_namespaces = {"xmlns"=>"http://example.org/top", "xmlns:a"=>"http://example.org/a", "xmlns:b"=>"http://example.org/b"}
+        yielded_records.each do |rec|
+          assert_equal expected_namespaces, rec.namespaces
+        end
+      end
+    end
+  end
+
+  describe "without namespaces" do
+    before do
+      @namespaces = {}
+      @xml_sample_path = support_file_path("sample-oai-no-namespace.xml")
+    end
+
+    describe "fixed path" do
+      before do
+        @each_record_xpath = "/OAI-PMH/ListRecords/record"
+      end
+
+      it "reads" do
+        shared_tests
+      end
+    end
+
+    describe "floating path" do
+      before do
+        @each_record_xpath = "//record"
+      end
+
+      it "reads" do
+        shared_tests
+      end
+    end
+  end
+
+
+  def shared_tests
+    @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {
+      "nokogiri.namespaces" => @namespaces,
+      "nokogiri.each_record_xpath" => @each_record_xpath
+    })
+
+    yielded_records = []
+    @reader.each { |record|
+      yielded_records << record
+    }
+
+
+    manually_extracted = Nokogiri::XML.parse(File.open(@xml_sample_path)).xpath(@each_record_xpath, @namespaces)
+    manually_extracted.collect do |node|
+      # nokogiri makes it so hard to reliably get an Element to serialize to XML with all
+      # it's inherited namespace declerations. :( We're only doing this for testing purposes
+      # anyway. This may not handle everything, but handles what we need in the test right now
+      if node.namespace
+        node["xmlns"] = node.namespace.href
+      end
+    end
+
+    assert_length manually_extracted.size, yielded_records
+    assert yielded_records.all? {|r| r.kind_of? Nokogiri::XML::Document }
+    assert_equal manually_extracted.collect(&:to_xml), yielded_records.collect(&:root).collect(&:to_xml)
+  end
+
+  describe "without each_record_xpath" do
+    before do
+      @xml_sample_path = support_file_path("namespace-test.xml")
+    end
+    it "yields whole file as one record" do
+      @reader = Traject::ExperimentalNokogiriStreamingReader.new(File.open(@xml_sample_path), {})
+
+      yielded_records = @reader.to_a
+
+      assert_length 1, yielded_records
+      assert_equal Nokogiri::XML.parse(File.open(@xml_sample_path)).to_xml, yielded_records.first.to_xml
+    end
+  end
+end
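
The tests above exercise the experimental streaming reader, but the settings they use ("nokogiri.namespaces", "nokogiri.each_record_xpath") are shared with the stable Traject::NokogiriReader added in this release (data/lib/traject/nokogiri_reader.rb, documented in doc/xml.md). A hedged sketch of driving that reader directly, assuming it follows the usual traject reader API of new(io, settings) and, like the streaming reader tested here, yields Nokogiri::XML::Document objects; the file name and xpath are placeholders.

    require 'traject'
    require 'traject/nokogiri_reader'

    settings = {
      "nokogiri.namespaces"        => { "oai" => "http://www.openarchives.org/OAI/2.0/" },
      "nokogiri.each_record_xpath" => "//oai:record"
    }

    reader = Traject::NokogiriReader.new(File.open("sample-oai-pmh.xml"), settings)
    reader.each do |record|
      # each yielded record should be a Nokogiri::XML::Document for one oai:record
      puts record.root.name
    end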
data/test/indexer/context_test.rb
@@ -5,7 +5,7 @@ describe "Traject::Indexer::Context" do
   describe "source_record_id" do
     before do
       @record = MARC::Reader.new(support_file_path('test_data.utf8.mrc')).first
-      @context = Traject::Indexer::Context.new
+      @context = Traject::Indexer::Context.new(source_record_id_proc: Traject::Indexer::MarcIndexer.new.source_record_id_proc)
       @record_001 = " 00282214 " # from the mrc file
     end
 
@@ -13,23 +13,33 @@ describe "Traject::Indexer::Context" do
      @context.source_record = @record
      assert_equal @record_001, @context.source_record_id
    end
+  end
 
-    it "gets it from the id" do
-      @context.output_hash['id'] = 'the_record_id'
-      assert_equal 'the_record_id', @context.source_record_id
-    end
+  describe "#record_inspect" do
+    before do
+      @record = MARC::Reader.new(support_file_path('test_data.utf8.mrc')).first
+      @source_record_id_proc = Traject::Indexer::MarcIndexer.new.source_record_id_proc
+      @record_001 = " 00282214 " # from the mrc file
 
-    it "gets from the id with non-MARC source" do
-      @context.source_record = Object.new
-      @context.output_hash['id'] = 'the_record_id'
-      assert_equal 'the_record_id', @context.source_record_id
+      @position = 10
+      @input_name = "some_file.mrc"
+      @position_in_input = 10
    end
 
-    it "gets it from both 001 and id" do
-      @context.output_hash['id'] = 'the_record_id'
-      @context.source_record = @record
-      assert_equal [@record_001, 'the_record_id'].join('/'), @context.source_record_id
+    it "can print complete inspect label" do
+      @context = Traject::Indexer::Context.new(
+        source_record: @record,
+        source_record_id_proc: @source_record_id_proc,
+        position: @position,
+        input_name: @input_name,
+        position_in_input: @position_in_input
+      )
+      @context.output_hash["id"] = "output_id"
+
+      assert_equal "<record ##{@position} (#{@input_name} ##{@position_in_input}), source_id:#{@record_001} output_id:output_id>", @context.record_inspect
    end
+
  end
 
+
 end
data/test/indexer/error_handler_test.rb
@@ -0,0 +1,59 @@
+require 'test_helper'
+
+describe 'Custom mapping error handler' do
+  # the exception thrown by the custom handler
+  class CustomFakeException < StandardError; end
+
+  let(:indexer) { Traject::Indexer.new }
+
+  it 'invokes the default handler when custom handler is not set' do
+    output = StringIO.new
+    logger =Logger.new(output)
+    indexer.logger = logger
+    indexer.configure do
+      to_field 'id' do |_, _, _|
+        raise CustomFakeException, "I just like raising errors"
+      end
+    end
+
+    e = assert_raises(CustomFakeException) do
+      indexer.map_record({})
+    end
+
+    assert_equal "I just like raising errors", e.message
+    assert output.string =~ /while executing \(to_field \"id\" at .*error_handler_test.rb:\d+\)/
+    assert output.string =~ /CustomFakeException: I just like raising errors/
+  end
+
+  it 'invokes the custom handler when set' do
+    indexer.configure do
+      settings do
+        provide 'mapping_rescue', -> (ctx, e) {
+          raise CustomFakeException, "custom handler called #{ctx.record_inspect}: #{ctx.index_step.inspect}, #{e.inspect}"
+        }
+      end
+
+      to_field 'id' do |_context , _exception|
+        raise 'this was always going to fail'
+      end
+    end
+    e = assert_raises(CustomFakeException) { indexer.map_record({}) }
+    assert e.message =~ /\(to_field \"id\" at .*error_handler_test.rb:\d+\)/
+  end
+
+  it "custom handler can skip and continue" do
+    indexer.configure do
+      settings do
+        provide "mapping_rescue", -> (context, exception) {
+          context.skip!
+        }
+      end
+
+      to_field 'id' do |_context , _exception|
+        raise 'this was always going to fail'
+      end
+    end
+
+    assert_nil indexer.map_record({})
+  end
+end
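
The mapping_rescue setting exercised above also works from an ordinary traject configuration file, for example to log the failing record and skip it instead of aborting the run. A sketch; the log message wording is illustrative, and it assumes the context handed to the handler exposes #logger alongside the #record_inspect and #skip! shown in these tests.

    # hypothetical snippet for a traject configuration file
    settings do
      provide "mapping_rescue", lambda { |context, exception|
        context.logger.error("Skipping #{context.record_inspect}: #{exception.class}: #{exception.message}")
        context.skip!
      }
    end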