traject 2.3.4 → 3.0.0.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69):
  1. checksums.yaml +5 -5
  2. data/.travis.yml +16 -9
  3. data/CHANGES.md +74 -1
  4. data/Gemfile +2 -1
  5. data/README.md +104 -53
  6. data/Rakefile +8 -1
  7. data/doc/indexing_rules.md +79 -63
  8. data/doc/programmatic_use.md +218 -0
  9. data/doc/settings.md +28 -1
  10. data/doc/xml.md +134 -0
  11. data/lib/traject.rb +5 -0
  12. data/lib/traject/array_writer.rb +34 -0
  13. data/lib/traject/command_line.rb +18 -22
  14. data/lib/traject/debug_writer.rb +2 -5
  15. data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
  16. data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
  17. data/lib/traject/indexer.rb +321 -92
  18. data/lib/traject/indexer/context.rb +39 -13
  19. data/lib/traject/indexer/marc_indexer.rb +30 -0
  20. data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
  21. data/lib/traject/indexer/settings.rb +36 -53
  22. data/lib/traject/indexer/step.rb +27 -33
  23. data/lib/traject/macros/marc21.rb +37 -12
  24. data/lib/traject/macros/nokogiri_macros.rb +43 -0
  25. data/lib/traject/macros/transformation.rb +162 -0
  26. data/lib/traject/marc_extractor.rb +2 -0
  27. data/lib/traject/ndj_reader.rb +1 -1
  28. data/lib/traject/nokogiri_reader.rb +179 -0
  29. data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
  30. data/lib/traject/solr_json_writer.rb +19 -12
  31. data/lib/traject/thread_pool.rb +13 -0
  32. data/lib/traject/util.rb +14 -2
  33. data/lib/traject/version.rb +1 -1
  34. data/test/debug_writer_test.rb +3 -3
  35. data/test/delimited_writer_test.rb +3 -3
  36. data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
  37. data/test/indexer/context_test.rb +23 -13
  38. data/test/indexer/error_handler_test.rb +59 -0
  39. data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
  40. data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
  41. data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
  42. data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
  43. data/test/indexer/macros/to_field_test.rb +2 -2
  44. data/test/indexer/macros/transformation_test.rb +177 -0
  45. data/test/indexer/map_record_test.rb +2 -3
  46. data/test/indexer/nokogiri_indexer_test.rb +103 -0
  47. data/test/indexer/process_record_test.rb +55 -0
  48. data/test/indexer/process_with_test.rb +148 -0
  49. data/test/indexer/read_write_test.rb +52 -2
  50. data/test/indexer/settings_test.rb +34 -24
  51. data/test/indexer/to_field_test.rb +27 -2
  52. data/test/marc_extractor_test.rb +7 -7
  53. data/test/marc_reader_test.rb +4 -4
  54. data/test/nokogiri_reader_test.rb +158 -0
  55. data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
  56. data/test/solr_json_writer_test.rb +24 -28
  57. data/test/test_helper.rb +8 -2
  58. data/test/test_support/namespace-test.xml +7 -0
  59. data/test/test_support/nokogiri_demo_config.rb +17 -0
  60. data/test/test_support/oai-pmh-one-record-2.xml +24 -0
  61. data/test/test_support/oai-pmh-one-record-first.xml +24 -0
  62. data/test/test_support/sample-oai-no-namespace.xml +197 -0
  63. data/test/test_support/sample-oai-pmh.xml +197 -0
  64. data/test/thread_pool_test.rb +38 -0
  65. data/test/translation_map_test.rb +3 -3
  66. data/test/translation_maps/ruby_map.rb +2 -1
  67. data/test/translation_maps/yaml_map.yaml +2 -1
  68. data/traject.gemspec +4 -11
  69. metadata +92 -6
@@ -0,0 +1,38 @@
1
+ require 'test_helper'
2
+ require 'rspec/mocks'
3
+
4
+ # MOST of ThreadPool is not tested directly at this point.
5
+ describe "Traject::ThreadPool" do
6
+ include ::RSpec::Mocks::ExampleMethods
7
+
8
+ # http://blog.plataformatec.com.br/2015/05/nobody-told-me-minitest-was-this-fun/
9
+ def before_setup
10
+ ::RSpec::Mocks.setup
11
+ super
12
+ end
13
+
14
+ def after_teardown
15
+ super
16
+ ::RSpec::Mocks.verify
17
+ ensure
18
+ ::RSpec::Mocks.teardown
19
+ end
20
+
21
+
22
+ describe "disable_concurrency!" do
23
+
24
+ it "disables concurrency" do
25
+ allow(Traject::ThreadPool).to receive(:concurrency_disabled?).and_return(true)
26
+
27
+ parent_thread_id = Thread.current.object_id
28
+
29
+ work_thread_id = Concurrent::AtomicFixnum.new
30
+
31
+ Traject::ThreadPool.new(10).maybe_in_thread_pool do
32
+ work_thread_id.update { Thread.current.object_id }
33
+ end
34
+
35
+ assert_equal parent_thread_id, work_thread_id.value
36
+ end
37
+ end
38
+ end
@@ -57,7 +57,7 @@ describe "TranslationMap" do
57
57
 
58
58
  it "raises on syntax error in yaml" do
59
59
  exception = assert_raises(Psych::SyntaxError) do
60
- found = @cache.lookup("bad_yaml")
60
+ _found = @cache.lookup("bad_yaml")
61
61
  end
62
62
 
63
63
  assert exception.message.include?("bad_yaml.yaml"), "exception message includes source file"
@@ -65,7 +65,7 @@ describe "TranslationMap" do
65
65
 
66
66
  it "raises on syntax error in ruby" do
67
67
  exception = assert_raises(SyntaxError) do
68
- found = @cache.lookup("bad_ruby")
68
+ _found = @cache.lookup("bad_ruby")
69
69
  end
70
70
  assert exception.message.include?("bad_ruby.rb"), "exception message includes source file"
71
71
  end
@@ -118,7 +118,7 @@ describe "TranslationMap" do
118
118
 
119
119
  assert_equal "DEFAULT LITERAL", map["not in the map"]
120
120
  end
121
-
121
+
122
122
  it "respects __default__ __passthrough__" do
123
123
  map = Traject::TranslationMap.new("default_passthrough")
124
124
 
@@ -2,7 +2,8 @@ foo = "bar"
2
2
 
3
3
  some_hash = {
4
4
  "key1" => "value1",
5
- "array_key" => %w{one two three}
5
+ "array_key" => %w{one two three},
6
+ "key_to_be_overridden" => "value_from_ruby"
6
7
  }
7
8
  some_hash["also"] = "this"
8
9
 
@@ -1,7 +1,8 @@
1
1
  key1: value1
2
+ key_to_be_overridden: value_from_yaml
2
3
  array_key:
3
4
  -one
4
5
  -two
5
6
  -three
6
7
  # comment
7
- other: yes
8
+ other: yes
@@ -8,7 +8,7 @@ Gem::Specification.new do |spec|
8
8
  spec.version = Traject::VERSION
9
9
  spec.authors = ["Jonathan Rochkind", "Bill Dueber"]
10
10
  spec.email = ["none@nowhere.org"]
11
- spec.summary = %q{Index MARC to Solr; or generally process source records to hash-like structures}
11
+ spec.summary = %q{An easy to use, high-performance, flexible and extensible metadata transformation system, focused on library-archives-museums input, and indexing to Solr as output.}
12
12
  spec.homepage = "http://github.com/traject/traject"
13
13
  spec.license = "MIT"
14
14
 
@@ -28,20 +28,13 @@ Gem::Specification.new do |spec|
28
28
  spec.add_dependency "yell" # logging
29
29
  spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
30
30
  spec.add_dependency "httpclient", "~> 2.5"
31
+ spec.add_dependency "http", "~> 3.0" # used in oai_pmh_reader, may use more extensively in future instead of httpclient
31
32
  spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
32
-
33
- # If we're building the package under JRuby, add in the
34
- # jruby-only gems and specify the platform.
35
-
36
- if defined? JRUBY_VERSION
37
- spec.platform = 'java'
38
- spec.add_dependency "traject-marc4j_reader", "~> 1.0"
39
- else
40
- spec.platform = "ruby"
41
- end
33
+ spec.add_dependency "nokogiri", "~> 1.0" # NokogiriIndexer
42
34
 
43
35
  spec.add_development_dependency "bundler", '~> 1.7'
44
36
 
45
37
  spec.add_development_dependency "rake"
46
38
  spec.add_development_dependency "minitest"
39
+ spec.add_development_dependency "rspec-mocks", '~> 3.4'
47
40
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.4
4
+ version: 3.0.0.alpha.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Rochkind
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-12-06 00:00:00.000000000 Z
12
+ date: 2018-08-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: concurrent-ruby
@@ -115,6 +115,20 @@ dependencies:
115
115
  - - "~>"
116
116
  - !ruby/object:Gem::Version
117
117
  version: '2.5'
118
+ - !ruby/object:Gem::Dependency
119
+ name: http
120
+ requirement: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '3.0'
125
+ type: :runtime
126
+ prerelease: false
127
+ version_requirements: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '3.0'
118
132
  - !ruby/object:Gem::Dependency
119
133
  name: marc-fastxmlwriter
120
134
  requirement: !ruby/object:Gem::Requirement
@@ -129,6 +143,20 @@ dependencies:
129
143
  - - "~>"
130
144
  - !ruby/object:Gem::Version
131
145
  version: '1.0'
146
+ - !ruby/object:Gem::Dependency
147
+ name: nokogiri
148
+ requirement: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '1.0'
153
+ type: :runtime
154
+ prerelease: false
155
+ version_requirements: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '1.0'
132
160
  - !ruby/object:Gem::Dependency
133
161
  name: bundler
134
162
  requirement: !ruby/object:Gem::Requirement
@@ -171,6 +199,20 @@ dependencies:
171
199
  - - ">="
172
200
  - !ruby/object:Gem::Version
173
201
  version: '0'
202
+ - !ruby/object:Gem::Dependency
203
+ name: rspec-mocks
204
+ requirement: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - "~>"
207
+ - !ruby/object:Gem::Version
208
+ version: '3.4'
209
+ type: :development
210
+ prerelease: false
211
+ version_requirements: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - "~>"
214
+ - !ruby/object:Gem::Version
215
+ version: '3.4'
174
216
  description:
175
217
  email:
176
218
  - none@nowhere.org
@@ -182,7 +224,9 @@ extra_rdoc_files:
182
224
  - doc/extending.md
183
225
  - doc/indexing_rules.md
184
226
  - doc/other_commands.md
227
+ - doc/programmatic_use.md
185
228
  - doc/settings.md
229
+ - doc/xml.md
186
230
  files:
187
231
  - ".gitignore"
188
232
  - ".travis.yml"
@@ -198,7 +242,9 @@ files:
198
242
  - doc/extending.md
199
243
  - doc/indexing_rules.md
200
244
  - doc/other_commands.md
245
+ - doc/programmatic_use.md
201
246
  - doc/settings.md
247
+ - doc/xml.md
202
248
  - index_bench/batch.dat
203
249
  - index_bench/common.rb
204
250
  - index_bench/index_bench.rb
@@ -216,12 +262,17 @@ files:
216
262
  - index_bench/translation_maps/umich/obsolete_cop.yaml
217
263
  - lib/tasks/load_maps.rake
218
264
  - lib/traject.rb
265
+ - lib/traject/array_writer.rb
219
266
  - lib/traject/command_line.rb
220
267
  - lib/traject/csv_writer.rb
221
268
  - lib/traject/debug_writer.rb
222
269
  - lib/traject/delimited_writer.rb
270
+ - lib/traject/experimental_nokogiri_streaming_reader.rb
271
+ - lib/traject/hashie/indifferent_access_fix.rb
223
272
  - lib/traject/indexer.rb
224
273
  - lib/traject/indexer/context.rb
274
+ - lib/traject/indexer/marc_indexer.rb
275
+ - lib/traject/indexer/nokogiri_indexer.rb
225
276
  - lib/traject/indexer/settings.rb
226
277
  - lib/traject/indexer/step.rb
227
278
  - lib/traject/json_writer.rb
@@ -230,12 +281,16 @@ files:
230
281
  - lib/traject/macros/marc21.rb
231
282
  - lib/traject/macros/marc21_semantics.rb
232
283
  - lib/traject/macros/marc_format_classifier.rb
284
+ - lib/traject/macros/nokogiri_macros.rb
285
+ - lib/traject/macros/transformation.rb
233
286
  - lib/traject/marc_extractor.rb
234
287
  - lib/traject/marc_extractor_spec.rb
235
288
  - lib/traject/marc_reader.rb
236
289
  - lib/traject/mock_reader.rb
237
290
  - lib/traject/ndj_reader.rb
291
+ - lib/traject/nokogiri_reader.rb
238
292
  - lib/traject/null_writer.rb
293
+ - lib/traject/oai_pmh_nokogiri_reader.rb
239
294
  - lib/traject/qualified_const_get.rb
240
295
  - lib/traject/solr_json_writer.rb
241
296
  - lib/traject/thread_pool.rb
@@ -251,8 +306,10 @@ files:
251
306
  - lib/translation_maps/marc_languages.yaml
252
307
  - test/debug_writer_test.rb
253
308
  - test/delimited_writer_test.rb
309
+ - test/experimental_nokogiri_streaming_reader_test.rb
254
310
  - test/indexer/context_test.rb
255
311
  - test/indexer/each_record_test.rb
312
+ - test/indexer/error_handler_test.rb
256
313
  - test/indexer/load_config_file_test.rb
257
314
  - test/indexer/macros/macros_marc21_semantics_test.rb
258
315
  - test/indexer/macros/marc21/extract_all_marc_values_test.rb
@@ -260,7 +317,11 @@ files:
260
317
  - test/indexer/macros/marc21/serialize_marc_test.rb
261
318
  - test/indexer/macros/marc21/trim_punctuation_test.rb
262
319
  - test/indexer/macros/to_field_test.rb
320
+ - test/indexer/macros/transformation_test.rb
263
321
  - test/indexer/map_record_test.rb
322
+ - test/indexer/nokogiri_indexer_test.rb
323
+ - test/indexer/process_record_test.rb
324
+ - test/indexer/process_with_test.rb
264
325
  - test/indexer/read_write_test.rb
265
326
  - test/indexer/settings_test.rb
266
327
  - test/indexer/to_field_test.rb
@@ -268,6 +329,8 @@ files:
268
329
  - test/marc_extractor_test.rb
269
330
  - test/marc_format_classifier_test.rb
270
331
  - test/marc_reader_test.rb
332
+ - test/nokogiri_reader_test.rb
333
+ - test/oai_pmh_nokogiri_reader_test.rb
271
334
  - test/solr_json_writer_test.rb
272
335
  - test/test_helper.rb
273
336
  - test/test_support/245_no_ab.marc
@@ -289,15 +352,22 @@ files:
289
352
  - test/test_support/multi_era.marc
290
353
  - test/test_support/multi_geo.marc
291
354
  - test/test_support/musical_cage.marc
355
+ - test/test_support/namespace-test.xml
292
356
  - test/test_support/nature.marc
357
+ - test/test_support/nokogiri_demo_config.rb
358
+ - test/test_support/oai-pmh-one-record-2.xml
359
+ - test/test_support/oai-pmh-one-record-first.xml
293
360
  - test/test_support/one-marc8.mrc
294
361
  - test/test_support/online_only.marc
295
362
  - test/test_support/packed_041a_lang.marc
363
+ - test/test_support/sample-oai-no-namespace.xml
364
+ - test/test_support/sample-oai-pmh.xml
296
365
  - test/test_support/test_data.utf8.json
297
366
  - test/test_support/test_data.utf8.marc.xml
298
367
  - test/test_support/test_data.utf8.mrc
299
368
  - test/test_support/test_data.utf8.mrc.gz
300
369
  - test/test_support/the_business_ren.marc
370
+ - test/thread_pool_test.rb
301
371
  - test/translation_map_test.rb
302
372
  - test/translation_maps/bad_ruby.rb
303
373
  - test/translation_maps/bad_yaml.yaml
@@ -326,20 +396,23 @@ required_ruby_version: !ruby/object:Gem::Requirement
326
396
  version: '0'
327
397
  required_rubygems_version: !ruby/object:Gem::Requirement
328
398
  requirements:
329
- - - ">="
399
+ - - ">"
330
400
  - !ruby/object:Gem::Version
331
- version: '0'
401
+ version: 1.3.1
332
402
  requirements: []
333
403
  rubyforge_project:
334
- rubygems_version: 2.6.13
404
+ rubygems_version: 2.7.7
335
405
  signing_key:
336
406
  specification_version: 4
337
- summary: Index MARC to Solr; or generally process source records to hash-like structures
407
+ summary: An easy to use, high-performance, flexible and extensible metadata transformation
408
+ system, focused on library-archives-museums input, and indexing to Solr as output.
338
409
  test_files:
339
410
  - test/debug_writer_test.rb
340
411
  - test/delimited_writer_test.rb
412
+ - test/experimental_nokogiri_streaming_reader_test.rb
341
413
  - test/indexer/context_test.rb
342
414
  - test/indexer/each_record_test.rb
415
+ - test/indexer/error_handler_test.rb
343
416
  - test/indexer/load_config_file_test.rb
344
417
  - test/indexer/macros/macros_marc21_semantics_test.rb
345
418
  - test/indexer/macros/marc21/extract_all_marc_values_test.rb
@@ -347,7 +420,11 @@ test_files:
347
420
  - test/indexer/macros/marc21/serialize_marc_test.rb
348
421
  - test/indexer/macros/marc21/trim_punctuation_test.rb
349
422
  - test/indexer/macros/to_field_test.rb
423
+ - test/indexer/macros/transformation_test.rb
350
424
  - test/indexer/map_record_test.rb
425
+ - test/indexer/nokogiri_indexer_test.rb
426
+ - test/indexer/process_record_test.rb
427
+ - test/indexer/process_with_test.rb
351
428
  - test/indexer/read_write_test.rb
352
429
  - test/indexer/settings_test.rb
353
430
  - test/indexer/to_field_test.rb
@@ -355,6 +432,8 @@ test_files:
355
432
  - test/marc_extractor_test.rb
356
433
  - test/marc_format_classifier_test.rb
357
434
  - test/marc_reader_test.rb
435
+ - test/nokogiri_reader_test.rb
436
+ - test/oai_pmh_nokogiri_reader_test.rb
358
437
  - test/solr_json_writer_test.rb
359
438
  - test/test_helper.rb
360
439
  - test/test_support/245_no_ab.marc
@@ -376,15 +455,22 @@ test_files:
376
455
  - test/test_support/multi_era.marc
377
456
  - test/test_support/multi_geo.marc
378
457
  - test/test_support/musical_cage.marc
458
+ - test/test_support/namespace-test.xml
379
459
  - test/test_support/nature.marc
460
+ - test/test_support/nokogiri_demo_config.rb
461
+ - test/test_support/oai-pmh-one-record-2.xml
462
+ - test/test_support/oai-pmh-one-record-first.xml
380
463
  - test/test_support/one-marc8.mrc
381
464
  - test/test_support/online_only.marc
382
465
  - test/test_support/packed_041a_lang.marc
466
+ - test/test_support/sample-oai-no-namespace.xml
467
+ - test/test_support/sample-oai-pmh.xml
383
468
  - test/test_support/test_data.utf8.json
384
469
  - test/test_support/test_data.utf8.marc.xml
385
470
  - test/test_support/test_data.utf8.mrc
386
471
  - test/test_support/test_data.utf8.mrc.gz
387
472
  - test/test_support/the_business_ren.marc
473
+ - test/thread_pool_test.rb
388
474
  - test/translation_map_test.rb
389
475
  - test/translation_maps/bad_ruby.rb
390
476
  - test/translation_maps/bad_yaml.yaml