traject 2.3.4 → 3.0.0.alpha.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +16 -9
  3. data/CHANGES.md +74 -1
  4. data/Gemfile +2 -1
  5. data/README.md +104 -53
  6. data/Rakefile +8 -1
  7. data/doc/indexing_rules.md +79 -63
  8. data/doc/programmatic_use.md +218 -0
  9. data/doc/settings.md +28 -1
  10. data/doc/xml.md +134 -0
  11. data/lib/traject.rb +5 -0
  12. data/lib/traject/array_writer.rb +34 -0
  13. data/lib/traject/command_line.rb +18 -22
  14. data/lib/traject/debug_writer.rb +2 -5
  15. data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
  16. data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
  17. data/lib/traject/indexer.rb +321 -92
  18. data/lib/traject/indexer/context.rb +39 -13
  19. data/lib/traject/indexer/marc_indexer.rb +30 -0
  20. data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
  21. data/lib/traject/indexer/settings.rb +36 -53
  22. data/lib/traject/indexer/step.rb +27 -33
  23. data/lib/traject/macros/marc21.rb +37 -12
  24. data/lib/traject/macros/nokogiri_macros.rb +43 -0
  25. data/lib/traject/macros/transformation.rb +162 -0
  26. data/lib/traject/marc_extractor.rb +2 -0
  27. data/lib/traject/ndj_reader.rb +1 -1
  28. data/lib/traject/nokogiri_reader.rb +179 -0
  29. data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
  30. data/lib/traject/solr_json_writer.rb +19 -12
  31. data/lib/traject/thread_pool.rb +13 -0
  32. data/lib/traject/util.rb +14 -2
  33. data/lib/traject/version.rb +1 -1
  34. data/test/debug_writer_test.rb +3 -3
  35. data/test/delimited_writer_test.rb +3 -3
  36. data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
  37. data/test/indexer/context_test.rb +23 -13
  38. data/test/indexer/error_handler_test.rb +59 -0
  39. data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
  40. data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
  41. data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
  42. data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
  43. data/test/indexer/macros/to_field_test.rb +2 -2
  44. data/test/indexer/macros/transformation_test.rb +177 -0
  45. data/test/indexer/map_record_test.rb +2 -3
  46. data/test/indexer/nokogiri_indexer_test.rb +103 -0
  47. data/test/indexer/process_record_test.rb +55 -0
  48. data/test/indexer/process_with_test.rb +148 -0
  49. data/test/indexer/read_write_test.rb +52 -2
  50. data/test/indexer/settings_test.rb +34 -24
  51. data/test/indexer/to_field_test.rb +27 -2
  52. data/test/marc_extractor_test.rb +7 -7
  53. data/test/marc_reader_test.rb +4 -4
  54. data/test/nokogiri_reader_test.rb +158 -0
  55. data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
  56. data/test/solr_json_writer_test.rb +24 -28
  57. data/test/test_helper.rb +8 -2
  58. data/test/test_support/namespace-test.xml +7 -0
  59. data/test/test_support/nokogiri_demo_config.rb +17 -0
  60. data/test/test_support/oai-pmh-one-record-2.xml +24 -0
  61. data/test/test_support/oai-pmh-one-record-first.xml +24 -0
  62. data/test/test_support/sample-oai-no-namespace.xml +197 -0
  63. data/test/test_support/sample-oai-pmh.xml +197 -0
  64. data/test/thread_pool_test.rb +38 -0
  65. data/test/translation_map_test.rb +3 -3
  66. data/test/translation_maps/ruby_map.rb +2 -1
  67. data/test/translation_maps/yaml_map.yaml +2 -1
  68. data/traject.gemspec +4 -11
  69. metadata +92 -6
@@ -0,0 +1,38 @@
1
+ require 'test_helper'
2
+ require 'rspec/mocks'
3
+
4
+ # MOST of ThreadPool is not tested directly at this point.
5
+ describe "Traject::ThreadPool" do
6
+ include ::RSpec::Mocks::ExampleMethods
7
+
8
+ # http://blog.plataformatec.com.br/2015/05/nobody-told-me-minitest-was-this-fun/
9
+ def before_setup
10
+ ::RSpec::Mocks.setup
11
+ super
12
+ end
13
+
14
+ def after_teardown
15
+ super
16
+ ::RSpec::Mocks.verify
17
+ ensure
18
+ ::RSpec::Mocks.teardown
19
+ end
20
+
21
+
22
+ describe "disable_concurrency!" do
23
+
24
+ it "disables concurrency" do
25
+ allow(Traject::ThreadPool).to receive(:concurrency_disabled?).and_return(true)
26
+
27
+ parent_thread_id = Thread.current.object_id
28
+
29
+ work_thread_id = Concurrent::AtomicFixnum.new
30
+
31
+ Traject::ThreadPool.new(10).maybe_in_thread_pool do
32
+ work_thread_id.update { Thread.current.object_id }
33
+ end
34
+
35
+ assert_equal parent_thread_id, work_thread_id.value
36
+ end
37
+ end
38
+ end
@@ -57,7 +57,7 @@ describe "TranslationMap" do
57
57
 
58
58
  it "raises on syntax error in yaml" do
59
59
  exception = assert_raises(Psych::SyntaxError) do
60
- found = @cache.lookup("bad_yaml")
60
+ _found = @cache.lookup("bad_yaml")
61
61
  end
62
62
 
63
63
  assert exception.message.include?("bad_yaml.yaml"), "exception message includes source file"
@@ -65,7 +65,7 @@ describe "TranslationMap" do
65
65
 
66
66
  it "raises on syntax error in ruby" do
67
67
  exception = assert_raises(SyntaxError) do
68
- found = @cache.lookup("bad_ruby")
68
+ _found = @cache.lookup("bad_ruby")
69
69
  end
70
70
  assert exception.message.include?("bad_ruby.rb"), "exception message includes source file"
71
71
  end
@@ -118,7 +118,7 @@ describe "TranslationMap" do
118
118
 
119
119
  assert_equal "DEFAULT LITERAL", map["not in the map"]
120
120
  end
121
-
121
+
122
122
  it "respects __default__ __passthrough__" do
123
123
  map = Traject::TranslationMap.new("default_passthrough")
124
124
 
@@ -2,7 +2,8 @@ foo = "bar"
2
2
 
3
3
  some_hash = {
4
4
  "key1" => "value1",
5
- "array_key" => %w{one two three}
5
+ "array_key" => %w{one two three},
6
+ "key_to_be_overridden" => "value_from_ruby"
6
7
  }
7
8
  some_hash["also"] = "this"
8
9
 
@@ -1,7 +1,8 @@
1
1
  key1: value1
2
+ key_to_be_overridden: value_from_yaml
2
3
  array_key:
3
4
  -one
4
5
  -two
5
6
  -three
6
7
  # comment
7
- other: yes
8
+ other: yes
@@ -8,7 +8,7 @@ Gem::Specification.new do |spec|
8
8
  spec.version = Traject::VERSION
9
9
  spec.authors = ["Jonathan Rochkind", "Bill Dueber"]
10
10
  spec.email = ["none@nowhere.org"]
11
- spec.summary = %q{Index MARC to Solr; or generally process source records to hash-like structures}
11
+ spec.summary = %q{An easy to use, high-performance, flexible and extensible metadata transformation system, focused on library-archives-museums input, and indexing to Solr as output.}
12
12
  spec.homepage = "http://github.com/traject/traject"
13
13
  spec.license = "MIT"
14
14
 
@@ -28,20 +28,13 @@ Gem::Specification.new do |spec|
28
28
  spec.add_dependency "yell" # logging
29
29
  spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
30
30
  spec.add_dependency "httpclient", "~> 2.5"
31
+ spec.add_dependency "http", "~> 3.0" # used in oai_pmh_reader, may use more extensively in future instead of httpclient
31
32
  spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
32
-
33
- # If we're building the package under JRuby, add in the
34
- # jruby-only gems and specify the platform.
35
-
36
- if defined? JRUBY_VERSION
37
- spec.platform = 'java'
38
- spec.add_dependency "traject-marc4j_reader", "~> 1.0"
39
- else
40
- spec.platform = "ruby"
41
- end
33
+ spec.add_dependency "nokogiri", "~> 1.0" # NokogiriIndexer
42
34
 
43
35
  spec.add_development_dependency "bundler", '~> 1.7'
44
36
 
45
37
  spec.add_development_dependency "rake"
46
38
  spec.add_development_dependency "minitest"
39
+ spec.add_development_dependency "rspec-mocks", '~> 3.4'
47
40
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.4
4
+ version: 3.0.0.alpha.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Rochkind
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-12-06 00:00:00.000000000 Z
12
+ date: 2018-08-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: concurrent-ruby
@@ -115,6 +115,20 @@ dependencies:
115
115
  - - "~>"
116
116
  - !ruby/object:Gem::Version
117
117
  version: '2.5'
118
+ - !ruby/object:Gem::Dependency
119
+ name: http
120
+ requirement: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '3.0'
125
+ type: :runtime
126
+ prerelease: false
127
+ version_requirements: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '3.0'
118
132
  - !ruby/object:Gem::Dependency
119
133
  name: marc-fastxmlwriter
120
134
  requirement: !ruby/object:Gem::Requirement
@@ -129,6 +143,20 @@ dependencies:
129
143
  - - "~>"
130
144
  - !ruby/object:Gem::Version
131
145
  version: '1.0'
146
+ - !ruby/object:Gem::Dependency
147
+ name: nokogiri
148
+ requirement: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '1.0'
153
+ type: :runtime
154
+ prerelease: false
155
+ version_requirements: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '1.0'
132
160
  - !ruby/object:Gem::Dependency
133
161
  name: bundler
134
162
  requirement: !ruby/object:Gem::Requirement
@@ -171,6 +199,20 @@ dependencies:
171
199
  - - ">="
172
200
  - !ruby/object:Gem::Version
173
201
  version: '0'
202
+ - !ruby/object:Gem::Dependency
203
+ name: rspec-mocks
204
+ requirement: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - "~>"
207
+ - !ruby/object:Gem::Version
208
+ version: '3.4'
209
+ type: :development
210
+ prerelease: false
211
+ version_requirements: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - "~>"
214
+ - !ruby/object:Gem::Version
215
+ version: '3.4'
174
216
  description:
175
217
  email:
176
218
  - none@nowhere.org
@@ -182,7 +224,9 @@ extra_rdoc_files:
182
224
  - doc/extending.md
183
225
  - doc/indexing_rules.md
184
226
  - doc/other_commands.md
227
+ - doc/programmatic_use.md
185
228
  - doc/settings.md
229
+ - doc/xml.md
186
230
  files:
187
231
  - ".gitignore"
188
232
  - ".travis.yml"
@@ -198,7 +242,9 @@ files:
198
242
  - doc/extending.md
199
243
  - doc/indexing_rules.md
200
244
  - doc/other_commands.md
245
+ - doc/programmatic_use.md
201
246
  - doc/settings.md
247
+ - doc/xml.md
202
248
  - index_bench/batch.dat
203
249
  - index_bench/common.rb
204
250
  - index_bench/index_bench.rb
@@ -216,12 +262,17 @@ files:
216
262
  - index_bench/translation_maps/umich/obsolete_cop.yaml
217
263
  - lib/tasks/load_maps.rake
218
264
  - lib/traject.rb
265
+ - lib/traject/array_writer.rb
219
266
  - lib/traject/command_line.rb
220
267
  - lib/traject/csv_writer.rb
221
268
  - lib/traject/debug_writer.rb
222
269
  - lib/traject/delimited_writer.rb
270
+ - lib/traject/experimental_nokogiri_streaming_reader.rb
271
+ - lib/traject/hashie/indifferent_access_fix.rb
223
272
  - lib/traject/indexer.rb
224
273
  - lib/traject/indexer/context.rb
274
+ - lib/traject/indexer/marc_indexer.rb
275
+ - lib/traject/indexer/nokogiri_indexer.rb
225
276
  - lib/traject/indexer/settings.rb
226
277
  - lib/traject/indexer/step.rb
227
278
  - lib/traject/json_writer.rb
@@ -230,12 +281,16 @@ files:
230
281
  - lib/traject/macros/marc21.rb
231
282
  - lib/traject/macros/marc21_semantics.rb
232
283
  - lib/traject/macros/marc_format_classifier.rb
284
+ - lib/traject/macros/nokogiri_macros.rb
285
+ - lib/traject/macros/transformation.rb
233
286
  - lib/traject/marc_extractor.rb
234
287
  - lib/traject/marc_extractor_spec.rb
235
288
  - lib/traject/marc_reader.rb
236
289
  - lib/traject/mock_reader.rb
237
290
  - lib/traject/ndj_reader.rb
291
+ - lib/traject/nokogiri_reader.rb
238
292
  - lib/traject/null_writer.rb
293
+ - lib/traject/oai_pmh_nokogiri_reader.rb
239
294
  - lib/traject/qualified_const_get.rb
240
295
  - lib/traject/solr_json_writer.rb
241
296
  - lib/traject/thread_pool.rb
@@ -251,8 +306,10 @@ files:
251
306
  - lib/translation_maps/marc_languages.yaml
252
307
  - test/debug_writer_test.rb
253
308
  - test/delimited_writer_test.rb
309
+ - test/experimental_nokogiri_streaming_reader_test.rb
254
310
  - test/indexer/context_test.rb
255
311
  - test/indexer/each_record_test.rb
312
+ - test/indexer/error_handler_test.rb
256
313
  - test/indexer/load_config_file_test.rb
257
314
  - test/indexer/macros/macros_marc21_semantics_test.rb
258
315
  - test/indexer/macros/marc21/extract_all_marc_values_test.rb
@@ -260,7 +317,11 @@ files:
260
317
  - test/indexer/macros/marc21/serialize_marc_test.rb
261
318
  - test/indexer/macros/marc21/trim_punctuation_test.rb
262
319
  - test/indexer/macros/to_field_test.rb
320
+ - test/indexer/macros/transformation_test.rb
263
321
  - test/indexer/map_record_test.rb
322
+ - test/indexer/nokogiri_indexer_test.rb
323
+ - test/indexer/process_record_test.rb
324
+ - test/indexer/process_with_test.rb
264
325
  - test/indexer/read_write_test.rb
265
326
  - test/indexer/settings_test.rb
266
327
  - test/indexer/to_field_test.rb
@@ -268,6 +329,8 @@ files:
268
329
  - test/marc_extractor_test.rb
269
330
  - test/marc_format_classifier_test.rb
270
331
  - test/marc_reader_test.rb
332
+ - test/nokogiri_reader_test.rb
333
+ - test/oai_pmh_nokogiri_reader_test.rb
271
334
  - test/solr_json_writer_test.rb
272
335
  - test/test_helper.rb
273
336
  - test/test_support/245_no_ab.marc
@@ -289,15 +352,22 @@ files:
289
352
  - test/test_support/multi_era.marc
290
353
  - test/test_support/multi_geo.marc
291
354
  - test/test_support/musical_cage.marc
355
+ - test/test_support/namespace-test.xml
292
356
  - test/test_support/nature.marc
357
+ - test/test_support/nokogiri_demo_config.rb
358
+ - test/test_support/oai-pmh-one-record-2.xml
359
+ - test/test_support/oai-pmh-one-record-first.xml
293
360
  - test/test_support/one-marc8.mrc
294
361
  - test/test_support/online_only.marc
295
362
  - test/test_support/packed_041a_lang.marc
363
+ - test/test_support/sample-oai-no-namespace.xml
364
+ - test/test_support/sample-oai-pmh.xml
296
365
  - test/test_support/test_data.utf8.json
297
366
  - test/test_support/test_data.utf8.marc.xml
298
367
  - test/test_support/test_data.utf8.mrc
299
368
  - test/test_support/test_data.utf8.mrc.gz
300
369
  - test/test_support/the_business_ren.marc
370
+ - test/thread_pool_test.rb
301
371
  - test/translation_map_test.rb
302
372
  - test/translation_maps/bad_ruby.rb
303
373
  - test/translation_maps/bad_yaml.yaml
@@ -326,20 +396,23 @@ required_ruby_version: !ruby/object:Gem::Requirement
326
396
  version: '0'
327
397
  required_rubygems_version: !ruby/object:Gem::Requirement
328
398
  requirements:
329
- - - ">="
399
+ - - ">"
330
400
  - !ruby/object:Gem::Version
331
- version: '0'
401
+ version: 1.3.1
332
402
  requirements: []
333
403
  rubyforge_project:
334
- rubygems_version: 2.6.13
404
+ rubygems_version: 2.7.7
335
405
  signing_key:
336
406
  specification_version: 4
337
- summary: Index MARC to Solr; or generally process source records to hash-like structures
407
+ summary: An easy to use, high-performance, flexible and extensible metadata transformation
408
+ system, focused on library-archives-museums input, and indexing to Solr as output.
338
409
  test_files:
339
410
  - test/debug_writer_test.rb
340
411
  - test/delimited_writer_test.rb
412
+ - test/experimental_nokogiri_streaming_reader_test.rb
341
413
  - test/indexer/context_test.rb
342
414
  - test/indexer/each_record_test.rb
415
+ - test/indexer/error_handler_test.rb
343
416
  - test/indexer/load_config_file_test.rb
344
417
  - test/indexer/macros/macros_marc21_semantics_test.rb
345
418
  - test/indexer/macros/marc21/extract_all_marc_values_test.rb
@@ -347,7 +420,11 @@ test_files:
347
420
  - test/indexer/macros/marc21/serialize_marc_test.rb
348
421
  - test/indexer/macros/marc21/trim_punctuation_test.rb
349
422
  - test/indexer/macros/to_field_test.rb
423
+ - test/indexer/macros/transformation_test.rb
350
424
  - test/indexer/map_record_test.rb
425
+ - test/indexer/nokogiri_indexer_test.rb
426
+ - test/indexer/process_record_test.rb
427
+ - test/indexer/process_with_test.rb
351
428
  - test/indexer/read_write_test.rb
352
429
  - test/indexer/settings_test.rb
353
430
  - test/indexer/to_field_test.rb
@@ -355,6 +432,8 @@ test_files:
355
432
  - test/marc_extractor_test.rb
356
433
  - test/marc_format_classifier_test.rb
357
434
  - test/marc_reader_test.rb
435
+ - test/nokogiri_reader_test.rb
436
+ - test/oai_pmh_nokogiri_reader_test.rb
358
437
  - test/solr_json_writer_test.rb
359
438
  - test/test_helper.rb
360
439
  - test/test_support/245_no_ab.marc
@@ -376,15 +455,22 @@ test_files:
376
455
  - test/test_support/multi_era.marc
377
456
  - test/test_support/multi_geo.marc
378
457
  - test/test_support/musical_cage.marc
458
+ - test/test_support/namespace-test.xml
379
459
  - test/test_support/nature.marc
460
+ - test/test_support/nokogiri_demo_config.rb
461
+ - test/test_support/oai-pmh-one-record-2.xml
462
+ - test/test_support/oai-pmh-one-record-first.xml
380
463
  - test/test_support/one-marc8.mrc
381
464
  - test/test_support/online_only.marc
382
465
  - test/test_support/packed_041a_lang.marc
466
+ - test/test_support/sample-oai-no-namespace.xml
467
+ - test/test_support/sample-oai-pmh.xml
383
468
  - test/test_support/test_data.utf8.json
384
469
  - test/test_support/test_data.utf8.marc.xml
385
470
  - test/test_support/test_data.utf8.mrc
386
471
  - test/test_support/test_data.utf8.mrc.gz
387
472
  - test/test_support/the_business_ren.marc
473
+ - test/thread_pool_test.rb
388
474
  - test/translation_map_test.rb
389
475
  - test/translation_maps/bad_ruby.rb
390
476
  - test/translation_maps/bad_yaml.yaml