traject 2.3.4 → 3.0.0.alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +16 -9
- data/CHANGES.md +74 -1
- data/Gemfile +2 -1
- data/README.md +104 -53
- data/Rakefile +8 -1
- data/doc/indexing_rules.md +79 -63
- data/doc/programmatic_use.md +218 -0
- data/doc/settings.md +28 -1
- data/doc/xml.md +134 -0
- data/lib/traject.rb +5 -0
- data/lib/traject/array_writer.rb +34 -0
- data/lib/traject/command_line.rb +18 -22
- data/lib/traject/debug_writer.rb +2 -5
- data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
- data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
- data/lib/traject/indexer.rb +321 -92
- data/lib/traject/indexer/context.rb +39 -13
- data/lib/traject/indexer/marc_indexer.rb +30 -0
- data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
- data/lib/traject/indexer/settings.rb +36 -53
- data/lib/traject/indexer/step.rb +27 -33
- data/lib/traject/macros/marc21.rb +37 -12
- data/lib/traject/macros/nokogiri_macros.rb +43 -0
- data/lib/traject/macros/transformation.rb +162 -0
- data/lib/traject/marc_extractor.rb +2 -0
- data/lib/traject/ndj_reader.rb +1 -1
- data/lib/traject/nokogiri_reader.rb +179 -0
- data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
- data/lib/traject/solr_json_writer.rb +19 -12
- data/lib/traject/thread_pool.rb +13 -0
- data/lib/traject/util.rb +14 -2
- data/lib/traject/version.rb +1 -1
- data/test/debug_writer_test.rb +3 -3
- data/test/delimited_writer_test.rb +3 -3
- data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
- data/test/indexer/context_test.rb +23 -13
- data/test/indexer/error_handler_test.rb +59 -0
- data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
- data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
- data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
- data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
- data/test/indexer/macros/to_field_test.rb +2 -2
- data/test/indexer/macros/transformation_test.rb +177 -0
- data/test/indexer/map_record_test.rb +2 -3
- data/test/indexer/nokogiri_indexer_test.rb +103 -0
- data/test/indexer/process_record_test.rb +55 -0
- data/test/indexer/process_with_test.rb +148 -0
- data/test/indexer/read_write_test.rb +52 -2
- data/test/indexer/settings_test.rb +34 -24
- data/test/indexer/to_field_test.rb +27 -2
- data/test/marc_extractor_test.rb +7 -7
- data/test/marc_reader_test.rb +4 -4
- data/test/nokogiri_reader_test.rb +158 -0
- data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
- data/test/solr_json_writer_test.rb +24 -28
- data/test/test_helper.rb +8 -2
- data/test/test_support/namespace-test.xml +7 -0
- data/test/test_support/nokogiri_demo_config.rb +17 -0
- data/test/test_support/oai-pmh-one-record-2.xml +24 -0
- data/test/test_support/oai-pmh-one-record-first.xml +24 -0
- data/test/test_support/sample-oai-no-namespace.xml +197 -0
- data/test/test_support/sample-oai-pmh.xml +197 -0
- data/test/thread_pool_test.rb +38 -0
- data/test/translation_map_test.rb +3 -3
- data/test/translation_maps/ruby_map.rb +2 -1
- data/test/translation_maps/yaml_map.yaml +2 -1
- data/traject.gemspec +4 -11
- metadata +92 -6
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
require 'rspec/mocks'
|
3
|
+
|
4
|
+
# MOST of ThreadPool is not tested directly at this point.
|
5
|
+
describe "Traject::ThreadPool" do
|
6
|
+
include ::RSpec::Mocks::ExampleMethods
|
7
|
+
|
8
|
+
# http://blog.plataformatec.com.br/2015/05/nobody-told-me-minitest-was-this-fun/
|
9
|
+
def before_setup
|
10
|
+
::RSpec::Mocks.setup
|
11
|
+
super
|
12
|
+
end
|
13
|
+
|
14
|
+
def after_teardown
|
15
|
+
super
|
16
|
+
::RSpec::Mocks.verify
|
17
|
+
ensure
|
18
|
+
::RSpec::Mocks.teardown
|
19
|
+
end
|
20
|
+
|
21
|
+
|
22
|
+
describe "disable_concurrency!" do
|
23
|
+
|
24
|
+
it "disables concurrency" do
|
25
|
+
allow(Traject::ThreadPool).to receive(:concurrency_disabled?).and_return(true)
|
26
|
+
|
27
|
+
parent_thread_id = Thread.current.object_id
|
28
|
+
|
29
|
+
work_thread_id = Concurrent::AtomicFixnum.new
|
30
|
+
|
31
|
+
Traject::ThreadPool.new(10).maybe_in_thread_pool do
|
32
|
+
work_thread_id.update { Thread.current.object_id }
|
33
|
+
end
|
34
|
+
|
35
|
+
assert_equal parent_thread_id, work_thread_id.value
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -57,7 +57,7 @@ describe "TranslationMap" do
|
|
57
57
|
|
58
58
|
it "raises on syntax error in yaml" do
|
59
59
|
exception = assert_raises(Psych::SyntaxError) do
|
60
|
-
|
60
|
+
_found = @cache.lookup("bad_yaml")
|
61
61
|
end
|
62
62
|
|
63
63
|
assert exception.message.include?("bad_yaml.yaml"), "exception message includes source file"
|
@@ -65,7 +65,7 @@ describe "TranslationMap" do
|
|
65
65
|
|
66
66
|
it "raises on syntax error in ruby" do
|
67
67
|
exception = assert_raises(SyntaxError) do
|
68
|
-
|
68
|
+
_found = @cache.lookup("bad_ruby")
|
69
69
|
end
|
70
70
|
assert exception.message.include?("bad_ruby.rb"), "exception message includes source file"
|
71
71
|
end
|
@@ -118,7 +118,7 @@ describe "TranslationMap" do
|
|
118
118
|
|
119
119
|
assert_equal "DEFAULT LITERAL", map["not in the map"]
|
120
120
|
end
|
121
|
-
|
121
|
+
|
122
122
|
it "respects __default__ __passthrough__" do
|
123
123
|
map = Traject::TranslationMap.new("default_passthrough")
|
124
124
|
|
data/traject.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.version = Traject::VERSION
|
9
9
|
spec.authors = ["Jonathan Rochkind", "Bill Dueber"]
|
10
10
|
spec.email = ["none@nowhere.org"]
|
11
|
-
spec.summary = %q{
|
11
|
+
spec.summary = %q{An easy to use, high-performance, flexible and extensible metadata transformation system, focused on library-archives-museums input, and indexing to Solr as output.}
|
12
12
|
spec.homepage = "http://github.com/traject/traject"
|
13
13
|
spec.license = "MIT"
|
14
14
|
|
@@ -28,20 +28,13 @@ Gem::Specification.new do |spec|
|
|
28
28
|
spec.add_dependency "yell" # logging
|
29
29
|
spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
|
30
30
|
spec.add_dependency "httpclient", "~> 2.5"
|
31
|
+
spec.add_dependency "http", "~> 3.0" # used in oai_pmh_reader, may use more extensively in future instead of httpclient
|
31
32
|
spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
|
32
|
-
|
33
|
-
# If we're building the package under JRuby, add in the
|
34
|
-
# jruby-only gems and specify the platform.
|
35
|
-
|
36
|
-
if defined? JRUBY_VERSION
|
37
|
-
spec.platform = 'java'
|
38
|
-
spec.add_dependency "traject-marc4j_reader", "~> 1.0"
|
39
|
-
else
|
40
|
-
spec.platform = "ruby"
|
41
|
-
end
|
33
|
+
spec.add_dependency "nokogiri", "~> 1.0" # NokogiriIndexer
|
42
34
|
|
43
35
|
spec.add_development_dependency "bundler", '~> 1.7'
|
44
36
|
|
45
37
|
spec.add_development_dependency "rake"
|
46
38
|
spec.add_development_dependency "minitest"
|
39
|
+
spec.add_development_dependency "rspec-mocks", '~> 3.4'
|
47
40
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0.alpha.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Rochkind
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2018-08-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: concurrent-ruby
|
@@ -115,6 +115,20 @@ dependencies:
|
|
115
115
|
- - "~>"
|
116
116
|
- !ruby/object:Gem::Version
|
117
117
|
version: '2.5'
|
118
|
+
- !ruby/object:Gem::Dependency
|
119
|
+
name: http
|
120
|
+
requirement: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '3.0'
|
125
|
+
type: :runtime
|
126
|
+
prerelease: false
|
127
|
+
version_requirements: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '3.0'
|
118
132
|
- !ruby/object:Gem::Dependency
|
119
133
|
name: marc-fastxmlwriter
|
120
134
|
requirement: !ruby/object:Gem::Requirement
|
@@ -129,6 +143,20 @@ dependencies:
|
|
129
143
|
- - "~>"
|
130
144
|
- !ruby/object:Gem::Version
|
131
145
|
version: '1.0'
|
146
|
+
- !ruby/object:Gem::Dependency
|
147
|
+
name: nokogiri
|
148
|
+
requirement: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '1.0'
|
153
|
+
type: :runtime
|
154
|
+
prerelease: false
|
155
|
+
version_requirements: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '1.0'
|
132
160
|
- !ruby/object:Gem::Dependency
|
133
161
|
name: bundler
|
134
162
|
requirement: !ruby/object:Gem::Requirement
|
@@ -171,6 +199,20 @@ dependencies:
|
|
171
199
|
- - ">="
|
172
200
|
- !ruby/object:Gem::Version
|
173
201
|
version: '0'
|
202
|
+
- !ruby/object:Gem::Dependency
|
203
|
+
name: rspec-mocks
|
204
|
+
requirement: !ruby/object:Gem::Requirement
|
205
|
+
requirements:
|
206
|
+
- - "~>"
|
207
|
+
- !ruby/object:Gem::Version
|
208
|
+
version: '3.4'
|
209
|
+
type: :development
|
210
|
+
prerelease: false
|
211
|
+
version_requirements: !ruby/object:Gem::Requirement
|
212
|
+
requirements:
|
213
|
+
- - "~>"
|
214
|
+
- !ruby/object:Gem::Version
|
215
|
+
version: '3.4'
|
174
216
|
description:
|
175
217
|
email:
|
176
218
|
- none@nowhere.org
|
@@ -182,7 +224,9 @@ extra_rdoc_files:
|
|
182
224
|
- doc/extending.md
|
183
225
|
- doc/indexing_rules.md
|
184
226
|
- doc/other_commands.md
|
227
|
+
- doc/programmatic_use.md
|
185
228
|
- doc/settings.md
|
229
|
+
- doc/xml.md
|
186
230
|
files:
|
187
231
|
- ".gitignore"
|
188
232
|
- ".travis.yml"
|
@@ -198,7 +242,9 @@ files:
|
|
198
242
|
- doc/extending.md
|
199
243
|
- doc/indexing_rules.md
|
200
244
|
- doc/other_commands.md
|
245
|
+
- doc/programmatic_use.md
|
201
246
|
- doc/settings.md
|
247
|
+
- doc/xml.md
|
202
248
|
- index_bench/batch.dat
|
203
249
|
- index_bench/common.rb
|
204
250
|
- index_bench/index_bench.rb
|
@@ -216,12 +262,17 @@ files:
|
|
216
262
|
- index_bench/translation_maps/umich/obsolete_cop.yaml
|
217
263
|
- lib/tasks/load_maps.rake
|
218
264
|
- lib/traject.rb
|
265
|
+
- lib/traject/array_writer.rb
|
219
266
|
- lib/traject/command_line.rb
|
220
267
|
- lib/traject/csv_writer.rb
|
221
268
|
- lib/traject/debug_writer.rb
|
222
269
|
- lib/traject/delimited_writer.rb
|
270
|
+
- lib/traject/experimental_nokogiri_streaming_reader.rb
|
271
|
+
- lib/traject/hashie/indifferent_access_fix.rb
|
223
272
|
- lib/traject/indexer.rb
|
224
273
|
- lib/traject/indexer/context.rb
|
274
|
+
- lib/traject/indexer/marc_indexer.rb
|
275
|
+
- lib/traject/indexer/nokogiri_indexer.rb
|
225
276
|
- lib/traject/indexer/settings.rb
|
226
277
|
- lib/traject/indexer/step.rb
|
227
278
|
- lib/traject/json_writer.rb
|
@@ -230,12 +281,16 @@ files:
|
|
230
281
|
- lib/traject/macros/marc21.rb
|
231
282
|
- lib/traject/macros/marc21_semantics.rb
|
232
283
|
- lib/traject/macros/marc_format_classifier.rb
|
284
|
+
- lib/traject/macros/nokogiri_macros.rb
|
285
|
+
- lib/traject/macros/transformation.rb
|
233
286
|
- lib/traject/marc_extractor.rb
|
234
287
|
- lib/traject/marc_extractor_spec.rb
|
235
288
|
- lib/traject/marc_reader.rb
|
236
289
|
- lib/traject/mock_reader.rb
|
237
290
|
- lib/traject/ndj_reader.rb
|
291
|
+
- lib/traject/nokogiri_reader.rb
|
238
292
|
- lib/traject/null_writer.rb
|
293
|
+
- lib/traject/oai_pmh_nokogiri_reader.rb
|
239
294
|
- lib/traject/qualified_const_get.rb
|
240
295
|
- lib/traject/solr_json_writer.rb
|
241
296
|
- lib/traject/thread_pool.rb
|
@@ -251,8 +306,10 @@ files:
|
|
251
306
|
- lib/translation_maps/marc_languages.yaml
|
252
307
|
- test/debug_writer_test.rb
|
253
308
|
- test/delimited_writer_test.rb
|
309
|
+
- test/experimental_nokogiri_streaming_reader_test.rb
|
254
310
|
- test/indexer/context_test.rb
|
255
311
|
- test/indexer/each_record_test.rb
|
312
|
+
- test/indexer/error_handler_test.rb
|
256
313
|
- test/indexer/load_config_file_test.rb
|
257
314
|
- test/indexer/macros/macros_marc21_semantics_test.rb
|
258
315
|
- test/indexer/macros/marc21/extract_all_marc_values_test.rb
|
@@ -260,7 +317,11 @@ files:
|
|
260
317
|
- test/indexer/macros/marc21/serialize_marc_test.rb
|
261
318
|
- test/indexer/macros/marc21/trim_punctuation_test.rb
|
262
319
|
- test/indexer/macros/to_field_test.rb
|
320
|
+
- test/indexer/macros/transformation_test.rb
|
263
321
|
- test/indexer/map_record_test.rb
|
322
|
+
- test/indexer/nokogiri_indexer_test.rb
|
323
|
+
- test/indexer/process_record_test.rb
|
324
|
+
- test/indexer/process_with_test.rb
|
264
325
|
- test/indexer/read_write_test.rb
|
265
326
|
- test/indexer/settings_test.rb
|
266
327
|
- test/indexer/to_field_test.rb
|
@@ -268,6 +329,8 @@ files:
|
|
268
329
|
- test/marc_extractor_test.rb
|
269
330
|
- test/marc_format_classifier_test.rb
|
270
331
|
- test/marc_reader_test.rb
|
332
|
+
- test/nokogiri_reader_test.rb
|
333
|
+
- test/oai_pmh_nokogiri_reader_test.rb
|
271
334
|
- test/solr_json_writer_test.rb
|
272
335
|
- test/test_helper.rb
|
273
336
|
- test/test_support/245_no_ab.marc
|
@@ -289,15 +352,22 @@ files:
|
|
289
352
|
- test/test_support/multi_era.marc
|
290
353
|
- test/test_support/multi_geo.marc
|
291
354
|
- test/test_support/musical_cage.marc
|
355
|
+
- test/test_support/namespace-test.xml
|
292
356
|
- test/test_support/nature.marc
|
357
|
+
- test/test_support/nokogiri_demo_config.rb
|
358
|
+
- test/test_support/oai-pmh-one-record-2.xml
|
359
|
+
- test/test_support/oai-pmh-one-record-first.xml
|
293
360
|
- test/test_support/one-marc8.mrc
|
294
361
|
- test/test_support/online_only.marc
|
295
362
|
- test/test_support/packed_041a_lang.marc
|
363
|
+
- test/test_support/sample-oai-no-namespace.xml
|
364
|
+
- test/test_support/sample-oai-pmh.xml
|
296
365
|
- test/test_support/test_data.utf8.json
|
297
366
|
- test/test_support/test_data.utf8.marc.xml
|
298
367
|
- test/test_support/test_data.utf8.mrc
|
299
368
|
- test/test_support/test_data.utf8.mrc.gz
|
300
369
|
- test/test_support/the_business_ren.marc
|
370
|
+
- test/thread_pool_test.rb
|
301
371
|
- test/translation_map_test.rb
|
302
372
|
- test/translation_maps/bad_ruby.rb
|
303
373
|
- test/translation_maps/bad_yaml.yaml
|
@@ -326,20 +396,23 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
326
396
|
version: '0'
|
327
397
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
328
398
|
requirements:
|
329
|
-
- - "
|
399
|
+
- - ">"
|
330
400
|
- !ruby/object:Gem::Version
|
331
|
-
version:
|
401
|
+
version: 1.3.1
|
332
402
|
requirements: []
|
333
403
|
rubyforge_project:
|
334
|
-
rubygems_version: 2.
|
404
|
+
rubygems_version: 2.7.7
|
335
405
|
signing_key:
|
336
406
|
specification_version: 4
|
337
|
-
summary:
|
407
|
+
summary: An easy to use, high-performance, flexible and extensible metadata transformation
|
408
|
+
system, focused on library-archives-museums input, and indexing to Solr as output.
|
338
409
|
test_files:
|
339
410
|
- test/debug_writer_test.rb
|
340
411
|
- test/delimited_writer_test.rb
|
412
|
+
- test/experimental_nokogiri_streaming_reader_test.rb
|
341
413
|
- test/indexer/context_test.rb
|
342
414
|
- test/indexer/each_record_test.rb
|
415
|
+
- test/indexer/error_handler_test.rb
|
343
416
|
- test/indexer/load_config_file_test.rb
|
344
417
|
- test/indexer/macros/macros_marc21_semantics_test.rb
|
345
418
|
- test/indexer/macros/marc21/extract_all_marc_values_test.rb
|
@@ -347,7 +420,11 @@ test_files:
|
|
347
420
|
- test/indexer/macros/marc21/serialize_marc_test.rb
|
348
421
|
- test/indexer/macros/marc21/trim_punctuation_test.rb
|
349
422
|
- test/indexer/macros/to_field_test.rb
|
423
|
+
- test/indexer/macros/transformation_test.rb
|
350
424
|
- test/indexer/map_record_test.rb
|
425
|
+
- test/indexer/nokogiri_indexer_test.rb
|
426
|
+
- test/indexer/process_record_test.rb
|
427
|
+
- test/indexer/process_with_test.rb
|
351
428
|
- test/indexer/read_write_test.rb
|
352
429
|
- test/indexer/settings_test.rb
|
353
430
|
- test/indexer/to_field_test.rb
|
@@ -355,6 +432,8 @@ test_files:
|
|
355
432
|
- test/marc_extractor_test.rb
|
356
433
|
- test/marc_format_classifier_test.rb
|
357
434
|
- test/marc_reader_test.rb
|
435
|
+
- test/nokogiri_reader_test.rb
|
436
|
+
- test/oai_pmh_nokogiri_reader_test.rb
|
358
437
|
- test/solr_json_writer_test.rb
|
359
438
|
- test/test_helper.rb
|
360
439
|
- test/test_support/245_no_ab.marc
|
@@ -376,15 +455,22 @@ test_files:
|
|
376
455
|
- test/test_support/multi_era.marc
|
377
456
|
- test/test_support/multi_geo.marc
|
378
457
|
- test/test_support/musical_cage.marc
|
458
|
+
- test/test_support/namespace-test.xml
|
379
459
|
- test/test_support/nature.marc
|
460
|
+
- test/test_support/nokogiri_demo_config.rb
|
461
|
+
- test/test_support/oai-pmh-one-record-2.xml
|
462
|
+
- test/test_support/oai-pmh-one-record-first.xml
|
380
463
|
- test/test_support/one-marc8.mrc
|
381
464
|
- test/test_support/online_only.marc
|
382
465
|
- test/test_support/packed_041a_lang.marc
|
466
|
+
- test/test_support/sample-oai-no-namespace.xml
|
467
|
+
- test/test_support/sample-oai-pmh.xml
|
383
468
|
- test/test_support/test_data.utf8.json
|
384
469
|
- test/test_support/test_data.utf8.marc.xml
|
385
470
|
- test/test_support/test_data.utf8.mrc
|
386
471
|
- test/test_support/test_data.utf8.mrc.gz
|
387
472
|
- test/test_support/the_business_ren.marc
|
473
|
+
- test/thread_pool_test.rb
|
388
474
|
- test/translation_map_test.rb
|
389
475
|
- test/translation_maps/bad_ruby.rb
|
390
476
|
- test/translation_maps/bad_yaml.yaml
|