traject 2.3.4 → 3.0.0.alpha.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +16 -9
- data/CHANGES.md +74 -1
- data/Gemfile +2 -1
- data/README.md +104 -53
- data/Rakefile +8 -1
- data/doc/indexing_rules.md +79 -63
- data/doc/programmatic_use.md +218 -0
- data/doc/settings.md +28 -1
- data/doc/xml.md +134 -0
- data/lib/traject.rb +5 -0
- data/lib/traject/array_writer.rb +34 -0
- data/lib/traject/command_line.rb +18 -22
- data/lib/traject/debug_writer.rb +2 -5
- data/lib/traject/experimental_nokogiri_streaming_reader.rb +276 -0
- data/lib/traject/hashie/indifferent_access_fix.rb +25 -0
- data/lib/traject/indexer.rb +321 -92
- data/lib/traject/indexer/context.rb +39 -13
- data/lib/traject/indexer/marc_indexer.rb +30 -0
- data/lib/traject/indexer/nokogiri_indexer.rb +30 -0
- data/lib/traject/indexer/settings.rb +36 -53
- data/lib/traject/indexer/step.rb +27 -33
- data/lib/traject/macros/marc21.rb +37 -12
- data/lib/traject/macros/nokogiri_macros.rb +43 -0
- data/lib/traject/macros/transformation.rb +162 -0
- data/lib/traject/marc_extractor.rb +2 -0
- data/lib/traject/ndj_reader.rb +1 -1
- data/lib/traject/nokogiri_reader.rb +179 -0
- data/lib/traject/oai_pmh_nokogiri_reader.rb +159 -0
- data/lib/traject/solr_json_writer.rb +19 -12
- data/lib/traject/thread_pool.rb +13 -0
- data/lib/traject/util.rb +14 -2
- data/lib/traject/version.rb +1 -1
- data/test/debug_writer_test.rb +3 -3
- data/test/delimited_writer_test.rb +3 -3
- data/test/experimental_nokogiri_streaming_reader_test.rb +169 -0
- data/test/indexer/context_test.rb +23 -13
- data/test/indexer/error_handler_test.rb +59 -0
- data/test/indexer/macros/macros_marc21_semantics_test.rb +46 -46
- data/test/indexer/macros/marc21/extract_all_marc_values_test.rb +1 -1
- data/test/indexer/macros/marc21/extract_marc_test.rb +19 -9
- data/test/indexer/macros/marc21/serialize_marc_test.rb +4 -4
- data/test/indexer/macros/to_field_test.rb +2 -2
- data/test/indexer/macros/transformation_test.rb +177 -0
- data/test/indexer/map_record_test.rb +2 -3
- data/test/indexer/nokogiri_indexer_test.rb +103 -0
- data/test/indexer/process_record_test.rb +55 -0
- data/test/indexer/process_with_test.rb +148 -0
- data/test/indexer/read_write_test.rb +52 -2
- data/test/indexer/settings_test.rb +34 -24
- data/test/indexer/to_field_test.rb +27 -2
- data/test/marc_extractor_test.rb +7 -7
- data/test/marc_reader_test.rb +4 -4
- data/test/nokogiri_reader_test.rb +158 -0
- data/test/oai_pmh_nokogiri_reader_test.rb +23 -0
- data/test/solr_json_writer_test.rb +24 -28
- data/test/test_helper.rb +8 -2
- data/test/test_support/namespace-test.xml +7 -0
- data/test/test_support/nokogiri_demo_config.rb +17 -0
- data/test/test_support/oai-pmh-one-record-2.xml +24 -0
- data/test/test_support/oai-pmh-one-record-first.xml +24 -0
- data/test/test_support/sample-oai-no-namespace.xml +197 -0
- data/test/test_support/sample-oai-pmh.xml +197 -0
- data/test/thread_pool_test.rb +38 -0
- data/test/translation_map_test.rb +3 -3
- data/test/translation_maps/ruby_map.rb +2 -1
- data/test/translation_maps/yaml_map.yaml +2 -1
- data/traject.gemspec +4 -11
- metadata +92 -6
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
require 'rspec/mocks'
|
3
|
+
|
4
|
+
# MOST of ThreadPool is not tested directly at this point.
|
5
|
+
describe "Traject::ThreadPool" do
|
6
|
+
include ::RSpec::Mocks::ExampleMethods
|
7
|
+
|
8
|
+
# http://blog.plataformatec.com.br/2015/05/nobody-told-me-minitest-was-this-fun/
|
9
|
+
def before_setup
|
10
|
+
::RSpec::Mocks.setup
|
11
|
+
super
|
12
|
+
end
|
13
|
+
|
14
|
+
def after_teardown
|
15
|
+
super
|
16
|
+
::RSpec::Mocks.verify
|
17
|
+
ensure
|
18
|
+
::RSpec::Mocks.teardown
|
19
|
+
end
|
20
|
+
|
21
|
+
|
22
|
+
describe "disable_concurrency!" do
|
23
|
+
|
24
|
+
it "disables concurrency" do
|
25
|
+
allow(Traject::ThreadPool).to receive(:concurrency_disabled?).and_return(true)
|
26
|
+
|
27
|
+
parent_thread_id = Thread.current.object_id
|
28
|
+
|
29
|
+
work_thread_id = Concurrent::AtomicFixnum.new
|
30
|
+
|
31
|
+
Traject::ThreadPool.new(10).maybe_in_thread_pool do
|
32
|
+
work_thread_id.update { Thread.current.object_id }
|
33
|
+
end
|
34
|
+
|
35
|
+
assert_equal parent_thread_id, work_thread_id.value
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -57,7 +57,7 @@ describe "TranslationMap" do
|
|
57
57
|
|
58
58
|
it "raises on syntax error in yaml" do
|
59
59
|
exception = assert_raises(Psych::SyntaxError) do
|
60
|
-
|
60
|
+
_found = @cache.lookup("bad_yaml")
|
61
61
|
end
|
62
62
|
|
63
63
|
assert exception.message.include?("bad_yaml.yaml"), "exception message includes source file"
|
@@ -65,7 +65,7 @@ describe "TranslationMap" do
|
|
65
65
|
|
66
66
|
it "raises on syntax error in ruby" do
|
67
67
|
exception = assert_raises(SyntaxError) do
|
68
|
-
|
68
|
+
_found = @cache.lookup("bad_ruby")
|
69
69
|
end
|
70
70
|
assert exception.message.include?("bad_ruby.rb"), "exception message includes source file"
|
71
71
|
end
|
@@ -118,7 +118,7 @@ describe "TranslationMap" do
|
|
118
118
|
|
119
119
|
assert_equal "DEFAULT LITERAL", map["not in the map"]
|
120
120
|
end
|
121
|
-
|
121
|
+
|
122
122
|
it "respects __default__ __passthrough__" do
|
123
123
|
map = Traject::TranslationMap.new("default_passthrough")
|
124
124
|
|
data/traject.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.version = Traject::VERSION
|
9
9
|
spec.authors = ["Jonathan Rochkind", "Bill Dueber"]
|
10
10
|
spec.email = ["none@nowhere.org"]
|
11
|
-
spec.summary = %q{
|
11
|
+
spec.summary = %q{An easy to use, high-performance, flexible and extensible metadata transformation system, focused on library-archives-museums input, and indexing to Solr as output.}
|
12
12
|
spec.homepage = "http://github.com/traject/traject"
|
13
13
|
spec.license = "MIT"
|
14
14
|
|
@@ -28,20 +28,13 @@ Gem::Specification.new do |spec|
|
|
28
28
|
spec.add_dependency "yell" # logging
|
29
29
|
spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
|
30
30
|
spec.add_dependency "httpclient", "~> 2.5"
|
31
|
+
spec.add_dependency "http", "~> 3.0" # used in oai_pmh_reader, may use more extensively in future instead of httpclient
|
31
32
|
spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
|
32
|
-
|
33
|
-
# If we're building the package under JRuby, add in the
|
34
|
-
# jruby-only gems and specify the platform.
|
35
|
-
|
36
|
-
if defined? JRUBY_VERSION
|
37
|
-
spec.platform = 'java'
|
38
|
-
spec.add_dependency "traject-marc4j_reader", "~> 1.0"
|
39
|
-
else
|
40
|
-
spec.platform = "ruby"
|
41
|
-
end
|
33
|
+
spec.add_dependency "nokogiri", "~> 1.0" # NokogiriIndexer
|
42
34
|
|
43
35
|
spec.add_development_dependency "bundler", '~> 1.7'
|
44
36
|
|
45
37
|
spec.add_development_dependency "rake"
|
46
38
|
spec.add_development_dependency "minitest"
|
39
|
+
spec.add_development_dependency "rspec-mocks", '~> 3.4'
|
47
40
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0.alpha.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Rochkind
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2018-08-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: concurrent-ruby
|
@@ -115,6 +115,20 @@ dependencies:
|
|
115
115
|
- - "~>"
|
116
116
|
- !ruby/object:Gem::Version
|
117
117
|
version: '2.5'
|
118
|
+
- !ruby/object:Gem::Dependency
|
119
|
+
name: http
|
120
|
+
requirement: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '3.0'
|
125
|
+
type: :runtime
|
126
|
+
prerelease: false
|
127
|
+
version_requirements: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '3.0'
|
118
132
|
- !ruby/object:Gem::Dependency
|
119
133
|
name: marc-fastxmlwriter
|
120
134
|
requirement: !ruby/object:Gem::Requirement
|
@@ -129,6 +143,20 @@ dependencies:
|
|
129
143
|
- - "~>"
|
130
144
|
- !ruby/object:Gem::Version
|
131
145
|
version: '1.0'
|
146
|
+
- !ruby/object:Gem::Dependency
|
147
|
+
name: nokogiri
|
148
|
+
requirement: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '1.0'
|
153
|
+
type: :runtime
|
154
|
+
prerelease: false
|
155
|
+
version_requirements: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - "~>"
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '1.0'
|
132
160
|
- !ruby/object:Gem::Dependency
|
133
161
|
name: bundler
|
134
162
|
requirement: !ruby/object:Gem::Requirement
|
@@ -171,6 +199,20 @@ dependencies:
|
|
171
199
|
- - ">="
|
172
200
|
- !ruby/object:Gem::Version
|
173
201
|
version: '0'
|
202
|
+
- !ruby/object:Gem::Dependency
|
203
|
+
name: rspec-mocks
|
204
|
+
requirement: !ruby/object:Gem::Requirement
|
205
|
+
requirements:
|
206
|
+
- - "~>"
|
207
|
+
- !ruby/object:Gem::Version
|
208
|
+
version: '3.4'
|
209
|
+
type: :development
|
210
|
+
prerelease: false
|
211
|
+
version_requirements: !ruby/object:Gem::Requirement
|
212
|
+
requirements:
|
213
|
+
- - "~>"
|
214
|
+
- !ruby/object:Gem::Version
|
215
|
+
version: '3.4'
|
174
216
|
description:
|
175
217
|
email:
|
176
218
|
- none@nowhere.org
|
@@ -182,7 +224,9 @@ extra_rdoc_files:
|
|
182
224
|
- doc/extending.md
|
183
225
|
- doc/indexing_rules.md
|
184
226
|
- doc/other_commands.md
|
227
|
+
- doc/programmatic_use.md
|
185
228
|
- doc/settings.md
|
229
|
+
- doc/xml.md
|
186
230
|
files:
|
187
231
|
- ".gitignore"
|
188
232
|
- ".travis.yml"
|
@@ -198,7 +242,9 @@ files:
|
|
198
242
|
- doc/extending.md
|
199
243
|
- doc/indexing_rules.md
|
200
244
|
- doc/other_commands.md
|
245
|
+
- doc/programmatic_use.md
|
201
246
|
- doc/settings.md
|
247
|
+
- doc/xml.md
|
202
248
|
- index_bench/batch.dat
|
203
249
|
- index_bench/common.rb
|
204
250
|
- index_bench/index_bench.rb
|
@@ -216,12 +262,17 @@ files:
|
|
216
262
|
- index_bench/translation_maps/umich/obsolete_cop.yaml
|
217
263
|
- lib/tasks/load_maps.rake
|
218
264
|
- lib/traject.rb
|
265
|
+
- lib/traject/array_writer.rb
|
219
266
|
- lib/traject/command_line.rb
|
220
267
|
- lib/traject/csv_writer.rb
|
221
268
|
- lib/traject/debug_writer.rb
|
222
269
|
- lib/traject/delimited_writer.rb
|
270
|
+
- lib/traject/experimental_nokogiri_streaming_reader.rb
|
271
|
+
- lib/traject/hashie/indifferent_access_fix.rb
|
223
272
|
- lib/traject/indexer.rb
|
224
273
|
- lib/traject/indexer/context.rb
|
274
|
+
- lib/traject/indexer/marc_indexer.rb
|
275
|
+
- lib/traject/indexer/nokogiri_indexer.rb
|
225
276
|
- lib/traject/indexer/settings.rb
|
226
277
|
- lib/traject/indexer/step.rb
|
227
278
|
- lib/traject/json_writer.rb
|
@@ -230,12 +281,16 @@ files:
|
|
230
281
|
- lib/traject/macros/marc21.rb
|
231
282
|
- lib/traject/macros/marc21_semantics.rb
|
232
283
|
- lib/traject/macros/marc_format_classifier.rb
|
284
|
+
- lib/traject/macros/nokogiri_macros.rb
|
285
|
+
- lib/traject/macros/transformation.rb
|
233
286
|
- lib/traject/marc_extractor.rb
|
234
287
|
- lib/traject/marc_extractor_spec.rb
|
235
288
|
- lib/traject/marc_reader.rb
|
236
289
|
- lib/traject/mock_reader.rb
|
237
290
|
- lib/traject/ndj_reader.rb
|
291
|
+
- lib/traject/nokogiri_reader.rb
|
238
292
|
- lib/traject/null_writer.rb
|
293
|
+
- lib/traject/oai_pmh_nokogiri_reader.rb
|
239
294
|
- lib/traject/qualified_const_get.rb
|
240
295
|
- lib/traject/solr_json_writer.rb
|
241
296
|
- lib/traject/thread_pool.rb
|
@@ -251,8 +306,10 @@ files:
|
|
251
306
|
- lib/translation_maps/marc_languages.yaml
|
252
307
|
- test/debug_writer_test.rb
|
253
308
|
- test/delimited_writer_test.rb
|
309
|
+
- test/experimental_nokogiri_streaming_reader_test.rb
|
254
310
|
- test/indexer/context_test.rb
|
255
311
|
- test/indexer/each_record_test.rb
|
312
|
+
- test/indexer/error_handler_test.rb
|
256
313
|
- test/indexer/load_config_file_test.rb
|
257
314
|
- test/indexer/macros/macros_marc21_semantics_test.rb
|
258
315
|
- test/indexer/macros/marc21/extract_all_marc_values_test.rb
|
@@ -260,7 +317,11 @@ files:
|
|
260
317
|
- test/indexer/macros/marc21/serialize_marc_test.rb
|
261
318
|
- test/indexer/macros/marc21/trim_punctuation_test.rb
|
262
319
|
- test/indexer/macros/to_field_test.rb
|
320
|
+
- test/indexer/macros/transformation_test.rb
|
263
321
|
- test/indexer/map_record_test.rb
|
322
|
+
- test/indexer/nokogiri_indexer_test.rb
|
323
|
+
- test/indexer/process_record_test.rb
|
324
|
+
- test/indexer/process_with_test.rb
|
264
325
|
- test/indexer/read_write_test.rb
|
265
326
|
- test/indexer/settings_test.rb
|
266
327
|
- test/indexer/to_field_test.rb
|
@@ -268,6 +329,8 @@ files:
|
|
268
329
|
- test/marc_extractor_test.rb
|
269
330
|
- test/marc_format_classifier_test.rb
|
270
331
|
- test/marc_reader_test.rb
|
332
|
+
- test/nokogiri_reader_test.rb
|
333
|
+
- test/oai_pmh_nokogiri_reader_test.rb
|
271
334
|
- test/solr_json_writer_test.rb
|
272
335
|
- test/test_helper.rb
|
273
336
|
- test/test_support/245_no_ab.marc
|
@@ -289,15 +352,22 @@ files:
|
|
289
352
|
- test/test_support/multi_era.marc
|
290
353
|
- test/test_support/multi_geo.marc
|
291
354
|
- test/test_support/musical_cage.marc
|
355
|
+
- test/test_support/namespace-test.xml
|
292
356
|
- test/test_support/nature.marc
|
357
|
+
- test/test_support/nokogiri_demo_config.rb
|
358
|
+
- test/test_support/oai-pmh-one-record-2.xml
|
359
|
+
- test/test_support/oai-pmh-one-record-first.xml
|
293
360
|
- test/test_support/one-marc8.mrc
|
294
361
|
- test/test_support/online_only.marc
|
295
362
|
- test/test_support/packed_041a_lang.marc
|
363
|
+
- test/test_support/sample-oai-no-namespace.xml
|
364
|
+
- test/test_support/sample-oai-pmh.xml
|
296
365
|
- test/test_support/test_data.utf8.json
|
297
366
|
- test/test_support/test_data.utf8.marc.xml
|
298
367
|
- test/test_support/test_data.utf8.mrc
|
299
368
|
- test/test_support/test_data.utf8.mrc.gz
|
300
369
|
- test/test_support/the_business_ren.marc
|
370
|
+
- test/thread_pool_test.rb
|
301
371
|
- test/translation_map_test.rb
|
302
372
|
- test/translation_maps/bad_ruby.rb
|
303
373
|
- test/translation_maps/bad_yaml.yaml
|
@@ -326,20 +396,23 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
326
396
|
version: '0'
|
327
397
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
328
398
|
requirements:
|
329
|
-
- - "
|
399
|
+
- - ">"
|
330
400
|
- !ruby/object:Gem::Version
|
331
|
-
version:
|
401
|
+
version: 1.3.1
|
332
402
|
requirements: []
|
333
403
|
rubyforge_project:
|
334
|
-
rubygems_version: 2.
|
404
|
+
rubygems_version: 2.7.7
|
335
405
|
signing_key:
|
336
406
|
specification_version: 4
|
337
|
-
summary:
|
407
|
+
summary: An easy to use, high-performance, flexible and extensible metadata transformation
|
408
|
+
system, focused on library-archives-museums input, and indexing to Solr as output.
|
338
409
|
test_files:
|
339
410
|
- test/debug_writer_test.rb
|
340
411
|
- test/delimited_writer_test.rb
|
412
|
+
- test/experimental_nokogiri_streaming_reader_test.rb
|
341
413
|
- test/indexer/context_test.rb
|
342
414
|
- test/indexer/each_record_test.rb
|
415
|
+
- test/indexer/error_handler_test.rb
|
343
416
|
- test/indexer/load_config_file_test.rb
|
344
417
|
- test/indexer/macros/macros_marc21_semantics_test.rb
|
345
418
|
- test/indexer/macros/marc21/extract_all_marc_values_test.rb
|
@@ -347,7 +420,11 @@ test_files:
|
|
347
420
|
- test/indexer/macros/marc21/serialize_marc_test.rb
|
348
421
|
- test/indexer/macros/marc21/trim_punctuation_test.rb
|
349
422
|
- test/indexer/macros/to_field_test.rb
|
423
|
+
- test/indexer/macros/transformation_test.rb
|
350
424
|
- test/indexer/map_record_test.rb
|
425
|
+
- test/indexer/nokogiri_indexer_test.rb
|
426
|
+
- test/indexer/process_record_test.rb
|
427
|
+
- test/indexer/process_with_test.rb
|
351
428
|
- test/indexer/read_write_test.rb
|
352
429
|
- test/indexer/settings_test.rb
|
353
430
|
- test/indexer/to_field_test.rb
|
@@ -355,6 +432,8 @@ test_files:
|
|
355
432
|
- test/marc_extractor_test.rb
|
356
433
|
- test/marc_format_classifier_test.rb
|
357
434
|
- test/marc_reader_test.rb
|
435
|
+
- test/nokogiri_reader_test.rb
|
436
|
+
- test/oai_pmh_nokogiri_reader_test.rb
|
358
437
|
- test/solr_json_writer_test.rb
|
359
438
|
- test/test_helper.rb
|
360
439
|
- test/test_support/245_no_ab.marc
|
@@ -376,15 +455,22 @@ test_files:
|
|
376
455
|
- test/test_support/multi_era.marc
|
377
456
|
- test/test_support/multi_geo.marc
|
378
457
|
- test/test_support/musical_cage.marc
|
458
|
+
- test/test_support/namespace-test.xml
|
379
459
|
- test/test_support/nature.marc
|
460
|
+
- test/test_support/nokogiri_demo_config.rb
|
461
|
+
- test/test_support/oai-pmh-one-record-2.xml
|
462
|
+
- test/test_support/oai-pmh-one-record-first.xml
|
380
463
|
- test/test_support/one-marc8.mrc
|
381
464
|
- test/test_support/online_only.marc
|
382
465
|
- test/test_support/packed_041a_lang.marc
|
466
|
+
- test/test_support/sample-oai-no-namespace.xml
|
467
|
+
- test/test_support/sample-oai-pmh.xml
|
383
468
|
- test/test_support/test_data.utf8.json
|
384
469
|
- test/test_support/test_data.utf8.marc.xml
|
385
470
|
- test/test_support/test_data.utf8.mrc
|
386
471
|
- test/test_support/test_data.utf8.mrc.gz
|
387
472
|
- test/test_support/the_business_ren.marc
|
473
|
+
- test/thread_pool_test.rb
|
388
474
|
- test/translation_map_test.rb
|
389
475
|
- test/translation_maps/bad_ruby.rb
|
390
476
|
- test/translation_maps/bad_yaml.yaml
|