traject 2.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +27 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +461 -0
- data/Rakefile +21 -0
- data/bench/bench.rb +30 -0
- data/bin/traject +16 -0
- data/doc/batch_execution.md +243 -0
- data/doc/extending.md +190 -0
- data/doc/indexing_rules.md +265 -0
- data/doc/other_commands.md +47 -0
- data/doc/settings.md +101 -0
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject.rb +11 -0
- data/lib/traject/command_line.rb +301 -0
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/debug_writer.rb +47 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +613 -0
- data/lib/traject/indexer/settings.rb +110 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/line_writer.rb +63 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +223 -0
- data/lib/traject/macros/marc21_semantics.rb +584 -0
- data/lib/traject/macros/marc_format_classifier.rb +197 -0
- data/lib/traject/marc_extractor.rb +410 -0
- data/lib/traject/marc_reader.rb +89 -0
- data/lib/traject/mock_reader.rb +97 -0
- data/lib/traject/ndj_reader.rb +40 -0
- data/lib/traject/null_writer.rb +22 -0
- data/lib/traject/qualified_const_get.rb +40 -0
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +161 -0
- data/lib/traject/translation_map.rb +267 -0
- data/lib/traject/util.rb +52 -0
- data/lib/traject/version.rb +3 -0
- data/lib/traject/yaml_writer.rb +9 -0
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/each_record_test.rb +59 -0
- data/test/indexer/macros_marc21_semantics_test.rb +391 -0
- data/test/indexer/macros_marc21_test.rb +190 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +209 -0
- data/test/indexer/read_write_test.rb +101 -0
- data/test/indexer/settings_test.rb +152 -0
- data/test/indexer/to_field_test.rb +77 -0
- data/test/marc_extractor_test.rb +412 -0
- data/test/marc_format_classifier_test.rb +98 -0
- data/test/marc_reader_test.rb +110 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +90 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/bad_utf_byte.utf8.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +155 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/escaped_character_reference.marc8.marc +1 -0
- data/test/test_support/george_eliot.marc +1 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/nature.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/test_data.utf8.json +30 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/test_support/test_data.utf8.mrc.gz +0 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +225 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +47 -0
- metadata +382 -0
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
require 'test_helper'
|
|
2
|
+
|
|
3
|
+
describe "Traject::Indexer#settings" do
|
|
4
|
+
before do
|
|
5
|
+
@indexer = Traject::Indexer.new
|
|
6
|
+
end
|
|
7
|
+
|
|
8
|
+
it "starts out a Hash, that can fill in it's defaults" do
|
|
9
|
+
assert_kind_of Hash, @indexer.settings
|
|
10
|
+
|
|
11
|
+
Traject::Indexer::Settings.defaults.each_pair do |key, value|
|
|
12
|
+
assert_equal value, @indexer.settings[key]
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
it "can fill_in_defaults!" do
|
|
17
|
+
@indexer.settings.fill_in_defaults!
|
|
18
|
+
|
|
19
|
+
assert_equal Traject::Indexer::Settings.defaults, @indexer.settings
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
it "doesn't overwrite with fill_in_defaults!" do
|
|
23
|
+
key = Traject::Indexer::Settings.defaults.keys.first
|
|
24
|
+
@indexer.settings[ key ] = "MINE KEEP IT"
|
|
25
|
+
|
|
26
|
+
@indexer.settings.fill_in_defaults!
|
|
27
|
+
|
|
28
|
+
assert_equal "MINE KEEP IT", @indexer.settings[key]
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
it "can take argument to set" do
|
|
32
|
+
@indexer.settings("foo" => "foo", "bar" => "bar")
|
|
33
|
+
|
|
34
|
+
assert_equal "foo", @indexer.settings["foo"]
|
|
35
|
+
assert_equal "bar", @indexer.settings["bar"]
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
it "has settings DSL to set" do
|
|
39
|
+
@indexer.instance_eval do
|
|
40
|
+
settings do
|
|
41
|
+
store "foo", "foo"
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
assert_equal "foo", @indexer.settings["foo"]
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
it "merges new values, not completely replaces" do
|
|
49
|
+
@indexer.settings("one" => "original", "two" => "original", "three" => "original", "four" => "original")
|
|
50
|
+
|
|
51
|
+
@indexer.settings do
|
|
52
|
+
store "two", "second"
|
|
53
|
+
store "three", "second"
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
@indexer.settings do
|
|
57
|
+
store "three", "third"
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
@indexer.settings("four" => "fourth")
|
|
61
|
+
|
|
62
|
+
{"one" => "original", "two" => "second", "three" => "third", "four" => "fourth"}.each_pair do |key, value|
|
|
63
|
+
assert_equal value, @indexer.settings[key]
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
it "is indifferent between string and symbol" do
|
|
68
|
+
@indexer.settings[:foo] = "foo 1"
|
|
69
|
+
@indexer.settings["foo"] = "foo 2"
|
|
70
|
+
|
|
71
|
+
assert_equal "foo 2", @indexer.settings[:foo]
|
|
72
|
+
|
|
73
|
+
@indexer.settings do
|
|
74
|
+
store "foo", "foo 3"
|
|
75
|
+
store :foo, "foo 4"
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
assert_equal "foo 4", @indexer.settings["foo"]
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
it "implements #provide as cautious setter" do
|
|
82
|
+
@indexer.settings[:a] = "original"
|
|
83
|
+
|
|
84
|
+
@indexer.settings do
|
|
85
|
+
provide :a, "new"
|
|
86
|
+
provide :b, "new"
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
assert_equal "original", @indexer.settings[:a]
|
|
90
|
+
assert_equal "new", @indexer.settings[:b]
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
it "has reverse_merge" do
|
|
94
|
+
settings = Traject::Indexer::Settings.new("a" => "original", "b" => "original")
|
|
95
|
+
|
|
96
|
+
new_settings = settings.reverse_merge(:a => "new", :c => "new")
|
|
97
|
+
|
|
98
|
+
assert_kind_of Traject::Indexer::Settings, new_settings
|
|
99
|
+
|
|
100
|
+
assert_equal "original", new_settings["a"]
|
|
101
|
+
assert_equal "original", new_settings["b"]
|
|
102
|
+
assert_equal "new", new_settings["c"]
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
it "has reverse_merge!" do
|
|
106
|
+
settings = Traject::Indexer::Settings.new("a" => "original", "b" => "original")
|
|
107
|
+
|
|
108
|
+
settings.reverse_merge!(:a => "new", :c => "new")
|
|
109
|
+
|
|
110
|
+
assert_kind_of Traject::Indexer::Settings, settings
|
|
111
|
+
|
|
112
|
+
assert_equal "original", settings["a"]
|
|
113
|
+
assert_equal "original", settings["b"]
|
|
114
|
+
assert_equal "new", settings["c"]
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
describe "inspect" do
|
|
118
|
+
it "keeps keys ending in 'password' out of inspect" do
|
|
119
|
+
settings = Traject::Indexer::Settings.new("a" => "a",
|
|
120
|
+
"password" => "password", "some_password" => "password",
|
|
121
|
+
"some.password" => "password")
|
|
122
|
+
|
|
123
|
+
parsed = eval( settings.inspect )
|
|
124
|
+
assert_equal( {"a" => "a", "password" => "[hidden]", "some_password" => "[hidden]", "some.password" => "[hidden]"}, parsed)
|
|
125
|
+
end
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
describe "JRuby / MRI" do
|
|
129
|
+
before do
|
|
130
|
+
@indexer = Traject::Indexer.new
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
it "has the right indexer name" do
|
|
134
|
+
if defined? JRUBY_VERSION
|
|
135
|
+
assert_equal "Traject::Marc4JReader", @indexer.settings['reader_class_name']
|
|
136
|
+
else
|
|
137
|
+
assert_equal "Traject::MarcReader", @indexer.settings['reader_class_name']
|
|
138
|
+
end
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
# This next one has the added effect of making sure the correct class
|
|
142
|
+
# has actually been loaded -- otherwise the constant wouldn't be available
|
|
143
|
+
it "has the correct default indexer class based on platform" do
|
|
144
|
+
if defined? JRUBY_VERSION
|
|
145
|
+
assert_equal Traject::Marc4JReader, @indexer.reader_class
|
|
146
|
+
else
|
|
147
|
+
assert_equal Traject::MarcReader, @indexer.reader_class
|
|
148
|
+
end
|
|
149
|
+
end
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
require 'test_helper'
|
|
2
|
+
|
|
3
|
+
describe "Traject::Indexer.to_field" do
|
|
4
|
+
before do
|
|
5
|
+
@indexer = Traject::Indexer.new
|
|
6
|
+
end
|
|
7
|
+
describe "checks it's arguments" do
|
|
8
|
+
it "rejects nil first arg" do
|
|
9
|
+
assert_raises(Traject::Indexer::NamingError) { @indexer.to_field(nil) }
|
|
10
|
+
end
|
|
11
|
+
it "rejects empty string first arg" do
|
|
12
|
+
assert_raises(Traject::Indexer::NamingError) {@indexer.to_field("")}
|
|
13
|
+
end
|
|
14
|
+
it "rejects non-string first arg" do
|
|
15
|
+
assert_raises(Traject::Indexer::NamingError) {@indexer.to_field(:symbol)}
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it "rejects one-arg lambda" do
|
|
19
|
+
assert_raises(Traject::Indexer::ArityError) do
|
|
20
|
+
@indexer.to_field("foo") do |one_arg|
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
it "rejects four-arg lambda" do
|
|
25
|
+
assert_raises(Traject::Indexer::ArityError) do
|
|
26
|
+
@indexer.to_field("foo") do |one_arg, two_arg, three_arg, four_arg|
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
it "accepts two arg lambda" do
|
|
31
|
+
@indexer.to_field("foo") do |one, two|
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
it "accepts three arg lambda" do
|
|
35
|
+
@indexer.to_field("foo") {|one, two, three| one }
|
|
36
|
+
end
|
|
37
|
+
it "accepts variable lambda" do
|
|
38
|
+
@indexer.to_field("foo") do |*variable|
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
it "outputs error with source location" do
|
|
44
|
+
begin
|
|
45
|
+
@indexer.to_field('foo') {|one, two| }
|
|
46
|
+
@indexer.to_field('') {|one, two| } # bad field name
|
|
47
|
+
flunk("Should have rejected empty field name")
|
|
48
|
+
rescue Traject::Indexer::NamingError => e
|
|
49
|
+
assert_match(/at .*\/.*:\d+/, e.message)
|
|
50
|
+
rescue
|
|
51
|
+
flunk("Should only fail with a NamingError")
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
# Just verifying this is how it works
|
|
56
|
+
it "doesn't allow you to just wholesale assignment to the accumulator" do
  @indexer.to_field('foo') do |rec, acc|
    # Deliberately rebinds the block-local variable only; the array the
    # indexer passed in is left untouched, which is what this test pins down.
    acc = ['hello']
  end
  output = @indexer.map_record('never looked at')
  # assert_nil rather than assert_equal nil: Minitest 5 deprecates comparing
  # against nil with assert_equal and Minitest 6 turns it into a failure.
  assert_nil output['foo']
end
|
|
63
|
+
|
|
64
|
+
it "allows use of accumulator.replace" do
|
|
65
|
+
@indexer.to_field('foo') do |rec, acc|
|
|
66
|
+
acc.replace ['hello']
|
|
67
|
+
end
|
|
68
|
+
output = @indexer.map_record('never looked at')
|
|
69
|
+
assert_equal ['hello'], output['foo']
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
|
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
|
|
3
|
+
require 'test_helper'
|
|
4
|
+
require 'traject/marc_extractor'
|
|
5
|
+
|
|
6
|
+
require 'marc'
|
|
7
|
+
|
|
8
|
+
describe "Traject::MarcExtractor" do
|
|
9
|
+
it "is frozen read-only" do
  # Build an extractor with a non-default option so the frozen checks below
  # cover an options hash that was actually customised. The key must be
  # spelled :separator (as used by the other specs in this file); the
  # previous :seperator spelling did not match that option.
  extractor = Traject::MarcExtractor.new("100abcde", :separator => ";")
  assert extractor.frozen?
  assert extractor.spec_hash.frozen?
  assert extractor.options.frozen?
end
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
describe "#parse_marc_spec" do
|
|
18
|
+
it "parses single spec with all elements" do
|
|
19
|
+
parsed = Traject::MarcExtractor.parse_string_spec("245|1*|abcg")
|
|
20
|
+
|
|
21
|
+
assert_kind_of Hash, parsed
|
|
22
|
+
assert_equal 1, parsed.keys.length
|
|
23
|
+
spec = parsed['245'].first
|
|
24
|
+
assert_kind_of Traject::MarcExtractor::Spec, spec
|
|
25
|
+
|
|
26
|
+
assert_equal "1", spec.indicator1
|
|
27
|
+
assert_nil spec.indicator2
|
|
28
|
+
|
|
29
|
+
assert_kind_of Array, spec.subfields
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
it "parses a mixed bag" do
|
|
33
|
+
parsed = Traject::MarcExtractor.parse_string_spec("245abcde:810:700|*4|bcd")
|
|
34
|
+
spec245 = parsed['245'].first
|
|
35
|
+
spec810 = parsed['810'].first
|
|
36
|
+
spec700 = parsed['700'].first
|
|
37
|
+
|
|
38
|
+
assert_length 3, parsed
|
|
39
|
+
|
|
40
|
+
#245abcde
|
|
41
|
+
assert spec245
|
|
42
|
+
assert_nil spec245.indicator1
|
|
43
|
+
assert_nil spec245.indicator2
|
|
44
|
+
assert_equal %w{a b c d e}, spec245.subfields
|
|
45
|
+
|
|
46
|
+
#810
|
|
47
|
+
assert spec810
|
|
48
|
+
assert_nil spec810.indicator1
|
|
49
|
+
assert_nil spec810.indicator2
|
|
50
|
+
assert_nil spec810.subfields, "No subfields"
|
|
51
|
+
|
|
52
|
+
#700-*4bcd
|
|
53
|
+
assert spec700
|
|
54
|
+
assert_nil spec700.indicator1
|
|
55
|
+
assert_equal "4", spec700.indicator2
|
|
56
|
+
assert_equal %w{b c d}, spec700.subfields
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
it "parses fixed field byte offsets" do
|
|
60
|
+
parsed = Traject::MarcExtractor.parse_string_spec("005[5]:008[7-10]")
|
|
61
|
+
|
|
62
|
+
assert_equal 5, parsed["005"].first.bytes
|
|
63
|
+
assert_equal 7..10, parsed["008"].first.bytes
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
it "allows arrays of specs" do
|
|
67
|
+
parsed = Traject::MarcExtractor.parse_string_spec %w(
|
|
68
|
+
245abcde
|
|
69
|
+
810
|
|
70
|
+
700|*4|bcd
|
|
71
|
+
)
|
|
72
|
+
assert_length 3, parsed
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
it "allows mixture of array and colon-delimited specs" do
|
|
76
|
+
parsed = Traject::MarcExtractor.parse_string_spec %w(
|
|
77
|
+
245abcde
|
|
78
|
+
100:110:111
|
|
79
|
+
810
|
|
80
|
+
700|*4|bcd
|
|
81
|
+
)
|
|
82
|
+
assert_length 6, parsed
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
# Mostly an internal method, not necessarily API, but
|
|
89
|
+
# an important one, so we unit test some parts of it.
|
|
90
|
+
describe "#specs_covering_field" do
|
|
91
|
+
describe "for alternate script tags" do
|
|
92
|
+
before do
|
|
93
|
+
@record = MARC::Reader.new(support_file_path "hebrew880s.marc").to_a.first
|
|
94
|
+
@extractor = Traject::MarcExtractor.new("245")
|
|
95
|
+
|
|
96
|
+
@a245 = @record.fields.find {|f| f.tag == "245"}
|
|
97
|
+
assert ! @a245.nil?, "Found a 245 to test"
|
|
98
|
+
|
|
99
|
+
@a880_245 = @record.fields.find do |field|
|
|
100
|
+
(field.tag == "880") && field['6'] &&
|
|
101
|
+
"245" == field['6'].slice(0,3)
|
|
102
|
+
end
|
|
103
|
+
assert ! @a880_245.nil?, "Found an 880-245 to test"
|
|
104
|
+
|
|
105
|
+
@a880_100 = @record.fields.find do |field|
|
|
106
|
+
(field.tag == "880") && field['6'] &&
|
|
107
|
+
"100" == field['6'].slice(0,3)
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
assert ! @a880_100.nil?, "Found an 880-100 to test"
|
|
111
|
+
end
|
|
112
|
+
it "finds spec for relevant 880" do
|
|
113
|
+
assert_equal( [Traject::MarcExtractor::Spec.new(:tag => "245")], @extractor.specs_covering_field(@a880_245) )
|
|
114
|
+
assert_equal [], @extractor.specs_covering_field(@a880_100)
|
|
115
|
+
end
|
|
116
|
+
it "does not find spec for 880 if disabled" do
|
|
117
|
+
@extractor = Traject::MarcExtractor.new("245", :alternate_script => false)
|
|
118
|
+
assert_equal [], @extractor.specs_covering_field(@a880_245)
|
|
119
|
+
end
|
|
120
|
+
it "finds only 880 if so configured" do
|
|
121
|
+
@extractor = Traject::MarcExtractor.new("245", :alternate_script => :only)
|
|
122
|
+
assert_equal [], @extractor.specs_covering_field(@a245)
|
|
123
|
+
assert_equal([Traject::MarcExtractor::Spec.new(:tag => "245")], @extractor.specs_covering_field(@a880_245))
|
|
124
|
+
end
|
|
125
|
+
end
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
describe "#extract_by_spec" do
|
|
129
|
+
before do
|
|
130
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
describe "extracts a basic case" do
|
|
134
|
+
before do
|
|
135
|
+
parsed_spec = Traject::MarcExtractor.parse_string_spec("700abcdef:856|*2|:505|1*|:245ba")
|
|
136
|
+
@values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
it "returns an array" do
|
|
140
|
+
assert_kind_of Array, @values
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
it "handles no subfields given" do
  a856s = @record.find_all { |f| f.tag == "856" }
  # find_all always returns an Array, which is truthy even when empty, so a
  # bare `assert a856s` could never fail. Check emptiness explicitly so the
  # loop below cannot pass vacuously on a record without 856 fields.
  refute_empty a856s, "Record must have 856 fields for this test to work"

  # The 856 spec in the enclosing `before` names no subfields, so each 856
  # is expected to appear as all of its subfield values joined by a space.
  a856s.each do |field|
    assert_includes @values, field.subfields.collect(&:value).join(" ")
  end
end
|
|
151
|
+
|
|
152
|
+
it "does not have 505, due to non-matching indicators" do
|
|
153
|
+
assert ! @values.find {|s| s.include? "propaganda model"}
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
it "respects original record order, for both fields and subfields" do
|
|
159
|
+
expected = ["Manufacturing consent : the political economy of the mass media /",
|
|
160
|
+
"Chomsky, Noam.",
|
|
161
|
+
"Contributor biographical information http://www.loc.gov/catdir/bios/random051/2001050014.html",
|
|
162
|
+
"Publisher description http://www.loc.gov/catdir/description/random044/2001050014.html"]
|
|
163
|
+
assert_equal expected, @values
|
|
164
|
+
end
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
describe "extracts fixed fields" do
|
|
168
|
+
it ", complete" do
|
|
169
|
+
parsed_spec = Traject::MarcExtractor.parse_string_spec("001")
|
|
170
|
+
values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
|
|
171
|
+
|
|
172
|
+
assert_equal ["2710183"], values
|
|
173
|
+
end
|
|
174
|
+
it ", single byte offset" do
|
|
175
|
+
parsed_spec = Traject::MarcExtractor.parse_string_spec("008[5]")
|
|
176
|
+
values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
|
|
177
|
+
|
|
178
|
+
assert_equal ["1"], values
|
|
179
|
+
end
|
|
180
|
+
it ", byte range" do
|
|
181
|
+
parsed_spec = Traject::MarcExtractor.parse_string_spec("008[7-10]")
|
|
182
|
+
values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
|
|
183
|
+
|
|
184
|
+
assert_equal ["2002"], values
|
|
185
|
+
end
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
describe "separator argument" do
|
|
189
|
+
it "causes non-join when nil" do
|
|
190
|
+
parsed_spec = Traject::MarcExtractor.parse_string_spec("245")
|
|
191
|
+
values = Traject::MarcExtractor.new(parsed_spec, :separator => nil).extract(@record)
|
|
192
|
+
|
|
193
|
+
assert_length 3, values
|
|
194
|
+
end
|
|
195
|
+
|
|
196
|
+
it "can be non-default" do
|
|
197
|
+
parsed_spec = Traject::MarcExtractor.parse_string_spec("245")
|
|
198
|
+
values = Traject::MarcExtractor.new(parsed_spec, :separator => "!! ").extract(@record)
|
|
199
|
+
|
|
200
|
+
assert_length 1, values
|
|
201
|
+
assert_equal "Manufacturing consent :!! the political economy of the mass media /!! Edward S. Herman and Noam Chomsky ; with a new introduction by the authors.", values.first
|
|
202
|
+
end
|
|
203
|
+
end
|
|
204
|
+
|
|
205
|
+
describe "extracts alternate script" do
|
|
206
|
+
before do
|
|
207
|
+
@record = MARC::Reader.new(support_file_path "hebrew880s.marc").to_a.first
|
|
208
|
+
@parsed_spec = Traject::MarcExtractor.parse_string_spec("245b")
|
|
209
|
+
end
|
|
210
|
+
it "from default :include" do
|
|
211
|
+
|
|
212
|
+
values = Traject::MarcExtractor.new(@parsed_spec).extract(@record)
|
|
213
|
+
|
|
214
|
+
assert_length 2, values # both the original and the 880
|
|
215
|
+
assert_equal ["ben Marṭin Buber le-Aharon Daṿid Gordon /", "בין מרטין בובר לאהרן דוד גורדון /"], values
|
|
216
|
+
end
|
|
217
|
+
it "with :only" do
|
|
218
|
+
values = Traject::MarcExtractor.new(@parsed_spec, :alternate_script => :only).extract(@record)
|
|
219
|
+
|
|
220
|
+
assert_length 1, values
|
|
221
|
+
assert_equal ["בין מרטין בובר לאהרן דוד גורדון /"], values
|
|
222
|
+
end
|
|
223
|
+
it "with false" do
|
|
224
|
+
values = Traject::MarcExtractor.new(@parsed_spec, :alternate_script => false).extract(@record)
|
|
225
|
+
|
|
226
|
+
assert_length 1, values
|
|
227
|
+
assert_equal ["ben Marṭin Buber le-Aharon Daṿid Gordon /"], values
|
|
228
|
+
end
|
|
229
|
+
end
|
|
230
|
+
|
|
231
|
+
it "works with string second arg too" do
|
|
232
|
+
values = Traject::MarcExtractor.new("245abc").extract(@record)
|
|
233
|
+
|
|
234
|
+
assert_length 1, values
|
|
235
|
+
assert values.first.include?("Manufacturing consent"), "Extracted value includes title"
|
|
236
|
+
end
|
|
237
|
+
|
|
238
|
+
it "returns empty array if no matching tags" do
|
|
239
|
+
values = Traject::MarcExtractor.new("999abc").extract(@record)
|
|
240
|
+
assert_equal [], values
|
|
241
|
+
|
|
242
|
+
values = Traject::MarcExtractor.new("999").extract(@record)
|
|
243
|
+
assert_equal [], values
|
|
244
|
+
end
|
|
245
|
+
|
|
246
|
+
it "returns empty array if matching tag but no subfield" do
|
|
247
|
+
values = Traject::MarcExtractor.new("245xyz").extract(@record)
|
|
248
|
+
assert_equal [], values
|
|
249
|
+
end
|
|
250
|
+
|
|
251
|
+
end
|
|
252
|
+
|
|
253
|
+
describe "with bad data" do
|
|
254
|
+
it "can ignore an 880 with no $6" do
|
|
255
|
+
@record = MARC::Reader.new(support_file_path "880_with_no_6.utf8.marc").to_a.first
|
|
256
|
+
values = Traject::MarcExtractor.new("001").extract(@record)
|
|
257
|
+
assert_equal ["3468569"], values
|
|
258
|
+
end
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
describe "#each_matching_line" do
|
|
262
|
+
before do
|
|
263
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
|
264
|
+
@extractor = Traject::MarcExtractor.new("245abc")
|
|
265
|
+
end
|
|
266
|
+
it "yields two args" do
|
|
267
|
+
called = false
|
|
268
|
+
@extractor.each_matching_line(@record) do |field, spec|
|
|
269
|
+
called = true
|
|
270
|
+
assert_kind_of MARC::DataField, field
|
|
271
|
+
assert_kind_of Traject::MarcExtractor::Spec, spec
|
|
272
|
+
end
|
|
273
|
+
assert called, "calls block"
|
|
274
|
+
end
|
|
275
|
+
it "yields three args" do
|
|
276
|
+
called = false
|
|
277
|
+
@extractor.each_matching_line(@record) do |field, spec, extractor|
|
|
278
|
+
called = true
|
|
279
|
+
assert_kind_of MARC::DataField, field
|
|
280
|
+
assert_kind_of Traject::MarcExtractor::Spec, spec
|
|
281
|
+
assert_kind_of Traject::MarcExtractor, extractor
|
|
282
|
+
assert_same @extractor, extractor
|
|
283
|
+
end
|
|
284
|
+
assert called, "calls block"
|
|
285
|
+
end
|
|
286
|
+
end
|
|
287
|
+
|
|
288
|
+
describe "#collect_matching_lines" do
|
|
289
|
+
before do
|
|
290
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
|
291
|
+
@extractor = Traject::MarcExtractor.new("245abc")
|
|
292
|
+
end
|
|
293
|
+
it "collects with custom block" do
|
|
294
|
+
results = @extractor.collect_matching_lines(@record) do |field, spec, extractor|
|
|
295
|
+
extractor.collect_subfields(field, spec)
|
|
296
|
+
end
|
|
297
|
+
assert_equal ["Manufacturing consent : the political economy of the mass media / Edward S. Herman and Noam Chomsky ; with a new introduction by the authors."], results
|
|
298
|
+
end
|
|
299
|
+
end
|
|
300
|
+
|
|
301
|
+
describe "MarcExtractor.cached" do
|
|
302
|
+
it "creates" do
|
|
303
|
+
extractor = Traject::MarcExtractor.cached("245abc", :separator => nil)
|
|
304
|
+
spec_hash = extractor.spec_hash
|
|
305
|
+
|
|
306
|
+
assert extractor.options[:separator].nil?, "extractor options[:separator] is nil"
|
|
307
|
+
assert_equal({"245"=>[Traject::MarcExtractor::Spec.new(:tag => "245", :subfields=>["a", "b", "c"])]}, spec_hash)
|
|
308
|
+
end
|
|
309
|
+
it "caches" do
|
|
310
|
+
ext1 = Traject::MarcExtractor.cached("245abc", :separator => nil)
|
|
311
|
+
ext2 = Traject::MarcExtractor.cached("245abc", :separator => nil)
|
|
312
|
+
|
|
313
|
+
assert_same ext1, ext2
|
|
314
|
+
end
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
describe "Allows multiple uses of the same tag" do
|
|
319
|
+
before do
|
|
320
|
+
@record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
|
|
321
|
+
end
|
|
322
|
+
|
|
323
|
+
it "allows repated tags for a variable field" do
|
|
324
|
+
extractor = Traject::MarcExtractor.new("245a:245b")
|
|
325
|
+
values = extractor.extract(@record)
|
|
326
|
+
assert_equal ['Manufacturing consent :', 'the political economy of the mass media /'], values
|
|
327
|
+
end
|
|
328
|
+
|
|
329
|
+
it "allows repeated tags with indicators specs" do
|
|
330
|
+
extractor = Traject::MarcExtractor.new("245|1*|a:245|2*|b")
|
|
331
|
+
@record.append(MARC::DataField.new('245', '2', '0', ['a', 'Subfield A Value'], ['b', 'Subfield B Value']))
|
|
332
|
+
results = extractor.extract(@record)
|
|
333
|
+
assert_equal ['Manufacturing consent :', 'Subfield B Value'], results
|
|
334
|
+
end
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
it "provides multiple values for repeated subfields with single specified subfield" do
|
|
340
|
+
ex = Traject::MarcExtractor.new("245a")
|
|
341
|
+
f = @record.fields('245').first
|
|
342
|
+
title_a = f['a']
|
|
343
|
+
f.append(MARC::Subfield.new('a', title_a))
|
|
344
|
+
results = ex.extract(@record)
|
|
345
|
+
assert_equal [title_a, title_a], results
|
|
346
|
+
end
|
|
347
|
+
|
|
348
|
+
it "concats single subfield spec when given as eg 245aa" do
|
|
349
|
+
ex = Traject::MarcExtractor.new("245aa")
|
|
350
|
+
f = @record.fields('245').first
|
|
351
|
+
title_a = f['a']
|
|
352
|
+
f.append(MARC::Subfield.new('a', title_a))
|
|
353
|
+
results = ex.extract(@record)
|
|
354
|
+
assert_equal ["#{title_a} #{title_a}"], results
|
|
355
|
+
end
|
|
356
|
+
|
|
357
|
+
it "provides single value for repeated subfields with multiple specified subfields" do
|
|
358
|
+
ex = Traject::MarcExtractor.new("245ab")
|
|
359
|
+
f = @record.fields('245').first
|
|
360
|
+
title_a = f['a']
|
|
361
|
+
title_b = f['b']
|
|
362
|
+
f.append(MARC::Subfield.new('a', title_a))
|
|
363
|
+
results = ex.extract(@record)
|
|
364
|
+
assert_equal ["#{title_a} #{title_b} #{title_a}"], results
|
|
365
|
+
|
|
366
|
+
end
|
|
367
|
+
|
|
368
|
+
it "provides single value for repeated subfields with no specified subfield" do
|
|
369
|
+
ex = Traject::MarcExtractor.new("245")
|
|
370
|
+
f = @record.fields('245').first
|
|
371
|
+
title_a = f['a']
|
|
372
|
+
f.append(MARC::Subfield.new('a', title_a))
|
|
373
|
+
results = ex.extract(@record)
|
|
374
|
+
assert_equal 1, results.size
|
|
375
|
+
end
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
it "allows repeated tags for a control field" do
|
|
381
|
+
extractor = Traject::MarcExtractor.new("001[0-1]:001[0-3]")
|
|
382
|
+
values = extractor.extract(@record)
|
|
383
|
+
assert_equal ["27", "2710"], values
|
|
384
|
+
end
|
|
385
|
+
|
|
386
|
+
it "associates indicators properly with repeated tags" do
|
|
387
|
+
@record = MARC::Record.new
|
|
388
|
+
@record.append MARC::DataField.new("100", '1', ' ', ['a', '100a first indicator 1'], ['b', 'should not include 100|1|b'])
|
|
389
|
+
@record.append MARC::DataField.new("100", '2', ' ', ['b', '100b first indicator 2'], ['a', 'should not include 100|2|a'])
|
|
390
|
+
|
|
391
|
+
extractor = Traject::MarcExtractor.new("100|1*|a:100|2*|b")
|
|
392
|
+
|
|
393
|
+
values = extractor.extract(@record)
|
|
394
|
+
|
|
395
|
+
assert_equal ['100a first indicator 1', '100b first indicator 2'], values
|
|
396
|
+
end
|
|
397
|
+
|
|
398
|
+
end
|
|
399
|
+
|
|
400
|
+
describe "MarcExtractor::Spec" do
|
|
401
|
+
describe "==" do
|
|
402
|
+
it "equals when equal" do
|
|
403
|
+
assert_equal Traject::MarcExtractor::Spec.new(:subfields => %w{a b c}), Traject::MarcExtractor::Spec.new(:subfields => %w{a b c})
|
|
404
|
+
end
|
|
405
|
+
it "does not equal when not" do
|
|
406
|
+
refute_equal Traject::MarcExtractor::Spec.new(:subfields => %w{a b c}), Traject::MarcExtractor::Spec.new(:subfields => %w{a b c}, :indicator2 => '1')
|
|
407
|
+
end
|
|
408
|
+
end
|
|
409
|
+
end
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
end
|