traject 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +27 -0
  4. data/.yardopts +3 -0
  5. data/Gemfile +12 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +461 -0
  8. data/Rakefile +21 -0
  9. data/bench/bench.rb +30 -0
  10. data/bin/traject +16 -0
  11. data/doc/batch_execution.md +243 -0
  12. data/doc/extending.md +190 -0
  13. data/doc/indexing_rules.md +265 -0
  14. data/doc/other_commands.md +47 -0
  15. data/doc/settings.md +101 -0
  16. data/lib/tasks/load_maps.rake +48 -0
  17. data/lib/traject.rb +11 -0
  18. data/lib/traject/command_line.rb +301 -0
  19. data/lib/traject/csv_writer.rb +34 -0
  20. data/lib/traject/debug_writer.rb +47 -0
  21. data/lib/traject/delimited_writer.rb +110 -0
  22. data/lib/traject/indexer.rb +613 -0
  23. data/lib/traject/indexer/settings.rb +110 -0
  24. data/lib/traject/json_writer.rb +51 -0
  25. data/lib/traject/line_writer.rb +63 -0
  26. data/lib/traject/macros/basic.rb +9 -0
  27. data/lib/traject/macros/marc21.rb +223 -0
  28. data/lib/traject/macros/marc21_semantics.rb +584 -0
  29. data/lib/traject/macros/marc_format_classifier.rb +197 -0
  30. data/lib/traject/marc_extractor.rb +410 -0
  31. data/lib/traject/marc_reader.rb +89 -0
  32. data/lib/traject/mock_reader.rb +97 -0
  33. data/lib/traject/ndj_reader.rb +40 -0
  34. data/lib/traject/null_writer.rb +22 -0
  35. data/lib/traject/qualified_const_get.rb +40 -0
  36. data/lib/traject/solr_json_writer.rb +277 -0
  37. data/lib/traject/thread_pool.rb +161 -0
  38. data/lib/traject/translation_map.rb +267 -0
  39. data/lib/traject/util.rb +52 -0
  40. data/lib/traject/version.rb +3 -0
  41. data/lib/traject/yaml_writer.rb +9 -0
  42. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  43. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  44. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  45. data/lib/translation_maps/marc_geographic.yaml +589 -0
  46. data/lib/translation_maps/marc_instruments.yaml +102 -0
  47. data/lib/translation_maps/marc_languages.yaml +490 -0
  48. data/test/debug_writer_test.rb +38 -0
  49. data/test/delimited_writer_test.rb +104 -0
  50. data/test/indexer/each_record_test.rb +59 -0
  51. data/test/indexer/macros_marc21_semantics_test.rb +391 -0
  52. data/test/indexer/macros_marc21_test.rb +190 -0
  53. data/test/indexer/macros_test.rb +40 -0
  54. data/test/indexer/map_record_test.rb +209 -0
  55. data/test/indexer/read_write_test.rb +101 -0
  56. data/test/indexer/settings_test.rb +152 -0
  57. data/test/indexer/to_field_test.rb +77 -0
  58. data/test/marc_extractor_test.rb +412 -0
  59. data/test/marc_format_classifier_test.rb +98 -0
  60. data/test/marc_reader_test.rb +110 -0
  61. data/test/solr_json_writer_test.rb +248 -0
  62. data/test/test_helper.rb +90 -0
  63. data/test/test_support/245_no_ab.marc +1 -0
  64. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  65. data/test/test_support/bad_subfield_code.marc +1 -0
  66. data/test/test_support/bad_utf_byte.utf8.marc +1 -0
  67. data/test/test_support/date_resort_to_260.marc +1 -0
  68. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  69. data/test/test_support/date_with_u.marc +1 -0
  70. data/test/test_support/demo_config.rb +155 -0
  71. data/test/test_support/emptyish_record.marc +1 -0
  72. data/test/test_support/escaped_character_reference.marc8.marc +1 -0
  73. data/test/test_support/george_eliot.marc +1 -0
  74. data/test/test_support/hebrew880s.marc +1 -0
  75. data/test/test_support/louis_armstrong.marc +1 -0
  76. data/test/test_support/manufacturing_consent.marc +1 -0
  77. data/test/test_support/manuscript_online_thesis.marc +1 -0
  78. data/test/test_support/microform_online_conference.marc +1 -0
  79. data/test/test_support/multi_era.marc +1 -0
  80. data/test/test_support/multi_geo.marc +1 -0
  81. data/test/test_support/musical_cage.marc +1 -0
  82. data/test/test_support/nature.marc +1 -0
  83. data/test/test_support/one-marc8.mrc +1 -0
  84. data/test/test_support/online_only.marc +1 -0
  85. data/test/test_support/packed_041a_lang.marc +1 -0
  86. data/test/test_support/test_data.utf8.json +30 -0
  87. data/test/test_support/test_data.utf8.marc.xml +2609 -0
  88. data/test/test_support/test_data.utf8.mrc +1 -0
  89. data/test/test_support/test_data.utf8.mrc.gz +0 -0
  90. data/test/test_support/the_business_ren.marc +1 -0
  91. data/test/translation_map_test.rb +225 -0
  92. data/test/translation_maps/bad_ruby.rb +8 -0
  93. data/test/translation_maps/bad_yaml.yaml +1 -0
  94. data/test/translation_maps/both_map.rb +1 -0
  95. data/test/translation_maps/both_map.yaml +1 -0
  96. data/test/translation_maps/default_literal.rb +10 -0
  97. data/test/translation_maps/default_passthrough.rb +10 -0
  98. data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
  99. data/test/translation_maps/properties_map.properties +5 -0
  100. data/test/translation_maps/ruby_map.rb +10 -0
  101. data/test/translation_maps/translate_array_test.yaml +8 -0
  102. data/test/translation_maps/yaml_map.yaml +7 -0
  103. data/traject.gemspec +47 -0
  104. metadata +382 -0
@@ -0,0 +1,152 @@
1
+ require 'test_helper'
2
+
3
+ describe "Traject::Indexer#settings" do
4
+ before do
5
+ @indexer = Traject::Indexer.new
6
+ end
7
+
8
+ it "starts out a Hash, that can fill in it's defaults" do
9
+ assert_kind_of Hash, @indexer.settings
10
+
11
+ Traject::Indexer::Settings.defaults.each_pair do |key, value|
12
+ assert_equal value, @indexer.settings[key]
13
+ end
14
+ end
15
+
16
+ it "can fill_in_defaults!" do
17
+ @indexer.settings.fill_in_defaults!
18
+
19
+ assert_equal Traject::Indexer::Settings.defaults, @indexer.settings
20
+ end
21
+
22
+ it "doesn't overwrite with fill_in_defaults!" do
23
+ key = Traject::Indexer::Settings.defaults.keys.first
24
+ @indexer.settings[ key ] = "MINE KEEP IT"
25
+
26
+ @indexer.settings.fill_in_defaults!
27
+
28
+ assert_equal "MINE KEEP IT", @indexer.settings[key]
29
+ end
30
+
31
+ it "can take argument to set" do
32
+ @indexer.settings("foo" => "foo", "bar" => "bar")
33
+
34
+ assert_equal "foo", @indexer.settings["foo"]
35
+ assert_equal "bar", @indexer.settings["bar"]
36
+ end
37
+
38
+ it "has settings DSL to set" do
39
+ @indexer.instance_eval do
40
+ settings do
41
+ store "foo", "foo"
42
+ end
43
+ end
44
+
45
+ assert_equal "foo", @indexer.settings["foo"]
46
+ end
47
+
48
+ it "merges new values, not completely replaces" do
49
+ @indexer.settings("one" => "original", "two" => "original", "three" => "original", "four" => "original")
50
+
51
+ @indexer.settings do
52
+ store "two", "second"
53
+ store "three", "second"
54
+ end
55
+
56
+ @indexer.settings do
57
+ store "three", "third"
58
+ end
59
+
60
+ @indexer.settings("four" => "fourth")
61
+
62
+ {"one" => "original", "two" => "second", "three" => "third", "four" => "fourth"}.each_pair do |key, value|
63
+ assert_equal value, @indexer.settings[key]
64
+ end
65
+ end
66
+
67
+ it "is indifferent between string and symbol" do
68
+ @indexer.settings[:foo] = "foo 1"
69
+ @indexer.settings["foo"] = "foo 2"
70
+
71
+ assert_equal "foo 2", @indexer.settings[:foo]
72
+
73
+ @indexer.settings do
74
+ store "foo", "foo 3"
75
+ store :foo, "foo 4"
76
+ end
77
+
78
+ assert_equal "foo 4", @indexer.settings["foo"]
79
+ end
80
+
81
+ it "implements #provide as cautious setter" do
82
+ @indexer.settings[:a] = "original"
83
+
84
+ @indexer.settings do
85
+ provide :a, "new"
86
+ provide :b, "new"
87
+ end
88
+
89
+ assert_equal "original", @indexer.settings[:a]
90
+ assert_equal "new", @indexer.settings[:b]
91
+ end
92
+
93
+ it "has reverse_merge" do
94
+ settings = Traject::Indexer::Settings.new("a" => "original", "b" => "original")
95
+
96
+ new_settings = settings.reverse_merge(:a => "new", :c => "new")
97
+
98
+ assert_kind_of Traject::Indexer::Settings, new_settings
99
+
100
+ assert_equal "original", new_settings["a"]
101
+ assert_equal "original", new_settings["b"]
102
+ assert_equal "new", new_settings["c"]
103
+ end
104
+
105
+ it "has reverse_merge!" do
106
+ settings = Traject::Indexer::Settings.new("a" => "original", "b" => "original")
107
+
108
+ settings.reverse_merge!(:a => "new", :c => "new")
109
+
110
+ assert_kind_of Traject::Indexer::Settings, settings
111
+
112
+ assert_equal "original", settings["a"]
113
+ assert_equal "original", settings["b"]
114
+ assert_equal "new", settings["c"]
115
+ end
116
+
117
+ describe "inspect" do
118
+ it "keeps keys ending in 'password' out of inspect" do
119
+ settings = Traject::Indexer::Settings.new("a" => "a",
120
+ "password" => "password", "some_password" => "password",
121
+ "some.password" => "password")
122
+
123
+ parsed = eval( settings.inspect )
124
+ assert_equal( {"a" => "a", "password" => "[hidden]", "some_password" => "[hidden]", "some.password" => "[hidden]"}, parsed)
125
+ end
126
+ end
127
+
128
+ describe "JRuby / MRI" do
129
+ before do
130
+ @indexer = Traject::Indexer.new
131
+ end
132
+
133
+ it "has the right indexer name" do
134
+ if defined? JRUBY_VERSION
135
+ assert_equal "Traject::Marc4JReader", @indexer.settings['reader_class_name']
136
+ else
137
+ assert_equal "Traject::MarcReader", @indexer.settings['reader_class_name']
138
+ end
139
+ end
140
+
141
+ # This next one has the added effect of making sure the correct class
142
+ # has actually been loaded -- otherwise the constant wouldn't be available
143
+ it "has the correct default indexer class based on platform" do
144
+ if defined? JRUBY_VERSION
145
+ assert_equal Traject::Marc4JReader, @indexer.reader_class
146
+ else
147
+ assert_equal Traject::MarcReader, @indexer.reader_class
148
+ end
149
+ end
150
+ end
151
+
152
+ end
@@ -0,0 +1,77 @@
1
+ require 'test_helper'
2
+
3
+ describe "Traject::Indexer.to_field" do
4
+ before do
5
+ @indexer = Traject::Indexer.new
6
+ end
7
+ describe "checks it's arguments" do
8
+ it "rejects nil first arg" do
9
+ assert_raises(Traject::Indexer::NamingError) { @indexer.to_field(nil) }
10
+ end
11
+ it "rejects empty string first arg" do
12
+ assert_raises(Traject::Indexer::NamingError) {@indexer.to_field("")}
13
+ end
14
+ it "rejects non-string first arg" do
15
+ assert_raises(Traject::Indexer::NamingError) {@indexer.to_field(:symbol)}
16
+ end
17
+
18
+ it "rejects one-arg lambda" do
19
+ assert_raises(Traject::Indexer::ArityError) do
20
+ @indexer.to_field("foo") do |one_arg|
21
+ end
22
+ end
23
+ end
24
+ it "rejects four-arg lambda" do
25
+ assert_raises(Traject::Indexer::ArityError) do
26
+ @indexer.to_field("foo") do |one_arg, two_arg, three_arg, four_arg|
27
+ end
28
+ end
29
+ end
30
+ it "accepts two arg lambda" do
31
+ @indexer.to_field("foo") do |one, two|
32
+ end
33
+ end
34
+ it "accepts three arg lambda" do
35
+ @indexer.to_field("foo") {|one, two, three| one }
36
+ end
37
+ it "accepts variable lambda" do
38
+ @indexer.to_field("foo") do |*variable|
39
+ end
40
+ end
41
+ end
42
+
43
+ it "outputs error with source location" do
44
+ begin
45
+ @indexer.to_field('foo') {|one, two| }
46
+ @indexer.to_field('') {|one, two| } # bad field name
47
+ flunk("Should have rejected empty field name")
48
+ rescue Traject::Indexer::NamingError => e
49
+ assert_match(/at .*\/.*:\d+/, e.message)
50
+ rescue
51
+ flunk("Should only fail with a NamingError")
52
+ end
53
+ end
54
+
55
+ # Just verifying this is how it works
56
+ it "doesn't allow you to just wholesale assignment to the accumulator" do
57
+ @indexer.to_field('foo') do |rec, acc|
58
+ acc = ['hello']
59
+ end
60
+ output = @indexer.map_record('never looked at')
61
+ assert_equal nil, output['foo']
62
+ end
63
+
64
+ it "allows use of accumulator.replace" do
65
+ @indexer.to_field('foo') do |rec, acc|
66
+ acc.replace ['hello']
67
+ end
68
+ output = @indexer.map_record('never looked at')
69
+ assert_equal ['hello'], output['foo']
70
+ end
71
+
72
+
73
+ end
74
+
75
+
76
+
77
+
@@ -0,0 +1,412 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'test_helper'
4
+ require 'traject/marc_extractor'
5
+
6
+ require 'marc'
7
+
8
+ describe "Traject::MarcExtractor" do
9
+ it "is frozen read-only" do
10
+ extractor = Traject::MarcExtractor.new("100abcde", :seperator => ";")
11
+ assert extractor.frozen?
12
+ assert extractor.spec_hash.frozen?
13
+ assert extractor.options.frozen?
14
+ end
15
+
16
+
17
+ describe "#parse_marc_spec" do
18
+ it "parses single spec with all elements" do
19
+ parsed = Traject::MarcExtractor.parse_string_spec("245|1*|abcg")
20
+
21
+ assert_kind_of Hash, parsed
22
+ assert_equal 1, parsed.keys.length
23
+ spec = parsed['245'].first
24
+ assert_kind_of Traject::MarcExtractor::Spec, spec
25
+
26
+ assert_equal "1", spec.indicator1
27
+ assert_nil spec.indicator2
28
+
29
+ assert_kind_of Array, spec.subfields
30
+ end
31
+
32
+ it "parses a mixed bag" do
33
+ parsed = Traject::MarcExtractor.parse_string_spec("245abcde:810:700|*4|bcd")
34
+ spec245 = parsed['245'].first
35
+ spec810 = parsed['810'].first
36
+ spec700 = parsed['700'].first
37
+
38
+ assert_length 3, parsed
39
+
40
+ #245abcde
41
+ assert spec245
42
+ assert_nil spec245.indicator1
43
+ assert_nil spec245.indicator2
44
+ assert_equal %w{a b c d e}, spec245.subfields
45
+
46
+ #810
47
+ assert spec810
48
+ assert_nil spec810.indicator1
49
+ assert_nil spec810.indicator2
50
+ assert_nil spec810.subfields, "No subfields"
51
+
52
+ #700-*4bcd
53
+ assert spec700
54
+ assert_nil spec700.indicator1
55
+ assert_equal "4", spec700.indicator2
56
+ assert_equal %w{b c d}, spec700.subfields
57
+ end
58
+
59
+ it "parses fixed field byte offsets" do
60
+ parsed = Traject::MarcExtractor.parse_string_spec("005[5]:008[7-10]")
61
+
62
+ assert_equal 5, parsed["005"].first.bytes
63
+ assert_equal 7..10, parsed["008"].first.bytes
64
+ end
65
+
66
+ it "allows arrays of specs" do
67
+ parsed = Traject::MarcExtractor.parse_string_spec %w(
68
+ 245abcde
69
+ 810
70
+ 700|*4|bcd
71
+ )
72
+ assert_length 3, parsed
73
+ end
74
+
75
+ it "allows mixture of array and colon-delimited specs" do
76
+ parsed = Traject::MarcExtractor.parse_string_spec %w(
77
+ 245abcde
78
+ 100:110:111
79
+ 810
80
+ 700|*4|bcd
81
+ )
82
+ assert_length 6, parsed
83
+ end
84
+
85
+
86
+ end
87
+
88
+ # Mostly an internal method, not neccesarily API, but
89
+ # an important one, so we unit test some parts of it.
90
+ describe "#specs_covering_field" do
91
+ describe "for alternate script tags" do
92
+ before do
93
+ @record = MARC::Reader.new(support_file_path "hebrew880s.marc").to_a.first
94
+ @extractor = Traject::MarcExtractor.new("245")
95
+
96
+ @a245 = @record.fields.find {|f| f.tag == "245"}
97
+ assert ! @a245.nil?, "Found a 245 to test"
98
+
99
+ @a880_245 = @record.fields.find do |field|
100
+ (field.tag == "880") && field['6'] &&
101
+ "245" == field['6'].slice(0,3)
102
+ end
103
+ assert ! @a880_245.nil?, "Found an 880-245 to test"
104
+
105
+ @a880_100 = @record.fields.find do |field|
106
+ (field.tag == "880") && field['6'] &&
107
+ "100" == field['6'].slice(0,3)
108
+ end
109
+
110
+ assert ! @a880_100.nil?, "Found an 880-100 to test"
111
+ end
112
+ it "finds spec for relevant 880" do
113
+ assert_equal( [Traject::MarcExtractor::Spec.new(:tag => "245")], @extractor.specs_covering_field(@a880_245) )
114
+ assert_equal [], @extractor.specs_covering_field(@a880_100)
115
+ end
116
+ it "does not find spec for 880 if disabled" do
117
+ @extractor = Traject::MarcExtractor.new("245", :alternate_script => false)
118
+ assert_equal [], @extractor.specs_covering_field(@a880_245)
119
+ end
120
+ it "finds only 880 if so configured" do
121
+ @extractor = Traject::MarcExtractor.new("245", :alternate_script => :only)
122
+ assert_equal [], @extractor.specs_covering_field(@a245)
123
+ assert_equal([Traject::MarcExtractor::Spec.new(:tag => "245")], @extractor.specs_covering_field(@a880_245))
124
+ end
125
+ end
126
+ end
127
+
128
+ describe "#extract_by_spec" do
129
+ before do
130
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
131
+ end
132
+
133
+ describe "extracts a basic case" do
134
+ before do
135
+ parsed_spec = Traject::MarcExtractor.parse_string_spec("700abcdef:856|*2|:505|1*|:245ba")
136
+ @values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
137
+ end
138
+
139
+ it "returns an array" do
140
+ assert_kind_of Array, @values
141
+ end
142
+
143
+ it "handles no subfields given" do
144
+ a856s = @record.find_all {|f| f.tag == "856"}
145
+ assert a856s, "Record must have 856 fields for this test to work"
146
+
147
+ a856s.each do |field|
148
+ assert @values.include?( field.subfields.collect(&:value).join(" "))
149
+ end
150
+ end
151
+
152
+ it "does not have 505, due to non-matching indicators" do
153
+ assert ! @values.find {|s| s.include? "propaganda model"}
154
+ end
155
+
156
+
157
+
158
+ it "respects original record order, for both fields and subfields" do
159
+ expected = ["Manufacturing consent : the political economy of the mass media /",
160
+ "Chomsky, Noam.",
161
+ "Contributor biographical information http://www.loc.gov/catdir/bios/random051/2001050014.html",
162
+ "Publisher description http://www.loc.gov/catdir/description/random044/2001050014.html"]
163
+ assert_equal expected, @values
164
+ end
165
+ end
166
+
167
+ describe "extracts fixed fields" do
168
+ it ", complete" do
169
+ parsed_spec = Traject::MarcExtractor.parse_string_spec("001")
170
+ values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
171
+
172
+ assert_equal ["2710183"], values
173
+ end
174
+ it ", single byte offset" do
175
+ parsed_spec = Traject::MarcExtractor.parse_string_spec("008[5]")
176
+ values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
177
+
178
+ assert_equal ["1"], values
179
+ end
180
+ it ", byte range" do
181
+ parsed_spec = Traject::MarcExtractor.parse_string_spec("008[7-10]")
182
+ values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
183
+
184
+ assert_equal ["2002"], values
185
+ end
186
+ end
187
+
188
+ describe "separator argument" do
189
+ it "causes non-join when nil" do
190
+ parsed_spec = Traject::MarcExtractor.parse_string_spec("245")
191
+ values = Traject::MarcExtractor.new(parsed_spec, :separator => nil).extract(@record)
192
+
193
+ assert_length 3, values
194
+ end
195
+
196
+ it "can be non-default" do
197
+ parsed_spec = Traject::MarcExtractor.parse_string_spec("245")
198
+ values = Traject::MarcExtractor.new(parsed_spec, :separator => "!! ").extract(@record)
199
+
200
+ assert_length 1, values
201
+ assert_equal "Manufacturing consent :!! the political economy of the mass media /!! Edward S. Herman and Noam Chomsky ; with a new introduction by the authors.", values.first
202
+ end
203
+ end
204
+
205
+ describe "extracts alternate script" do
206
+ before do
207
+ @record = MARC::Reader.new(support_file_path "hebrew880s.marc").to_a.first
208
+ @parsed_spec = Traject::MarcExtractor.parse_string_spec("245b")
209
+ end
210
+ it "from default :include" do
211
+
212
+ values = Traject::MarcExtractor.new(@parsed_spec).extract(@record)
213
+
214
+ assert_length 2, values # both the original and the 880
215
+ assert_equal ["ben Marṭin Buber le-Aharon Daṿid Gordon /", "בין מרטין בובר לאהרן דוד גורדון /"], values
216
+ end
217
+ it "with :only" do
218
+ values = Traject::MarcExtractor.new(@parsed_spec, :alternate_script => :only).extract(@record)
219
+
220
+ assert_length 1, values
221
+ assert_equal ["בין מרטין בובר לאהרן דוד גורדון /"], values
222
+ end
223
+ it "with false" do
224
+ values = Traject::MarcExtractor.new(@parsed_spec, :alternate_script => false).extract(@record)
225
+
226
+ assert_length 1, values
227
+ assert_equal ["ben Marṭin Buber le-Aharon Daṿid Gordon /"], values
228
+ end
229
+ end
230
+
231
+ it "works with string second arg too" do
232
+ values = Traject::MarcExtractor.new("245abc").extract(@record)
233
+
234
+ assert_length 1, values
235
+ assert values.first.include?("Manufacturing consent"), "Extracted value includes title"
236
+ end
237
+
238
+ it "returns empty array if no matching tags" do
239
+ values = Traject::MarcExtractor.new("999abc").extract(@record)
240
+ assert_equal [], values
241
+
242
+ values = Traject::MarcExtractor.new("999").extract(@record)
243
+ assert_equal [], values
244
+ end
245
+
246
+ it "returns empty array if matching tag but no subfield" do
247
+ values = Traject::MarcExtractor.new("245xyz").extract(@record)
248
+ assert_equal [], values
249
+ end
250
+
251
+ end
252
+
253
+ describe "with bad data" do
254
+ it "can ignore an 880 with no $6" do
255
+ @record = MARC::Reader.new(support_file_path "880_with_no_6.utf8.marc").to_a.first
256
+ values = Traject::MarcExtractor.new("001").extract(@record)
257
+ assert_equal ["3468569"], values
258
+ end
259
+ end
260
+
261
+ describe "#each_matching_line" do
262
+ before do
263
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
264
+ @extractor = Traject::MarcExtractor.new("245abc")
265
+ end
266
+ it "yields two args" do
267
+ called = false
268
+ @extractor.each_matching_line(@record) do |field, spec|
269
+ called = true
270
+ assert_kind_of MARC::DataField, field
271
+ assert_kind_of Traject::MarcExtractor::Spec, spec
272
+ end
273
+ assert called, "calls block"
274
+ end
275
+ it "yields three args" do
276
+ called = false
277
+ @extractor.each_matching_line(@record) do |field, spec, extractor|
278
+ called = true
279
+ assert_kind_of MARC::DataField, field
280
+ assert_kind_of Traject::MarcExtractor::Spec, spec
281
+ assert_kind_of Traject::MarcExtractor, extractor
282
+ assert_same @extractor, extractor
283
+ end
284
+ assert called, "calls block"
285
+ end
286
+ end
287
+
288
+ describe "#collect_matching_lines" do
289
+ before do
290
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
291
+ @extractor = Traject::MarcExtractor.new("245abc")
292
+ end
293
+ it "collects with custom block" do
294
+ results = @extractor.collect_matching_lines(@record) do |field, spec, extractor|
295
+ extractor.collect_subfields(field, spec)
296
+ end
297
+ assert_equal ["Manufacturing consent : the political economy of the mass media / Edward S. Herman and Noam Chomsky ; with a new introduction by the authors."], results
298
+ end
299
+ end
300
+
301
+ describe "MarcExtractor.cached" do
302
+ it "creates" do
303
+ extractor = Traject::MarcExtractor.cached("245abc", :separator => nil)
304
+ spec_hash = extractor.spec_hash
305
+
306
+ assert extractor.options[:separator].nil?, "extractor options[:separator] is nil"
307
+ assert_equal({"245"=>[Traject::MarcExtractor::Spec.new(:tag => "245", :subfields=>["a", "b", "c"])]}, spec_hash)
308
+ end
309
+ it "caches" do
310
+ ext1 = Traject::MarcExtractor.cached("245abc", :separator => nil)
311
+ ext2 = Traject::MarcExtractor.cached("245abc", :separator => nil)
312
+
313
+ assert_same ext1, ext2
314
+ end
315
+ end
316
+
317
+
318
+ describe "Allows multiple uses of the same tag" do
319
+ before do
320
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
321
+ end
322
+
323
+ it "allows repated tags for a variable field" do
324
+ extractor = Traject::MarcExtractor.new("245a:245b")
325
+ values = extractor.extract(@record)
326
+ assert_equal ['Manufacturing consent :', 'the political economy of the mass media /'], values
327
+ end
328
+
329
+ it "allows repeated tags with indicators specs" do
330
+ extractor = Traject::MarcExtractor.new("245|1*|a:245|2*|b")
331
+ @record.append(MARC::DataField.new('245', '2', '0', ['a', 'Subfield A Value'], ['b', 'Subfield B Value']))
332
+ results = extractor.extract(@record)
333
+ assert_equal ['Manufacturing consent :', 'Subfield B Value'], results
334
+ end
335
+
336
+
337
+
338
+
339
+ it "provides multiple values for repeated subfields with single specified subfield" do
340
+ ex = Traject::MarcExtractor.new("245a")
341
+ f = @record.fields('245').first
342
+ title_a = f['a']
343
+ f.append(MARC::Subfield.new('a', title_a))
344
+ results = ex.extract(@record)
345
+ assert_equal [title_a, title_a], results
346
+ end
347
+
348
+ it "concats single subfield spec when given as eg 245aa" do
349
+ ex = Traject::MarcExtractor.new("245aa")
350
+ f = @record.fields('245').first
351
+ title_a = f['a']
352
+ f.append(MARC::Subfield.new('a', title_a))
353
+ results = ex.extract(@record)
354
+ assert_equal ["#{title_a} #{title_a}"], results
355
+ end
356
+
357
+ it "provides single value for repeated subfields with multiple specified subfields" do
358
+ ex = Traject::MarcExtractor.new("245ab")
359
+ f = @record.fields('245').first
360
+ title_a = f['a']
361
+ title_b = f['b']
362
+ f.append(MARC::Subfield.new('a', title_a))
363
+ results = ex.extract(@record)
364
+ assert_equal ["#{title_a} #{title_b} #{title_a}"], results
365
+
366
+ end
367
+
368
+ it "provides single value for repeated subfields with no specified subfield" do
369
+ ex = Traject::MarcExtractor.new("245")
370
+ f = @record.fields('245').first
371
+ title_a = f['a']
372
+ f.append(MARC::Subfield.new('a', title_a))
373
+ results = ex.extract(@record)
374
+ assert_equal 1, results.size
375
+ end
376
+
377
+
378
+
379
+
380
+ it "allows repeated tags for a control field" do
381
+ extractor = Traject::MarcExtractor.new("001[0-1]:001[0-3]")
382
+ values = extractor.extract(@record)
383
+ assert_equal ["27", "2710"], values
384
+ end
385
+
386
+ it "associates indicators properly with repeated tags" do
387
+ @record = MARC::Record.new
388
+ @record.append MARC::DataField.new("100", '1', ' ', ['a', '100a first indicator 1'], ['b', 'should not include 100|1|b'])
389
+ @record.append MARC::DataField.new("100", '2', ' ', ['b', '100b first indicator 2'], ['a', 'should not include 100|2|a'])
390
+
391
+ extractor = Traject::MarcExtractor.new("100|1*|a:100|2*|b")
392
+
393
+ values = extractor.extract(@record)
394
+
395
+ assert_equal ['100a first indicator 1', '100b first indicator 2'], values
396
+ end
397
+
398
+ end
399
+
400
+ describe "MarcExtractor::Spec" do
401
+ describe "==" do
402
+ it "equals when equal" do
403
+ assert_equal Traject::MarcExtractor::Spec.new(:subfields => %w{a b c}), Traject::MarcExtractor::Spec.new(:subfields => %w{a b c})
404
+ end
405
+ it "does not equal when not" do
406
+ refute_equal Traject::MarcExtractor::Spec.new(:subfields => %w{a b c}), Traject::MarcExtractor::Spec.new(:subfields => %w{a b c}, :indicator2 => '1')
407
+ end
408
+ end
409
+ end
410
+
411
+
412
+ end