traject 0.9.1 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "0.9.1"
2
+ VERSION = "0.10.0"
3
3
  end
@@ -0,0 +1,9 @@
1
+ require 'traject/line_writer'
2
+ require 'yaml'
3
+
4
+ class Traject::YamlWriter < Traject::LineWriter
5
+ def serialize(context)
6
+ context.output_hash.to_yaml(:indentation=>3, :line_width => 78) + "\n\n"
7
+ end
8
+ end
9
+
@@ -0,0 +1,38 @@
1
+ require 'test_helper'
2
+ require 'stringio'
3
+
4
+ require 'traject/debug_writer'
5
+ require 'traject'
6
+ require 'marc'
7
+
8
+ describe 'Simple output' do
9
+ before do
10
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
11
+ @indexer = Traject::Indexer.new
12
+ @indexer.instance_eval do
13
+ to_field "id", extract_marc("001", :first => true)
14
+ to_field "title", extract_marc("245ab")
15
+ end
16
+ @io = StringIO.new
17
+ @writer = Traject::DebugWriter.new("output_stream" => @io)
18
+
19
+ @id = "2710183"
20
+ @title = "Manufacturing consent : the political economy of the mass media /"
21
+ end
22
+
23
+ it "does a simple output" do
24
+ @writer.put Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))
25
+ expected = [
26
+ "#{@id} id #{@id}",
27
+ "#{@id} title #{@title}",
28
+ "\n"
29
+ ]
30
+ assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
31
+ @writer.close
32
+
33
+ end
34
+
35
+ end
36
+
37
+
38
+
@@ -7,13 +7,13 @@ describe "Traject::Indexer#each_record" do
7
7
 
8
8
  describe "checks arguments" do
9
9
  it "rejects no-arg block" do
10
- assert_raises(ArgumentError) do
10
+ assert_raises(Traject::Indexer::ArityError) do
11
11
  @indexer.each_record do
12
12
  end
13
13
  end
14
14
  end
15
15
  it "rejects three-arg block" do
16
- assert_raises(ArgumentError) do
16
+ assert_raises(Traject::Indexer::ArityError) do
17
17
  @indexer.each_record do |one, two, three|
18
18
  end
19
19
  end
@@ -30,5 +30,30 @@ describe "Traject::Indexer#each_record" do
30
30
  @indexer.each_record do |*variable|
31
31
  end
32
32
  end
33
+
34
+ it "finds first (only) field on each_record error" do
35
+ begin
36
+ @indexer.to_field('foo') {|one, two| }
37
+ @indexer.each_record {|one, two, three| } # bad arity
38
+ flunk("Should have rejected bad arity ")
39
+ rescue Traject::Indexer::ArityError => e
40
+ assert_match(/foo/, e.message)
41
+ rescue
42
+ flunk("Should only fail with a ArityError")
43
+ end
44
+ end
45
+
46
+ it "rejects each_record with a name (e.g., using a to_field syntax)" do
47
+ assert_raises(Traject::Indexer::NamingError) do
48
+ @indexer.each_record('bad_name') {|one, two| }
49
+ end
50
+ end
51
+
52
+ it "reject each_record with no arguments/blocks at all" do
53
+ assert_raises(ArgumentError) do
54
+ @indexer.each_record()
55
+ end
56
+ end
57
+
33
58
  end
34
59
  end
@@ -25,7 +25,18 @@ describe "Traject::Macros::Marc21Semantics" do
25
25
  end
26
26
  output = @indexer.map_record(@record)
27
27
 
28
- assert_equal %w{2710183 47971712}, output["oclcnum"]
28
+ assert_equal %w{47971712}, output["oclcnum"]
29
+ end
30
+
31
+ it "#marc_series_facet" do
32
+ @record = MARC::Reader.new(support_file_path "louis_armstrong.marc").to_a.first
33
+
34
+ @indexer.instance_eval do
35
+ to_field "series_facet", marc_series_facet
36
+ end
37
+ output = @indexer.map_record(@record)
38
+
39
+ assert_equal ["Big bands."], output["series_facet"]
29
40
  end
30
41
 
31
42
  describe "marc_sortable_author" do
@@ -114,8 +114,15 @@ describe "Traject::Indexer#settings" do
114
114
  assert_equal "new", settings["c"]
115
115
  end
116
116
 
117
- describe "defaults" do
118
-
117
+ describe "inspect" do
118
+ it "keeps keys ending in 'password' out of inspect" do
119
+ settings = Traject::Indexer::Settings.new("a" => "a",
120
+ "password" => "password", "some_password" => "password",
121
+ "some.password" => "password")
122
+
123
+ parsed = eval( settings.inspect )
124
+ assert_equal( {"a" => "a", "password" => "[hidden]", "some_password" => "[hidden]", "some.password" => "[hidden]"}, parsed)
125
+ end
119
126
  end
120
127
 
121
128
  end
@@ -6,20 +6,23 @@ describe "Traject::Indexer.to_field" do
6
6
  end
7
7
  describe "checks it's arguments" do
8
8
  it "rejects nil first arg" do
9
- assert_raises(ArgumentError) { @indexer.to_field(nil) }
9
+ assert_raises(Traject::Indexer::NamingError) { @indexer.to_field(nil) }
10
10
  end
11
11
  it "rejects empty string first arg" do
12
- assert_raises(ArgumentError) {@indexer.to_field("")}
12
+ assert_raises(Traject::Indexer::NamingError) {@indexer.to_field("")}
13
13
  end
14
+ it "rejects non-string first arg" do
15
+ assert_raises(Traject::Indexer::NamingError) {@indexer.to_field(:symbol)}
16
+ end
17
+
14
18
  it "rejects one-arg lambda" do
15
- assert_raises(ArgumentError) do
19
+ assert_raises(Traject::Indexer::ArityError) do
16
20
  @indexer.to_field("foo") do |one_arg|
17
-
18
21
  end
19
22
  end
20
23
  end
21
24
  it "rejects four-arg lambda" do
22
- assert_raises(ArgumentError) do
25
+ assert_raises(Traject::Indexer::ArityError) do
23
26
  @indexer.to_field("foo") do |one_arg, two_arg, three_arg, four_arg|
24
27
  end
25
28
  end
@@ -36,4 +39,31 @@ describe "Traject::Indexer.to_field" do
36
39
  end
37
40
  end
38
41
  end
42
+
43
+ describe "gives location in error message" do
44
+
45
+ it "finds no previous field on initial error" do
46
+ begin
47
+ @indexer.to_field('') {|one, two| } # bad field name
48
+ flunk("Should have rejected empty field name")
49
+ rescue Traject::Indexer::NamingError => e
50
+ assert_match(/no previous named fields/, e.message)
51
+ rescue
52
+ flunk("Should only fail with a NamingError")
53
+ end
54
+ end
55
+
56
+ it "finds first (only) field on error" do
57
+ begin
58
+ @indexer.to_field('foo') {|one, two| }
59
+ @indexer.to_field('') {|one, two| } # bad field name
60
+ flunk("Should have rejected empty field name")
61
+ rescue Traject::Indexer::NamingError => e
62
+ assert_match(/foo/, e.message)
63
+ rescue
64
+ flunk("Should only fail with a NamingError")
65
+ end
66
+ end
67
+ end
68
+
39
69
  end
@@ -54,6 +54,9 @@ describe "Marc4JReader" do
54
54
  # it's legal, it probably looks weird as a string literal
55
55
  # below, depending on your editor.
56
56
  assert_equal "Por uma outra globalização :", a245a
57
+
58
+ # Leader byte 9 should be set to 'a', the proper value for Unicode
59
+ assert_equal 'a', array.first.leader[9]
57
60
  end
58
61
 
59
62
 
@@ -50,6 +50,67 @@ describe "Traject::MarcExtractor" do
50
50
  assert_equal 5, parsed["005"][:bytes]
51
51
  assert_equal 7..10, parsed["008"][:bytes]
52
52
  end
53
+
54
+ it "allows arrays of specs" do
55
+ parsed = Traject::MarcExtractor.parse_string_spec %w(
56
+ 245abcde
57
+ 810
58
+ 700|*4|bcd
59
+ )
60
+ assert_length 3, parsed
61
+ end
62
+
63
+ it "allows mixture of array and colon-delimited specs" do
64
+ parsed = Traject::MarcExtractor.parse_string_spec %w(
65
+ 245abcde
66
+ 100:110:111
67
+ 810
68
+ 700|*4|bcd
69
+ )
70
+ assert_length 6, parsed
71
+ end
72
+
73
+
74
+ end
75
+
76
+ # Mostly an internal method, not necessarily API, but
77
+ # an important one, so we unit test some parts of it.
78
+ describe "#spec_covering_field" do
79
+ describe "for alternate script tags" do
80
+ before do
81
+ @record = MARC::Reader.new(support_file_path "hebrew880s.marc").to_a.first
82
+ @extractor = Traject::MarcExtractor.new("245")
83
+
84
+ @a245 = @record.fields.find {|f| f.tag == "245"}
85
+ assert ! @a245.nil?, "Found a 245 to test"
86
+
87
+ @a880_245 = @record.fields.find do |field|
88
+ (field.tag == "880") && field['6'] &&
89
+ "245" == field['6'].slice(0,3)
90
+ end
91
+ assert ! @a880_245.nil?, "Found an 880-245 to test"
92
+
93
+ @a880_100 = @record.fields.find do |field|
94
+ (field.tag == "880") && field['6'] &&
95
+ "100" == field['6'].slice(0,3)
96
+ end
97
+
98
+ assert ! @a880_100.nil?, "Found an 880-100 to test"
99
+ end
100
+ it "finds spec for relevant 880" do
101
+ assert_equal( {}, @extractor.spec_covering_field(@a880_245) )
102
+ assert_nil @extractor.spec_covering_field(@a880_100)
103
+ end
104
+ it "does not find spec for 880 if disabled" do
105
+ @extractor = Traject::MarcExtractor.new("245", :alternate_script => false)
106
+ assert_nil @extractor.spec_covering_field(@a880_245)
107
+ end
108
+ it "finds only 880 if so configured" do
109
+ @extractor = Traject::MarcExtractor.new("245", :alternate_script => :only)
110
+ assert_nil @extractor.spec_covering_field(@a245)
111
+ assert_equal({}, @extractor.spec_covering_field(@a880_245))
112
+ end
113
+ end
53
114
  end
54
115
 
55
116
  describe "#extract_by_spec" do
@@ -60,7 +121,7 @@ describe "Traject::MarcExtractor" do
60
121
  describe "extracts a basic case" do
61
122
  before do
62
123
  parsed_spec = Traject::MarcExtractor.parse_string_spec("700abcdef:856|*2|:505|1*|:245ba")
63
- @values = Traject::MarcExtractor.extract_by_spec(@record, parsed_spec)
124
+ @values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
64
125
  end
65
126
 
66
127
  it "returns an array" do
@@ -94,19 +155,19 @@ describe "Traject::MarcExtractor" do
94
155
  describe "extracts fixed fields" do
95
156
  it ", complete" do
96
157
  parsed_spec = Traject::MarcExtractor.parse_string_spec("001")
97
- values = Traject::MarcExtractor.extract_by_spec(@record, parsed_spec)
158
+ values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
98
159
 
99
160
  assert_equal ["2710183"], values
100
161
  end
101
162
  it ", single byte offset" do
102
163
  parsed_spec = Traject::MarcExtractor.parse_string_spec("008[5]")
103
- values = Traject::MarcExtractor.extract_by_spec(@record, parsed_spec)
164
+ values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
104
165
 
105
166
  assert_equal ["1"], values
106
167
  end
107
168
  it ", byte range" do
108
169
  parsed_spec = Traject::MarcExtractor.parse_string_spec("008[7-10]")
109
- values = Traject::MarcExtractor.extract_by_spec(@record, parsed_spec)
170
+ values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
110
171
 
111
172
  assert_equal ["2002"], values
112
173
  end
@@ -115,14 +176,14 @@ describe "Traject::MarcExtractor" do
115
176
  describe "seperator argument" do
116
177
  it "causes non-join when nil" do
117
178
  parsed_spec = Traject::MarcExtractor.parse_string_spec("245")
118
- values = Traject::MarcExtractor.extract_by_spec(@record, parsed_spec, :seperator => nil)
179
+ values = Traject::MarcExtractor.new(parsed_spec, :seperator => nil).extract(@record)
119
180
 
120
181
  assert_length 3, values
121
182
  end
122
183
 
123
184
  it "can be non-default" do
124
185
  parsed_spec = Traject::MarcExtractor.parse_string_spec("245")
125
- values = Traject::MarcExtractor.extract_by_spec(@record, parsed_spec, :seperator => "!! ")
186
+ values = Traject::MarcExtractor.new(parsed_spec, :seperator => "!! ").extract(@record)
126
187
 
127
188
  assert_length 1, values
128
189
  assert_equal "Manufacturing consent :!! the political economy of the mass media /!! Edward S. Herman and Noam Chomsky ; with a new introduction by the authors.", values.first
@@ -136,19 +197,19 @@ describe "Traject::MarcExtractor" do
136
197
  end
137
198
  it "from default :include" do
138
199
 
139
- values = Traject::MarcExtractor.extract_by_spec(@record, @parsed_spec)
200
+ values = Traject::MarcExtractor.new(@parsed_spec).extract(@record)
140
201
 
141
202
  assert_length 2, values # both the original and the 880
142
203
  assert_equal ["ben Marṭin Buber le-Aharon Daṿid Gordon /", "בין מרטין בובר לאהרן דוד גורדון /"], values
143
204
  end
144
205
  it "with :only" do
145
- values = Traject::MarcExtractor.extract_by_spec(@record, @parsed_spec, :alternate_script => :only)
206
+ values = Traject::MarcExtractor.new(@parsed_spec, :alternate_script => :only).extract(@record)
146
207
 
147
208
  assert_length 1, values
148
209
  assert_equal ["בין מרטין בובר לאהרן דוד גורדון /"], values
149
210
  end
150
211
  it "with false" do
151
- values = Traject::MarcExtractor.extract_by_spec(@record, @parsed_spec, :alternate_script => false)
212
+ values = Traject::MarcExtractor.new(@parsed_spec, :alternate_script => false).extract(@record)
152
213
 
153
214
  assert_length 1, values
154
215
  assert_equal ["ben Marṭin Buber le-Aharon Daṿid Gordon /"], values
@@ -156,22 +217,22 @@ describe "Traject::MarcExtractor" do
156
217
  end
157
218
 
158
219
  it "works with string second arg too" do
159
- values = Traject::MarcExtractor.extract_by_spec(@record, "245abc")
220
+ values = Traject::MarcExtractor.new("245abc").extract(@record)
160
221
 
161
222
  assert_length 1, values
162
223
  assert values.first.include?("Manufacturing consent"), "Extracted value includes title"
163
224
  end
164
225
 
165
226
  it "returns empty array if no matching tags" do
166
- values = Traject::MarcExtractor.extract_by_spec(@record, "999abc")
227
+ values = Traject::MarcExtractor.new("999abc").extract(@record)
167
228
  assert_equal [], values
168
229
 
169
- values = Traject::MarcExtractor.extract_by_spec(@record, "999")
230
+ values = Traject::MarcExtractor.new("999").extract(@record)
170
231
  assert_equal [], values
171
232
  end
172
233
 
173
- it "returns empty array if matching tag but no subfield" do
174
- values = Traject::MarcExtractor.extract_by_spec(@record, "245xyz")
234
+ it "returns empty array if matching tag but no subfield" do
235
+ values = Traject::MarcExtractor.new("245xyz").extract(@record)
175
236
  assert_equal [], values
176
237
  end
177
238
 
@@ -180,7 +241,7 @@ describe "Traject::MarcExtractor" do
180
241
  describe "with bad data" do
181
242
  it "can ignore an 880 with no $6" do
182
243
  @record = MARC::Reader.new(support_file_path "880_with_no_6.utf8.marc").to_a.first
183
- values = Traject::MarcExtractor.extract_by_spec(@record, "001")
244
+ values = Traject::MarcExtractor.new("001").extract(@record)
184
245
  assert_equal ["3468569"], values
185
246
  end
186
247
  end
@@ -188,11 +249,11 @@ describe "Traject::MarcExtractor" do
188
249
  describe "#each_matching_line" do
189
250
  before do
190
251
  @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
191
- @extractor = Traject::MarcExtractor.new(@record, "245abc")
252
+ @extractor = Traject::MarcExtractor.new("245abc")
192
253
  end
193
254
  it "yields two args" do
194
255
  called = false
195
- @extractor.each_matching_line do |field, spec|
256
+ @extractor.each_matching_line(@record) do |field, spec|
196
257
  called = true
197
258
  assert_kind_of MARC::DataField, field
198
259
  assert_kind_of Hash, spec
@@ -201,7 +262,7 @@ describe "Traject::MarcExtractor" do
201
262
  end
202
263
  it "yields three args" do
203
264
  called = false
204
- @extractor.each_matching_line do |field, spec, extractor|
265
+ @extractor.each_matching_line(@record) do |field, spec, extractor|
205
266
  called = true
206
267
  assert_kind_of MARC::DataField, field
207
268
  assert_kind_of Hash, spec
@@ -215,16 +276,29 @@ describe "Traject::MarcExtractor" do
215
276
  describe "#collect_matching_lines" do
216
277
  before do
217
278
  @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
218
- @extractor = Traject::MarcExtractor.new(@record, "245abc")
279
+ @extractor = Traject::MarcExtractor.new("245abc")
219
280
  end
220
281
  it "collects with custom block" do
221
- results = @extractor.collect_matching_lines do |field, spec, extractor|
282
+ results = @extractor.collect_matching_lines(@record) do |field, spec, extractor|
222
283
  extractor.collect_subfields(field, spec)
223
284
  end
224
285
  assert_equal ["Manufacturing consent : the political economy of the mass media / Edward S. Herman and Noam Chomsky ; with a new introduction by the authors."], results
225
286
  end
226
287
  end
227
288
 
289
+ describe "MarcExtractor.cached" do
290
+ it "creates" do
291
+ ext = Traject::MarcExtractor.cached("245abc", :seperator => nil)
292
+ assert_equal({"245"=>{:subfields=>["a", "b", "c"]}}, ext.spec_hash)
293
+ assert ext.options[:seperator].nil?, "extractor options[:seperator] is nil"
294
+ end
295
+ it "caches" do
296
+ ext1 = Traject::MarcExtractor.cached("245abc", :seperator => nil)
297
+ ext2 = Traject::MarcExtractor.cached("245abc", :seperator => nil)
298
+
299
+ assert_same ext1, ext2
300
+ end
301
+ end
228
302
 
229
303
 
230
304
  end
@@ -105,11 +105,14 @@ to_field "pub_date", marc_publication_date
105
105
 
106
106
  # LCC to broad class, start with built-in from marc record, but then do our own for local
107
107
  # call numbers.
108
- lcc_map = Traject::TranslationMap.new("lcc_top_level")
108
+ lcc_map = Traject::TranslationMap.new("lcc_top_level")
109
+ holdings_extractor = Traject::MarcExtractor.new("991:937")
110
+ sudoc_extractor = Traject::MarcExtractor.new("086a", :seperator =>nil)
111
+
109
112
  to_field "discipline_facet", marc_lcc_to_broad_category(:default => nil) do |record, accumulator|
110
113
  # add in our local call numbers
111
114
  accumulator.concat(
112
- Traject::MarcExtractor.new(record, "991:937").collect_matching_lines do |field, spec, extractor|
115
+ holdings_extractor.collect_matching_lines(record) do |field, spec, extractor|
113
116
  # we output call type 'processor' in subfield 'f' of our holdings
114
117
  # fields, that sort of maybe tells us if it's an LCC field.
115
118
  # When the data is right, which it often isn't.
@@ -130,7 +133,7 @@ to_field "discipline_facet", marc_lcc_to_broad_category(:default => nil) do |re
130
133
 
131
134
  # If it's got an 086, we'll put it in "Government Publication", to be
132
135
  # consistent with when we do that from a local SuDoc call #.
133
- if Traject::MarcExtractor.extract_by_spec(record, "086a", :seperator =>nil).length > 0
136
+ if sudoc_extractor.extract(record).length > 0
134
137
  accumulator << "Government Publication"
135
138
  end
136
139