traject 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "0.9.1"
2
+ VERSION = "0.10.0"
3
3
  end
@@ -0,0 +1,9 @@
1
+ require 'traject/line_writer'
2
+ require 'yaml'
3
+
4
+ class Traject::YamlWriter < Traject::LineWriter
5
+ def serialize(context)
6
+ context.output_hash.to_yaml(:indentation=>3, :line_width => 78) + "\n\n"
7
+ end
8
+ end
9
+
@@ -0,0 +1,38 @@
1
+ require 'test_helper'
2
+ require 'stringio'
3
+
4
+ require 'traject/debug_writer'
5
+ require 'traject'
6
+ require 'marc'
7
+
8
+ describe 'Simple output' do
9
+ before do
10
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
11
+ @indexer = Traject::Indexer.new
12
+ @indexer.instance_eval do
13
+ to_field "id", extract_marc("001", :first => true)
14
+ to_field "title", extract_marc("245ab")
15
+ end
16
+ @io = StringIO.new
17
+ @writer = Traject::DebugWriter.new("output_stream" => @io)
18
+
19
+ @id = "2710183"
20
+ @title = "Manufacturing consent : the political economy of the mass media /"
21
+ end
22
+
23
+ it "does a simple output" do
24
+ @writer.put Traject::Indexer::Context.new(:output_hash => @indexer.map_record(@record))
25
+ expected = [
26
+ "#{@id} id #{@id}",
27
+ "#{@id} title #{@title}",
28
+ "\n"
29
+ ]
30
+ assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
31
+ @writer.close
32
+
33
+ end
34
+
35
+ end
36
+
37
+
38
+
@@ -7,13 +7,13 @@ describe "Traject::Indexer#each_record" do
7
7
 
8
8
  describe "checks arguments" do
9
9
  it "rejects no-arg block" do
10
- assert_raises(ArgumentError) do
10
+ assert_raises(Traject::Indexer::ArityError) do
11
11
  @indexer.each_record do
12
12
  end
13
13
  end
14
14
  end
15
15
  it "rejects three-arg block" do
16
- assert_raises(ArgumentError) do
16
+ assert_raises(Traject::Indexer::ArityError) do
17
17
  @indexer.each_record do |one, two, three|
18
18
  end
19
19
  end
@@ -30,5 +30,30 @@ describe "Traject::Indexer#each_record" do
30
30
  @indexer.each_record do |*variable|
31
31
  end
32
32
  end
33
+
34
+ it "finds first (only) field on each_record error" do
35
+ begin
36
+ @indexer.to_field('foo') {|one, two| }
37
+ @indexer.each_record {|one, two, three| } # bad arity
38
+ flunk("Should have rejected bad arity ")
39
+ rescue Traject::Indexer::ArityError => e
40
+ assert_match(/foo/, e.message)
41
+ rescue
42
+ flunk("Should only fail with a ArityError")
43
+ end
44
+ end
45
+
46
+ it "rejects each_record with a name (e.g., using a to_field syntax)" do
47
+ assert_raises(Traject::Indexer::NamingError) do
48
+ @indexer.each_record('bad_name') {|one, two| }
49
+ end
50
+ end
51
+
52
+ it "reject each_record with no arguments/blocks at all" do
53
+ assert_raises(ArgumentError) do
54
+ @indexer.each_record()
55
+ end
56
+ end
57
+
33
58
  end
34
59
  end
@@ -25,7 +25,18 @@ describe "Traject::Macros::Marc21Semantics" do
25
25
  end
26
26
  output = @indexer.map_record(@record)
27
27
 
28
- assert_equal %w{2710183 47971712}, output["oclcnum"]
28
+ assert_equal %w{47971712}, output["oclcnum"]
29
+ end
30
+
31
+ it "#marc_series_facet" do
32
+ @record = MARC::Reader.new(support_file_path "louis_armstrong.marc").to_a.first
33
+
34
+ @indexer.instance_eval do
35
+ to_field "series_facet", marc_series_facet
36
+ end
37
+ output = @indexer.map_record(@record)
38
+
39
+ assert_equal ["Big bands."], output["series_facet"]
29
40
  end
30
41
 
31
42
  describe "marc_sortable_author" do
@@ -114,8 +114,15 @@ describe "Traject::Indexer#settings" do
114
114
  assert_equal "new", settings["c"]
115
115
  end
116
116
 
117
- describe "defaults" do
118
-
117
+ describe "inspect" do
118
+ it "keeps keys ending in 'password' out of inspect" do
119
+ settings = Traject::Indexer::Settings.new("a" => "a",
120
+ "password" => "password", "some_password" => "password",
121
+ "some.password" => "password")
122
+
123
+ parsed = eval( settings.inspect )
124
+ assert_equal( {"a" => "a", "password" => "[hidden]", "some_password" => "[hidden]", "some.password" => "[hidden]"}, parsed)
125
+ end
119
126
  end
120
127
 
121
128
  end
@@ -6,20 +6,23 @@ describe "Traject::Indexer.to_field" do
6
6
  end
7
7
  describe "checks it's arguments" do
8
8
  it "rejects nil first arg" do
9
- assert_raises(ArgumentError) { @indexer.to_field(nil) }
9
+ assert_raises(Traject::Indexer::NamingError) { @indexer.to_field(nil) }
10
10
  end
11
11
  it "rejects empty string first arg" do
12
- assert_raises(ArgumentError) {@indexer.to_field("")}
12
+ assert_raises(Traject::Indexer::NamingError) {@indexer.to_field("")}
13
13
  end
14
+ it "rejects non-string first arg" do
15
+ assert_raises(Traject::Indexer::NamingError) {@indexer.to_field(:symbol)}
16
+ end
17
+
14
18
  it "rejects one-arg lambda" do
15
- assert_raises(ArgumentError) do
19
+ assert_raises(Traject::Indexer::ArityError) do
16
20
  @indexer.to_field("foo") do |one_arg|
17
-
18
21
  end
19
22
  end
20
23
  end
21
24
  it "rejects four-arg lambda" do
22
- assert_raises(ArgumentError) do
25
+ assert_raises(Traject::Indexer::ArityError) do
23
26
  @indexer.to_field("foo") do |one_arg, two_arg, three_arg, four_arg|
24
27
  end
25
28
  end
@@ -36,4 +39,31 @@ describe "Traject::Indexer.to_field" do
36
39
  end
37
40
  end
38
41
  end
42
+
43
+ describe "gives location in error message" do
44
+
45
+ it "finds no previous field on initial error" do
46
+ begin
47
+ @indexer.to_field('') {|one, two| } # bad field name
48
+ flunk("Should have rejected empty field name")
49
+ rescue Traject::Indexer::NamingError => e
50
+ assert_match(/no previous named fields/, e.message)
51
+ rescue
52
+ flunk("Should only fail with a NamingError")
53
+ end
54
+ end
55
+
56
+ it "finds first (only) field on error" do
57
+ begin
58
+ @indexer.to_field('foo') {|one, two| }
59
+ @indexer.to_field('') {|one, two| } # bad field name
60
+ flunk("Should have rejected empty field name")
61
+ rescue Traject::Indexer::NamingError => e
62
+ assert_match(/foo/, e.message)
63
+ rescue
64
+ flunk("Should only fail with a NamingError")
65
+ end
66
+ end
67
+ end
68
+
39
69
  end
@@ -54,6 +54,9 @@ describe "Marc4JReader" do
54
54
  # it's legal, it probably looks weird as a string literal
55
55
  # below, depending on your editor.
56
56
  assert_equal "Por uma outra globalização :", a245a
57
+
58
+ # Leader byte 9 should be set to 'a', the proper value for Unicode
59
+ assert_equal 'a', array.first.leader[9]
57
60
  end
58
61
 
59
62
 
@@ -50,6 +50,67 @@ describe "Traject::MarcExtractor" do
50
50
  assert_equal 5, parsed["005"][:bytes]
51
51
  assert_equal 7..10, parsed["008"][:bytes]
52
52
  end
53
+
54
+ it "allows arrays of specs" do
55
+ parsed = Traject::MarcExtractor.parse_string_spec %w(
56
+ 245abcde
57
+ 810
58
+ 700|*4|bcd
59
+ )
60
+ assert_length 3, parsed
61
+ end
62
+
63
+ it "allows mixture of array and colon-delimited specs" do
64
+ parsed = Traject::MarcExtractor.parse_string_spec %w(
65
+ 245abcde
66
+ 100:110:111
67
+ 810
68
+ 700|*4|bcd
69
+ )
70
+ assert_length 6, parsed
71
+ end
72
+
73
+
74
+ end
75
+
76
+ # Mostly an internal method, not necessarily API, but
77
+ # an important one, so we unit test some parts of it.
78
+ describe "#spec_covering_field" do
79
+ describe "for alternate script tags" do
80
+ before do
81
+ @record = MARC::Reader.new(support_file_path "hebrew880s.marc").to_a.first
82
+ @extractor = Traject::MarcExtractor.new("245")
83
+
84
+ @a245 = @record.fields.find {|f| f.tag == "245"}
85
+ assert ! @a245.nil?, "Found a 245 to test"
86
+
87
+ @a880_245 = @record.fields.find do |field|
88
+ (field.tag == "880") && field['6'] &&
89
+ "245" == field['6'].slice(0,3)
90
+ end
91
+ assert ! @a880_245.nil?, "Found an 880-245 to test"
92
+
93
+ @a880_100 = @record.fields.find do |field|
94
+ (field.tag == "880") && field['6'] &&
95
+ "100" == field['6'].slice(0,3)
96
+ end
97
+
98
+ assert ! @a880_100.nil?, "Found an 880-100 to test"
99
+ end
100
+ it "finds spec for relevant 880" do
101
+ assert_equal( {}, @extractor.spec_covering_field(@a880_245) )
102
+ assert_nil @extractor.spec_covering_field(@a880_100)
103
+ end
104
+ it "does not find spec for 880 if disabled" do
105
+ @extractor = Traject::MarcExtractor.new("245", :alternate_script => false)
106
+ assert_nil @extractor.spec_covering_field(@a880_245)
107
+ end
108
+ it "finds only 880 if so configured" do
109
+ @extractor = Traject::MarcExtractor.new("245", :alternate_script => :only)
110
+ assert_nil @extractor.spec_covering_field(@a245)
111
+ assert_equal({}, @extractor.spec_covering_field(@a880_245))
112
+ end
113
+ end
53
114
  end
54
115
 
55
116
  describe "#extract_by_spec" do
@@ -60,7 +121,7 @@ describe "Traject::MarcExtractor" do
60
121
  describe "extracts a basic case" do
61
122
  before do
62
123
  parsed_spec = Traject::MarcExtractor.parse_string_spec("700abcdef:856|*2|:505|1*|:245ba")
63
- @values = Traject::MarcExtractor.extract_by_spec(@record, parsed_spec)
124
+ @values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
64
125
  end
65
126
 
66
127
  it "returns an array" do
@@ -94,19 +155,19 @@ describe "Traject::MarcExtractor" do
94
155
  describe "extracts fixed fields" do
95
156
  it ", complete" do
96
157
  parsed_spec = Traject::MarcExtractor.parse_string_spec("001")
97
- values = Traject::MarcExtractor.extract_by_spec(@record, parsed_spec)
158
+ values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
98
159
 
99
160
  assert_equal ["2710183"], values
100
161
  end
101
162
  it ", single byte offset" do
102
163
  parsed_spec = Traject::MarcExtractor.parse_string_spec("008[5]")
103
- values = Traject::MarcExtractor.extract_by_spec(@record, parsed_spec)
164
+ values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
104
165
 
105
166
  assert_equal ["1"], values
106
167
  end
107
168
  it ", byte range" do
108
169
  parsed_spec = Traject::MarcExtractor.parse_string_spec("008[7-10]")
109
- values = Traject::MarcExtractor.extract_by_spec(@record, parsed_spec)
170
+ values = Traject::MarcExtractor.new(parsed_spec).extract(@record)
110
171
 
111
172
  assert_equal ["2002"], values
112
173
  end
@@ -115,14 +176,14 @@ describe "Traject::MarcExtractor" do
115
176
  describe "seperator argument" do
116
177
  it "causes non-join when nil" do
117
178
  parsed_spec = Traject::MarcExtractor.parse_string_spec("245")
118
- values = Traject::MarcExtractor.extract_by_spec(@record, parsed_spec, :seperator => nil)
179
+ values = Traject::MarcExtractor.new(parsed_spec, :seperator => nil).extract(@record)
119
180
 
120
181
  assert_length 3, values
121
182
  end
122
183
 
123
184
  it "can be non-default" do
124
185
  parsed_spec = Traject::MarcExtractor.parse_string_spec("245")
125
- values = Traject::MarcExtractor.extract_by_spec(@record, parsed_spec, :seperator => "!! ")
186
+ values = Traject::MarcExtractor.new(parsed_spec, :seperator => "!! ").extract(@record)
126
187
 
127
188
  assert_length 1, values
128
189
  assert_equal "Manufacturing consent :!! the political economy of the mass media /!! Edward S. Herman and Noam Chomsky ; with a new introduction by the authors.", values.first
@@ -136,19 +197,19 @@ describe "Traject::MarcExtractor" do
136
197
  end
137
198
  it "from default :include" do
138
199
 
139
- values = Traject::MarcExtractor.extract_by_spec(@record, @parsed_spec)
200
+ values = Traject::MarcExtractor.new(@parsed_spec).extract(@record)
140
201
 
141
202
  assert_length 2, values # both the original and the 880
142
203
  assert_equal ["ben Marṭin Buber le-Aharon Daṿid Gordon /", "בין מרטין בובר לאהרן דוד גורדון /"], values
143
204
  end
144
205
  it "with :only" do
145
- values = Traject::MarcExtractor.extract_by_spec(@record, @parsed_spec, :alternate_script => :only)
206
+ values = Traject::MarcExtractor.new(@parsed_spec, :alternate_script => :only).extract(@record)
146
207
 
147
208
  assert_length 1, values
148
209
  assert_equal ["בין מרטין בובר לאהרן דוד גורדון /"], values
149
210
  end
150
211
  it "with false" do
151
- values = Traject::MarcExtractor.extract_by_spec(@record, @parsed_spec, :alternate_script => false)
212
+ values = Traject::MarcExtractor.new(@parsed_spec, :alternate_script => false).extract(@record)
152
213
 
153
214
  assert_length 1, values
154
215
  assert_equal ["ben Marṭin Buber le-Aharon Daṿid Gordon /"], values
@@ -156,22 +217,22 @@ describe "Traject::MarcExtractor" do
156
217
  end
157
218
 
158
219
  it "works with string second arg too" do
159
- values = Traject::MarcExtractor.extract_by_spec(@record, "245abc")
220
+ values = Traject::MarcExtractor.new("245abc").extract(@record)
160
221
 
161
222
  assert_length 1, values
162
223
  assert values.first.include?("Manufacturing consent"), "Extracted value includes title"
163
224
  end
164
225
 
165
226
  it "returns empty array if no matching tags" do
166
- values = Traject::MarcExtractor.extract_by_spec(@record, "999abc")
227
+ values = Traject::MarcExtractor.new("999abc").extract(@record)
167
228
  assert_equal [], values
168
229
 
169
- values = Traject::MarcExtractor.extract_by_spec(@record, "999")
230
+ values = Traject::MarcExtractor.new("999").extract(@record)
170
231
  assert_equal [], values
171
232
  end
172
233
 
173
- it "returns empty array if matching tag but no subfield" do
174
- values = Traject::MarcExtractor.extract_by_spec(@record, "245xyz")
234
+ it "returns empty array if matching tag but no subfield" do
235
+ values = Traject::MarcExtractor.new("245xyz").extract(@record)
175
236
  assert_equal [], values
176
237
  end
177
238
 
@@ -180,7 +241,7 @@ describe "Traject::MarcExtractor" do
180
241
  describe "with bad data" do
181
242
  it "can ignore an 880 with no $6" do
182
243
  @record = MARC::Reader.new(support_file_path "880_with_no_6.utf8.marc").to_a.first
183
- values = Traject::MarcExtractor.extract_by_spec(@record, "001")
244
+ values = Traject::MarcExtractor.new("001").extract(@record)
184
245
  assert_equal ["3468569"], values
185
246
  end
186
247
  end
@@ -188,11 +249,11 @@ describe "Traject::MarcExtractor" do
188
249
  describe "#each_matching_line" do
189
250
  before do
190
251
  @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
191
- @extractor = Traject::MarcExtractor.new(@record, "245abc")
252
+ @extractor = Traject::MarcExtractor.new("245abc")
192
253
  end
193
254
  it "yields two args" do
194
255
  called = false
195
- @extractor.each_matching_line do |field, spec|
256
+ @extractor.each_matching_line(@record) do |field, spec|
196
257
  called = true
197
258
  assert_kind_of MARC::DataField, field
198
259
  assert_kind_of Hash, spec
@@ -201,7 +262,7 @@ describe "Traject::MarcExtractor" do
201
262
  end
202
263
  it "yields three args" do
203
264
  called = false
204
- @extractor.each_matching_line do |field, spec, extractor|
265
+ @extractor.each_matching_line(@record) do |field, spec, extractor|
205
266
  called = true
206
267
  assert_kind_of MARC::DataField, field
207
268
  assert_kind_of Hash, spec
@@ -215,16 +276,29 @@ describe "Traject::MarcExtractor" do
215
276
  describe "#collect_matching_lines" do
216
277
  before do
217
278
  @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
218
- @extractor = Traject::MarcExtractor.new(@record, "245abc")
279
+ @extractor = Traject::MarcExtractor.new("245abc")
219
280
  end
220
281
  it "collects with custom block" do
221
- results = @extractor.collect_matching_lines do |field, spec, extractor|
282
+ results = @extractor.collect_matching_lines(@record) do |field, spec, extractor|
222
283
  extractor.collect_subfields(field, spec)
223
284
  end
224
285
  assert_equal ["Manufacturing consent : the political economy of the mass media / Edward S. Herman and Noam Chomsky ; with a new introduction by the authors."], results
225
286
  end
226
287
  end
227
288
 
289
+ describe "MarcExtractor.cached" do
290
+ it "creates" do
291
+ ext = Traject::MarcExtractor.cached("245abc", :seperator => nil)
292
+ assert_equal({"245"=>{:subfields=>["a", "b", "c"]}}, ext.spec_hash)
293
+ assert ext.options[:seperator].nil?, "extractor options[:seperator] is nil"
294
+ end
295
+ it "caches" do
296
+ ext1 = Traject::MarcExtractor.cached("245abc", :seperator => nil)
297
+ ext2 = Traject::MarcExtractor.cached("245abc", :seperator => nil)
298
+
299
+ assert_same ext1, ext2
300
+ end
301
+ end
228
302
 
229
303
 
230
304
  end
@@ -105,11 +105,14 @@ to_field "pub_date", marc_publication_date
105
105
 
106
106
  # LCC to broad class, start with built-in from marc record, but then do our own for local
107
107
  # call numbers.
108
- lcc_map = Traject::TranslationMap.new("lcc_top_level")
108
+ lcc_map = Traject::TranslationMap.new("lcc_top_level")
109
+ holdings_extractor = Traject::MarcExtractor.new("991:937")
110
+ sudoc_extractor = Traject::MarcExtractor.new("086a", :seperator =>nil)
111
+
109
112
  to_field "discipline_facet", marc_lcc_to_broad_category(:default => nil) do |record, accumulator|
110
113
  # add in our local call numbers
111
114
  accumulator.concat(
112
- Traject::MarcExtractor.new(record, "991:937").collect_matching_lines do |field, spec, extractor|
115
+ holdings_extractor.collect_matching_lines(record) do |field, spec, extractor|
113
116
  # we output call type 'processor' in subfield 'f' of our holdings
114
117
  # fields, that sort of maybe tells us if it's an LCC field.
115
118
  # When the data is right, which it often isn't.
@@ -130,7 +133,7 @@ to_field "discipline_facet", marc_lcc_to_broad_category(:default => nil) do |re
130
133
 
131
134
  # If it's got an 086, we'll put it in "Government Publication", to be
132
135
  # consistent with when we do that from a local SuDoc call #.
133
- if Traject::MarcExtractor.extract_by_spec(record, "086a", :seperator =>nil).length > 0
136
+ if sudoc_extractor.extract(record).length > 0
134
137
  accumulator << "Government Publication"
135
138
  end
136
139