eccsv 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8dd728098ee2f3066326be199f2657c34b33facd
4
+ data.tar.gz: 0472be8f00b3f6c9079cc20808bfd89804ff2212
5
+ SHA512:
6
+ metadata.gz: 9238b550d38a2766e5c53ca152855dfc7f782cc79c98dad60030095138e4a40a68655b14f22c3e376214901294ad81ad3e835e9091c2ee9d810c302e3f744ff3
7
+ data.tar.gz: 12599cfb6ae428dc134dd67c64fac344fa2a0306ba28de4e5f7a183b2754780d5c3e865ead20f6e20e2979b076e49e13ca83bbd11eb51ddf03587839d648b693
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in csv_parser.gemspec
4
+ gemspec
5
+
6
+ gem 'byebug'
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Vanderbilt University
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,113 @@
1
+ # ECCSV
2
+
3
+ ECCSV (error correcting comma separated values) is a CSV parsing library with
4
+ advanced error reporting and correcting.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ gem 'eccsv'
11
+
12
+ And then execute:
13
+
14
+ $ bundle
15
+
16
+ Or install it yourself as:
17
+
18
+ $ gem install eccsv
19
+
20
+ ## Basic Usage
21
+
22
+ ```ruby
23
+ require 'eccsv'
24
+
25
+ data = <<EOF
26
+ foo,bar
27
+ baz,qux
28
+ EOF
29
+
30
+ parser = ECCSV::Parser.new
31
+ parser.parse(data) #=> [["foo", "bar"], ["baz", "qux"]]
32
+ ```
33
+
34
+ ## Errors
35
+
36
+ One of the goals of this project is to give you descriptive error messages.
37
+
38
+ Each error type is a subclass of `ECCSV::Error` and contains the exact line
39
+ number (via `Error#line`) and column number (via `Error#col`) where the error
40
+ took place.
41
+
42
+ * missing closing quote (`ECCSV::UnmatchedQuoteError`)
43
+ * quote in the wrong place (`ECCSV::StrayQuoteError`)
44
+ * rows with not enough fields (`ECCSV::MissingFieldsError`)
45
+ * rows with too many fields (`ECCSV::ExtraFieldsError`)
46
+
47
+ Since missing/extra fields do not cause the CSV to be unparsable, they are
48
+ treated as warnings instead of errors (see example below).
49
+
50
+ ### Examples
51
+
52
+ #### Unmatched quote
53
+
54
+ If there was an error, `#parse` will return `nil` and set `#error`.
55
+
56
+ ```ruby
57
+ require 'eccsv'
58
+
59
+ data = <<EOF
60
+ foo,"bar
61
+ baz,qux
62
+ EOF
63
+
64
+ parser = ECCSV::Parser.new
65
+ parser.parse(data) #=> nil
66
+ parser.error #=> #<ECCSV::UnmatchedQuoteError: unmatched quote at line 1, column 5>
67
+ parser.error.line #=> 1
68
+ parser.error.col #=> 5
69
+ ```
70
+
71
+ #### Missing fields
72
+
73
+ If there was a warning, `#parse` will return the records and add to `#warnings`.
74
+
75
+ ```ruby
76
+ require 'eccsv'
77
+
78
+ data = <<EOF
79
+ foo,bar
80
+ baz
81
+ EOF
82
+
83
+ parser = ECCSV::Parser.new
84
+ parser.parse(data) #=> [["foo", "bar"], ["baz"]]
85
+ parser.warnings #=> [#<ECCSV::MissingFieldsError: expected 1 more fields on line 2>]
86
+ parser.warnings[0].line #=> 2
87
+ parser.warnings[0].col #=> 4
88
+ ```
89
+
90
+ #### Extra fields
91
+
92
+ ```ruby
93
+ require 'eccsv'
94
+
95
+ data = <<EOF
96
+ foo
97
+ bar,baz
98
+ EOF
99
+
100
+ parser = ECCSV::Parser.new
101
+ parser.parse(data) #=> [["foo"], ["bar", "baz"]]
102
+ parser.warnings #=> [#<ECCSV::ExtraFieldsError: 1 extra fields found on line 2, column 4>]
103
+ parser.warnings[0].line #=> 2
104
+ parser.warnings[0].col #=> 4
105
+ ```
106
+
107
+ ## Contributing
108
+
109
+ 1. Fork it
110
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
111
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
112
+ 4. Push to the branch (`git push origin my-new-feature`)
113
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,16 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.libs << "test"
6
+ t.pattern = 'test/**/test*.rb'
7
+ end
8
+ task :test => :racc
9
+ task :default => :test
10
+
11
+ desc "Compile racc grammar"
12
+ task :racc => "lib/eccsv/parser.rb"
13
+
14
# Regenerate the parser from the racc grammar whenever the grammar is newer.
# `sh` (Rake's FileUtils helper) is used instead of Kernel#system so that a
# failing racc run aborts the build instead of being silently ignored, and the
# argument-list form avoids passing the paths through a shell.
file "lib/eccsv/parser.rb" => "lib/eccsv/parser.y" do |t|
  sh "racc", "-v", "-o", t.name, t.source
end
data/eccsv.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'eccsv/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "eccsv"
8
+ spec.version = ECCSV::VERSION
9
+ spec.authors = ["Jeremy Stephens"]
10
+ spec.email = ["jeremy.f.stephens@vanderbilt.edu"]
11
+ spec.description = %q{CSV library with advanced error reporting}
12
+ spec.summary = %q{CSV library with advanced error reporting}
13
+ spec.homepage = "https://github.com/coupler/eccsv"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.3"
22
+ spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "test-unit"
24
+ spec.add_development_dependency "racc"
25
+ end
@@ -0,0 +1,16 @@
1
module ECCSV
  # Base class for every problem reported by the parser.
  #
  # Inherits from StandardError (not Exception) so that a plain `rescue`
  # or `rescue => e` catches it if a caller chooses to raise one of these
  # objects; code that rescues ECCSV::Error explicitly is unaffected.
  class Error < StandardError
    # line:: line number the problem was detected on (may be nil)
    # col::  column number the problem was detected on (may be nil)
    attr_reader :line, :col

    # msg::  human-readable description, forwarded to StandardError
    # line:: optional line number associated with the problem
    # col::  optional column number associated with the problem
    def initialize(msg = nil, line = nil, col = nil)
      super(msg)
      @line = line
      @col = col
    end
  end

  # A quoted field was opened but never closed.
  class UnmatchedQuoteError < Error; end

  # A quote character appeared where the grammar does not allow one.
  class StrayQuoteError < Error; end

  # A record has fewer fields than the first record (reported as a warning).
  class MissingFieldsError < Error; end

  # A record has more fields than the first record (reported as a warning).
  class ExtraFieldsError < Error; end
end
@@ -0,0 +1,465 @@
1
+ #
2
+ # DO NOT MODIFY!!!!
3
+ # This file is automatically generated by Racc 1.4.12
4
+ # from Racc grammer file "".
5
+ #
6
+
7
+ require 'racc/parser.rb'
8
+
9
+ require 'strscan'
10
+
11
+ module ECCSV
12
+ class Parser < Racc::Parser
13
+
14
+ module_eval(<<'...end parser.y/module_eval...', 'parser.y', 36)
15
+ class Node
16
+ attr_reader :value, :token, :line, :col
17
+
18
+ def initialize(value = "", token = nil, line = nil, col = nil)
19
+ @value = value
20
+ @token = token
21
+ @line = line
22
+ @col = col
23
+ end
24
+ end
25
+
26
+ class ParentNode < Node
27
+ def initialize(children = [], line = nil, col = nil)
28
+ last = children.last
29
+ if last && last.is_a?(Node)
30
+ line = last.line
31
+ col = last.col
32
+ end
33
+ super(nil, nil, line, col)
34
+ @children = children
35
+ end
36
+ end
37
+
38
+ class QuotedTextNode < ParentNode
39
+ def value
40
+ @value ||= @children.collect(&:value).join
41
+ end
42
+ end
43
+
44
+ class FieldNode < ParentNode
45
+ def value
46
+ @value ||=
47
+ if @children[0].token == :TEXT
48
+ @children[0].value
49
+ else
50
+ # quoted text
51
+ @children[1].value
52
+ end
53
+ end
54
+ end
55
+
56
+ class DelimFieldNode < ParentNode
57
+ def value
58
+ @value ||= @children[0].value
59
+ end
60
+ end
61
+
62
+ class DelimFieldsNode < ParentNode
63
+ def value
64
+ @value ||=
65
+ if @children.empty?
66
+ []
67
+ else
68
+ @children[0].value + [@children[1].value]
69
+ end
70
+ end
71
+ end
72
+
73
+ class RecordNode < ParentNode
74
+ def value
75
+ # TODO: 'consume' children to produce value to reduce memory footprint
76
+ @value ||= @children[0].value + [@children[1].value]
77
+ end
78
+ end
79
+
80
+ class DelimRecordNode < ParentNode
81
+ def value
82
+ @value ||= @children.length == 1 ? [] : @children[0].value
83
+ end
84
+ end
85
+
86
+ class DelimRecordsNode < ParentNode
87
+ def value
88
+ if @value.nil?
89
+ if @children.empty?
90
+ @value = []
91
+ else
92
+ @value = @children[0].value
93
+ val = @children[1].value
94
+ if !val.empty?
95
+ @value += [val]
96
+ end
97
+ end
98
+ end
99
+ @value
100
+ end
101
+ end
102
+
103
+ class RootNode < ParentNode
104
+ def value
105
+ if @value.nil?
106
+ @value = @children[0].value
107
+ if @children[1]
108
+ @value += [@children[1].value]
109
+ end
110
+ end
111
+ @value
112
+ end
113
+ end
114
+
115
+ attr_reader :error
116
+
117
+ def parse(str)
118
+ @scanner = StringScanner.new(str)
119
+ @line = 1
120
+ @col = 1
121
+ do_parse
122
+ end
123
+
124
+ def next_token
125
+ until @scanner.empty?
126
+ next_line = @line
127
+ next_col = @col
128
+ case
129
+ when match = @scanner.scan(/,/)
130
+ token = :COMMA
131
+ when match = @scanner.scan(/"/)
132
+ token = :QUOTE
133
+ when match = @scanner.scan(/\n/)
134
+ token = :NEWLINE
135
+ next_line += 1
136
+ next_col = 0
137
+ when match = @scanner.scan(/[^,\n"]+/)
138
+ token = :TEXT
139
+ else
140
+ raise "can't recognize <#{@scanner.peek(5)}>"
141
+ end
142
+ next_col += match.length
143
+
144
+ value = node(match, token)
145
+ @line = next_line
146
+ @col = next_col
147
+
148
+ return [token, value]
149
+ end
150
+ end
151
+
152
+ def warnings
153
+ @warnings ||= []
154
+ end
155
+
156
+ private
157
+
158
+ def node(value = "", token = nil, line = @line, col = @col)
159
+ Node.new(value, token, line, col)
160
+ end
161
+
162
+ def quoted_text(children = [], line = @line, col = @col)
163
+ QuotedTextNode.new(children, line, col)
164
+ end
165
+
166
+ def field(children = [], line = @line, col = @col)
167
+ FieldNode.new(children, line, col)
168
+ end
169
+
170
+ def delim_field(children = [], line = @line, col = @col)
171
+ DelimFieldNode.new(children, line, col)
172
+ end
173
+
174
+ def delim_fields(children = [], line = @line, col = @col)
175
+ DelimFieldsNode.new(children, line, col)
176
+ end
177
+
178
+ def record(children = [], line = @line, col = @col)
179
+ record = RecordNode.new(children, line, col)
180
+ value = record.value
181
+ if defined? @num_fields
182
+ first = children[0]
183
+ line = first.line
184
+ col = first.col
185
+ if @num_fields > value.length
186
+ msg = "expected %d more fields on line %d" % [@num_fields - value.length, line]
187
+ self.warnings.push(MissingFieldsError.new(msg, line, col))
188
+ elsif @num_fields < value.length
189
+ msg = "%d extra fields found on line %d, column %d" % [value.length - @num_fields, line, col]
190
+ self.warnings.push(ExtraFieldsError.new(msg, line, col))
191
+ end
192
+ else
193
+ @num_fields = value.length
194
+ end
195
+
196
+ record
197
+ end
198
+
199
+ def delim_record(children = [], line = @line, col = @col)
200
+ DelimRecordNode.new(children, line, col)
201
+ end
202
+
203
+ def delim_records(children = [], line = @line, col = @col)
204
+ DelimRecordsNode.new(children, line, col)
205
+ end
206
+
207
+ def root(children = [], line = @line, col = @col)
208
+ RootNode.new(children, line, col)
209
+ end
210
+
211
+ def on_error(t, val, stack)
212
+ #pp t
213
+ #pp val
214
+ #pp stack
215
+
216
+ # figure out what error we have
217
+ if t == 0
218
+ # unexpected EOF
219
+ type = nil
220
+ stack.reverse_each do |node|
221
+ case node
222
+ when QuotedTextNode
223
+ type = :unmatched_quote
224
+ when Node
225
+ if type == :unmatched_quote && node.token == :QUOTE
226
+ line = node.line
227
+ col = node.col
228
+ @error = UnmatchedQuoteError.new("unmatched quote at line #{line}, column #{col}", line, col)
229
+ end
230
+ end
231
+ end
232
+
233
+ if @error.nil?
234
+ @error = Error.new("unexpected EOF")
235
+ end
236
+ elsif val.is_a?(Node) && val.token == :QUOTE
237
+ line = val.line
238
+ col = val.col
239
+ @error = StrayQuoteError.new("stray quote at line #{line}, column #{col}", line, col)
240
+ end
241
+ end
242
+ ...end parser.y/module_eval...
243
+ ##### State transition tables begin ###
244
+
245
+ racc_action_table = [
246
+ 18, 17, 19, 16, -1, 9, 6, 13, 12, 8,
247
+ 14, 3 ]
248
+
249
+ racc_action_check = [
250
+ 15, 15, 15, 15, 2, 4, 2, 7, 7, 3,
251
+ 10, 1 ]
252
+
253
+ racc_action_pointer = [
254
+ nil, 11, 4, 9, 3, nil, nil, 3, nil, nil,
255
+ 7, nil, nil, nil, nil, -2, nil, nil, nil, nil ]
256
+
257
+ racc_action_default = [
258
+ -3, -17, -8, -17, -2, -4, -5, -17, 20, -6,
259
+ -7, -9, -13, -12, -10, -17, -11, -14, -15, -16 ]
260
+
261
+ racc_goto_table = [
262
+ 1, 2, 4, 5, 7, 10, 11, 15 ]
263
+
264
+ racc_goto_check = [
265
+ 1, 2, 3, 4, 5, 6, 7, 8 ]
266
+
267
+ racc_goto_pointer = [
268
+ nil, 0, 1, 0, 1, 2, -2, -1, -5 ]
269
+
270
+ racc_goto_default = [
271
+ nil, nil, nil, nil, nil, nil, nil, nil, nil ]
272
+
273
+ racc_reduce_table = [
274
+ 0, 0, :racc_error,
275
+ 1, 7, :_reduce_1,
276
+ 2, 7, :_reduce_2,
277
+ 0, 8, :_reduce_3,
278
+ 2, 8, :_reduce_4,
279
+ 1, 10, :_reduce_5,
280
+ 2, 10, :_reduce_6,
281
+ 2, 9, :_reduce_7,
282
+ 0, 11, :_reduce_8,
283
+ 2, 11, :_reduce_9,
284
+ 2, 13, :_reduce_10,
285
+ 3, 12, :_reduce_11,
286
+ 1, 12, :_reduce_12,
287
+ 0, 14, :_reduce_13,
288
+ 2, 14, :_reduce_14,
289
+ 2, 14, :_reduce_15,
290
+ 2, 14, :_reduce_16 ]
291
+
292
+ racc_reduce_n = 17
293
+
294
+ racc_shift_n = 20
295
+
296
+ racc_token_table = {
297
+ false => 0,
298
+ :error => 1,
299
+ :NEWLINE => 2,
300
+ :COMMA => 3,
301
+ :TEXT => 4,
302
+ :QUOTE => 5 }
303
+
304
+ racc_nt_base = 6
305
+
306
+ racc_use_result_var = true
307
+
308
+ Racc_arg = [
309
+ racc_action_table,
310
+ racc_action_check,
311
+ racc_action_default,
312
+ racc_action_pointer,
313
+ racc_goto_table,
314
+ racc_goto_check,
315
+ racc_goto_default,
316
+ racc_goto_pointer,
317
+ racc_nt_base,
318
+ racc_reduce_table,
319
+ racc_token_table,
320
+ racc_shift_n,
321
+ racc_reduce_n,
322
+ racc_use_result_var ]
323
+
324
+ Racc_token_to_s_table = [
325
+ "$end",
326
+ "error",
327
+ "NEWLINE",
328
+ "COMMA",
329
+ "TEXT",
330
+ "QUOTE",
331
+ "$start",
332
+ "root",
333
+ "delim_records",
334
+ "record",
335
+ "delim_record",
336
+ "delim_fields",
337
+ "field",
338
+ "delim_field",
339
+ "quoted_text" ]
340
+
341
+ Racc_debug_parser = true
342
+
343
+ ##### State transition tables end #####
344
+
345
+ # reduce 0 omitted
346
+
347
+ module_eval(<<'.,.,', 'parser.y', 4)
348
+ def _reduce_1(val, _values, result)
349
+ result = root(val).value
350
+ result
351
+ end
352
+ .,.,
353
+
354
+ module_eval(<<'.,.,', 'parser.y', 5)
355
+ def _reduce_2(val, _values, result)
356
+ result = root(val).value
357
+ result
358
+ end
359
+ .,.,
360
+
361
+ module_eval(<<'.,.,', 'parser.y', 7)
362
+ def _reduce_3(val, _values, result)
363
+ result = delim_records
364
+ result
365
+ end
366
+ .,.,
367
+
368
+ module_eval(<<'.,.,', 'parser.y', 8)
369
+ def _reduce_4(val, _values, result)
370
+ result = delim_records(val)
371
+ result
372
+ end
373
+ .,.,
374
+
375
+ module_eval(<<'.,.,', 'parser.y', 10)
376
+ def _reduce_5(val, _values, result)
377
+ result = delim_record(val)
378
+ result
379
+ end
380
+ .,.,
381
+
382
+ module_eval(<<'.,.,', 'parser.y', 11)
383
+ def _reduce_6(val, _values, result)
384
+ result = delim_record(val)
385
+ result
386
+ end
387
+ .,.,
388
+
389
+ module_eval(<<'.,.,', 'parser.y', 14)
390
+ def _reduce_7(val, _values, result)
391
+ result = record(val)
392
+ result
393
+ end
394
+ .,.,
395
+
396
+ module_eval(<<'.,.,', 'parser.y', 16)
397
+ def _reduce_8(val, _values, result)
398
+ result = delim_fields
399
+ result
400
+ end
401
+ .,.,
402
+
403
+ module_eval(<<'.,.,', 'parser.y', 17)
404
+ def _reduce_9(val, _values, result)
405
+ result = delim_fields(val)
406
+ result
407
+ end
408
+ .,.,
409
+
410
+ module_eval(<<'.,.,', 'parser.y', 19)
411
+ def _reduce_10(val, _values, result)
412
+ result = delim_field(val)
413
+ result
414
+ end
415
+ .,.,
416
+
417
+ module_eval(<<'.,.,', 'parser.y', 21)
418
+ def _reduce_11(val, _values, result)
419
+ result = field(val)
420
+ result
421
+ end
422
+ .,.,
423
+
424
+ module_eval(<<'.,.,', 'parser.y', 22)
425
+ def _reduce_12(val, _values, result)
426
+ result = field(val)
427
+ result
428
+ end
429
+ .,.,
430
+
431
+ module_eval(<<'.,.,', 'parser.y', 24)
432
+ def _reduce_13(val, _values, result)
433
+ result = quoted_text
434
+ result
435
+ end
436
+ .,.,
437
+
438
+ module_eval(<<'.,.,', 'parser.y', 25)
439
+ def _reduce_14(val, _values, result)
440
+ result = quoted_text(val)
441
+ result
442
+ end
443
+ .,.,
444
+
445
+ module_eval(<<'.,.,', 'parser.y', 26)
446
+ def _reduce_15(val, _values, result)
447
+ result = quoted_text(val)
448
+ result
449
+ end
450
+ .,.,
451
+
452
+ module_eval(<<'.,.,', 'parser.y', 27)
453
+ def _reduce_16(val, _values, result)
454
+ result = quoted_text(val)
455
+ result
456
+ end
457
+ .,.,
458
+
459
+ def _reduce_none(val, _values, result)
460
+ val[0]
461
+ end
462
+
463
+ end # class Parser
464
+
465
+ end
@@ -0,0 +1,264 @@
1
+ class Parser
2
+ token NEWLINE COMMA TEXT QUOTE
3
+
4
+ rule
5
+ root: delim_records { result = root(val).value }
6
+ | delim_records record { result = root(val).value }
7
+
8
+ delim_records: { result = delim_records }
9
+ | delim_records delim_record { result = delim_records(val) }
10
+
11
+ delim_record: NEWLINE { result = delim_record(val) }
12
+ | record NEWLINE { result = delim_record(val) }
13
+
14
+ # TODO: reduce record nodes
15
+ record: delim_fields field { result = record(val) }
16
+
17
+ delim_fields: { result = delim_fields }
18
+ | delim_fields delim_field { result = delim_fields(val) }
19
+
20
+ delim_field: field COMMA { result = delim_field(val) }
21
+
22
+ field: QUOTE quoted_text QUOTE { result = field(val) }
23
+ | TEXT { result = field(val) }
24
+
25
+ quoted_text: { result = quoted_text }
26
+ | quoted_text COMMA { result = quoted_text(val) }
27
+ | quoted_text NEWLINE { result = quoted_text(val) }
28
+ | quoted_text TEXT { result = quoted_text(val) }
29
+ end
30
+
31
+ ---- header
32
+ require 'strscan'
33
+
34
+ module ECCSV
35
+ ---- inner
36
+ class Node
37
+ attr_reader :value, :token, :line, :col
38
+
39
+ def initialize(value = "", token = nil, line = nil, col = nil)
40
+ @value = value
41
+ @token = token
42
+ @line = line
43
+ @col = col
44
+ end
45
+ end
46
+
47
+ class ParentNode < Node
48
+ def initialize(children = [], line = nil, col = nil)
49
+ last = children.last
50
+ if last && last.is_a?(Node)
51
+ line = last.line
52
+ col = last.col
53
+ end
54
+ super(nil, nil, line, col)
55
+ @children = children
56
+ end
57
+ end
58
+
59
+ class QuotedTextNode < ParentNode
60
+ def value
61
+ @value ||= @children.collect(&:value).join
62
+ end
63
+ end
64
+
65
+ class FieldNode < ParentNode
66
+ def value
67
+ @value ||=
68
+ if @children[0].token == :TEXT
69
+ @children[0].value
70
+ else
71
+ # quoted text
72
+ @children[1].value
73
+ end
74
+ end
75
+ end
76
+
77
+ class DelimFieldNode < ParentNode
78
+ def value
79
+ @value ||= @children[0].value
80
+ end
81
+ end
82
+
83
+ class DelimFieldsNode < ParentNode
84
+ def value
85
+ @value ||=
86
+ if @children.empty?
87
+ []
88
+ else
89
+ @children[0].value + [@children[1].value]
90
+ end
91
+ end
92
+ end
93
+
94
+ class RecordNode < ParentNode
95
+ def value
96
+ # TODO: 'consume' children to produce value to reduce memory footprint
97
+ @value ||= @children[0].value + [@children[1].value]
98
+ end
99
+ end
100
+
101
+ class DelimRecordNode < ParentNode
102
+ def value
103
+ @value ||= @children.length == 1 ? [] : @children[0].value
104
+ end
105
+ end
106
+
107
+ class DelimRecordsNode < ParentNode
108
+ def value
109
+ if @value.nil?
110
+ if @children.empty?
111
+ @value = []
112
+ else
113
+ @value = @children[0].value
114
+ val = @children[1].value
115
+ if !val.empty?
116
+ @value += [val]
117
+ end
118
+ end
119
+ end
120
+ @value
121
+ end
122
+ end
123
+
124
+ class RootNode < ParentNode
125
+ def value
126
+ if @value.nil?
127
+ @value = @children[0].value
128
+ if @children[1]
129
+ @value += [@children[1].value]
130
+ end
131
+ end
132
+ @value
133
+ end
134
+ end
135
+
136
+ attr_reader :error
137
+
138
# Parses +str+ and returns whatever the generated parser's do_parse returns.
#
# All per-document state is reset first so that one parser instance can be
# reused: without this, the error, the warnings and the expected field count
# of a previous document would leak into the next call.
#
# str:: the CSV text to parse
def parse(str)
  @scanner = StringScanner.new(str)
  @line = 1
  @col = 1

  # Forget results of any earlier parse on this instance.
  @error = nil
  @warnings = []
  # `record` checks `defined? @num_fields`, so the variable must be removed,
  # not merely set to nil.
  remove_instance_variable(:@num_fields) if defined?(@num_fields)

  do_parse
end
144
+
145
# Tokenizer callback used by the generated parser.
#
# Scans one token from @scanner and returns [token_symbol, node], where the
# node is built by #node before @line/@col are advanced. Returns nil once
# the input is exhausted.
#
# Tokens: COMMA (","), QUOTE ('"'), NEWLINE ("\n") and TEXT (a run of any
# other characters).
def next_token
  # eos? is the supported spelling; StringScanner#empty? is an obsolete alias.
  return nil if @scanner.eos?

  next_line = @line
  next_col = @col
  case
  when match = @scanner.scan(/,/)
    token = :COMMA
  when match = @scanner.scan(/"/)
    token = :QUOTE
  when match = @scanner.scan(/\n/)
    token = :NEWLINE
    next_line += 1
    # Reset to 0 so that adding the newline's own length below yields column 1.
    next_col = 0
  when match = @scanner.scan(/[^,\n"]+/)
    token = :TEXT
  else
    raise "can't recognize <#{@scanner.peek(5)}>"
  end
  next_col += match.length

  # Build the node first: it must carry the position where the token starts.
  value = node(match, token)
  @line = next_line
  @col = next_col

  [token, value]
end
172
+
173
+ def warnings
174
+ @warnings ||= []
175
+ end
176
+
177
+ private
178
+
179
+ def node(value = "", token = nil, line = @line, col = @col)
180
+ Node.new(value, token, line, col)
181
+ end
182
+
183
+ def quoted_text(children = [], line = @line, col = @col)
184
+ QuotedTextNode.new(children, line, col)
185
+ end
186
+
187
+ def field(children = [], line = @line, col = @col)
188
+ FieldNode.new(children, line, col)
189
+ end
190
+
191
+ def delim_field(children = [], line = @line, col = @col)
192
+ DelimFieldNode.new(children, line, col)
193
+ end
194
+
195
+ def delim_fields(children = [], line = @line, col = @col)
196
+ DelimFieldsNode.new(children, line, col)
197
+ end
198
+
199
+ def record(children = [], line = @line, col = @col)
200
+ record = RecordNode.new(children, line, col)
201
+ value = record.value
202
+ if defined? @num_fields
203
+ first = children[0]
204
+ line = first.line
205
+ col = first.col
206
+ if @num_fields > value.length
207
+ msg = "expected %d more fields on line %d" % [@num_fields - value.length, line]
208
+ self.warnings.push(MissingFieldsError.new(msg, line, col))
209
+ elsif @num_fields < value.length
210
+ msg = "%d extra fields found on line %d, column %d" % [value.length - @num_fields, line, col]
211
+ self.warnings.push(ExtraFieldsError.new(msg, line, col))
212
+ end
213
+ else
214
+ @num_fields = value.length
215
+ end
216
+
217
+ record
218
+ end
219
+
220
+ def delim_record(children = [], line = @line, col = @col)
221
+ DelimRecordNode.new(children, line, col)
222
+ end
223
+
224
+ def delim_records(children = [], line = @line, col = @col)
225
+ DelimRecordsNode.new(children, line, col)
226
+ end
227
+
228
+ def root(children = [], line = @line, col = @col)
229
+ RootNode.new(children, line, col)
230
+ end
231
+
232
# Error callback invoked by the Racc runtime when the current token cannot
# be accepted. Records a descriptive error object in @error instead of
# raising.
#
# t::     id of the offending token (0 means end of input)
# val::   value of the offending token (a Node produced by next_token, or
#         nil at end of input)
# stack:: the parser's value stack at the time of the error
#
# Every path sets @error, so a failed parse always has an error to report.
def on_error(t, val, stack)
  if t == 0
    # Unexpected end of input. Walk the value stack from the top: if we are
    # in the middle of quoted text, the raw QUOTE token underneath it is the
    # opening quote that was never closed.
    inside_quoted_text = false
    stack.reverse_each do |node|
      case node
      when QuotedTextNode
        inside_quoted_text = true
      when Node
        if inside_quoted_text && node.token == :QUOTE
          line = node.line
          col = node.col
          @error = UnmatchedQuoteError.new("unmatched quote at line #{line}, column #{col}", line, col)
        end
      end
    end

    # Any other premature EOF: report it at the current scanner position.
    @error ||= Error.new("unexpected EOF", @line, @col)
  elsif val.is_a?(Node) && val.token == :QUOTE
    line = val.line
    col = val.col
    @error = StrayQuoteError.new("stray quote at line #{line}, column #{col}", line, col)
  else
    # Any other unexpected token (for example text directly after a closing
    # quote). Previously this left @error unset.
    if val.is_a?(Node)
      line = val.line
      col = val.col
      what = val.token || "token"
    else
      line = @line
      col = @col
      what = "token"
    end
    @error = Error.new("unexpected #{what} at line #{line}, column #{col}", line, col)
  end
end
263
+ ---- footer
264
+ end
@@ -0,0 +1,3 @@
1
+ module ECCSV
2
+ VERSION = "0.0.1"
3
+ end
data/lib/eccsv.rb ADDED
@@ -0,0 +1,6 @@
1
+ require 'eccsv/version'
2
+ require 'eccsv/errors'
3
+ require 'eccsv/parser'
4
+
5
+ module ECCSV
6
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,16 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+
11
+ require 'test/unit'
12
+ require 'byebug'
13
+
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
16
+ require 'eccsv'
@@ -0,0 +1,230 @@
1
+ require 'helper'
2
+
3
+ class TestParser < Test::Unit::TestCase
4
+ def parse(string)
5
+ parser = ECCSV::Parser.new
6
+ parser.parse(string)
7
+ end
8
+
9
+ test "one record with two fields" do
10
+ assert_equal [['foo', 'bar']], parse("foo,bar")
11
+ end
12
+
13
+ test "one record with one field" do
14
+ assert_equal [['foo']], parse("foo")
15
+ end
16
+
17
+ test "empty records" do
18
+ assert_equal [], parse("")
19
+ end
20
+
21
+ test "empty record is skipped by default" do
22
+ assert_equal [['foo'], ['bar']], parse("foo\n\nbar")
23
+ end
24
+
25
+ test "skipping empty record at end" do
26
+ assert_equal [['foo'], ['bar']], parse("foo\nbar\n")
27
+ end
28
+
29
+ =begin
30
+ test "not skipping an empty record" do
31
+ parser = ECCSV::Parser.new
32
+ parser.skip_empty_record = false
33
+ result = parser.parse("foo\n\nbar")
34
+ assert_equal [['foo'], [], ['bar']], result.value
35
+ end
36
+
37
+ test "not skipping empty record at end" do
38
+ parser = ECCSV::Parser.new
39
+ parser.skip_empty_record = false
40
+ result = parser.parse("foo\nbar\n")
41
+ assert_equal [['foo'], ['bar'], []], result.value
42
+ end
43
+ =end
44
+
45
+ test "two records" do
46
+ assert_equal [['foo', 'bar'], ['baz', 'qux']], parse("foo,bar\nbaz,qux")
47
+ end
48
+
49
+ test "quoted field" do
50
+ assert_equal [["foo,bar"]], parse(%{"foo,bar"})
51
+ end
52
+
53
+ test "missing closing quote" do
54
+ parser = ECCSV::Parser.new
55
+ result = parser.parse(%{foo,bar\n"foo})
56
+ assert !result
57
+ assert_kind_of ECCSV::UnmatchedQuoteError, parser.error
58
+ assert_equal 2, parser.error.line
59
+ assert_equal 1, parser.error.col
60
+ end
61
+
62
+ test "quote inside unquoted field" do
63
+ parser = ECCSV::Parser.new
64
+ result = parser.parse(%{f"oo})
65
+ assert !result
66
+ assert_kind_of ECCSV::StrayQuoteError, parser.error
67
+ assert_equal 1, parser.error.line
68
+ assert_equal 2, parser.error.col
69
+ end
70
+
71
+ test "missing fields gets warning by default" do
72
+ parser = ECCSV::Parser.new
73
+ result = parser.parse(%{foo,bar\nbaz})
74
+ assert_equal [['foo', 'bar'], ['baz']], result
75
+ assert_equal 1, parser.warnings.length
76
+ warning = parser.warnings[0]
77
+ assert_kind_of ECCSV::MissingFieldsError, warning
78
+ assert_equal 2, warning.line
79
+ assert_equal 4, warning.col
80
+ end
81
+
82
+ =begin
83
+ test "missing fields when disallowed" do
84
+ parser = ECCSV::Parser.new
85
+ parser.allow_uneven_records = false
86
+ result = parser.parse(%{foo,bar\nbaz})
87
+ assert !result
88
+ assert_equal :missing_fields, parser.failure_type
89
+ end
90
+ =end
91
+
92
+ test "extra fields gets warning by default" do
93
+ parser = ECCSV::Parser.new
94
+ result = parser.parse(%{foo\nbar,baz})
95
+ assert_equal [['foo'], ['bar', 'baz']], result
96
+ assert_equal 1, parser.warnings.length
97
+ warning = parser.warnings[0]
98
+ assert_kind_of ECCSV::ExtraFieldsError, warning
99
+ assert_equal 2, warning.line
100
+ assert_equal 4, warning.col
101
+ end
102
+
103
+ =begin
104
+ test "extra fields when disallowed" do
105
+ parser = ECCSV::Parser.new
106
+ parser.allow_uneven_records = false
107
+ result = parser.parse(%{foo\nbar,baz})
108
+ assert !result
109
+ assert_equal :extra_fields, parser.failure_type
110
+ end
111
+
112
+ test "single-character custom field separator" do
113
+ parser = ECCSV::Parser.new
114
+ parser.field_sep = "\t"
115
+ result = parser.parse("foo\tbar")
116
+ assert result, parser.failure_reason
117
+ assert_equal [['foo', 'bar']], result.value
118
+ end
119
+
120
+ test "multi-character custom field separator" do
121
+ parser = ECCSV::Parser.new
122
+ parser.field_sep = "foo"
123
+ result = parser.parse("bazfoobar")
124
+ assert result, parser.failure_reason
125
+ assert_equal [['baz', 'bar']], result.value
126
+ end
127
+
128
+ test "single-character custom record separator" do
129
+ parser = ECCSV::Parser.new
130
+ parser.record_sep = "x"
131
+ result = parser.parse("fooxbar")
132
+ assert result, parser.failure_reason
133
+ assert_equal [['foo'], ['bar']], result.value
134
+ end
135
+
136
+ test "multi-character custom record separator" do
137
+ parser = ECCSV::Parser.new
138
+ parser.record_sep = "foo"
139
+ result = parser.parse("barfoobaz")
140
+ assert result, parser.failure_reason
141
+ assert_equal [['bar'], ['baz']], result.value
142
+ end
143
+
144
+ test "custom quote character" do
145
+ parser = ECCSV::Parser.new
146
+ parser.quote_char = "'"
147
+ result = parser.parse("'foo,bar'")
148
+ assert result, parser.failure_reason
149
+ assert_equal [['foo,bar']], result.value
150
+ end
151
+
152
+ test "parse helper" do
153
+ result = CsvParser.parse("foo,bar")
154
+ assert_equal [['foo', 'bar']], result.data
155
+ end
156
+
157
+ test "parse helper with options" do
158
+ result = CsvParser.parse("foo\tbar", :field_sep => "\t")
159
+ assert_equal [['foo', 'bar']], result.data
160
+ end
161
+
162
+ test "parse helper with missing closing quote" do
163
+ error = nil
164
+ begin
165
+ CsvParser.parse(%{"foo})
166
+ rescue CsvParser::MissingQuoteError => error
167
+ assert_equal 1, error.line
168
+ assert_equal 1, error.column
169
+ assert_equal "no ending quote found for quote on line 1, column 1", error.message
170
+ end
171
+ assert error
172
+ end
173
+
174
+ test "parse helper with stray quote" do
175
+ error = nil
176
+ begin
177
+ CsvParser.parse(%{f"oo})
178
+ rescue CsvParser::StrayQuoteError => error
179
+ assert_equal 1, error.line
180
+ assert_equal 2, error.column
181
+ assert_equal "invalid quote found on line 1, column 2", error.message
182
+ end
183
+ assert error
184
+ end
185
+
186
+ test "parse helper with allowed short records" do
187
+ result = CsvParser.parse(%{foo,bar\nbaz})
188
+ assert_equal 1, result.warnings.length
189
+ assert_kind_of CsvParser::MissingFieldsError, result.warnings[0]
190
+ error = result.warnings[0]
191
+ assert_equal 2, error.line
192
+ assert_equal 4, error.column
193
+ assert_equal "record on line 2 had too few fields", error.message
194
+ end
195
+
196
+ test "parse helper with disallowed short records" do
197
+ error = nil
198
+ begin
199
+ CsvParser.parse(%{foo,bar\nbaz}, :allow_uneven_records => false)
200
+ rescue CsvParser::MissingFieldsError => error
201
+ assert_equal 2, error.line
202
+ assert_equal 4, error.column
203
+ assert_equal "record on line 2 had too few fields", error.message
204
+ end
205
+ assert error
206
+ end
207
+
208
+ test "parse helper with allowed long records" do
209
+ result = CsvParser.parse(%{foo\nbar,baz})
210
+ assert_equal 1, result.warnings.length
211
+ assert_kind_of CsvParser::ExtraFieldsError, result.warnings[0]
212
+ error = result.warnings[0]
213
+ assert_equal 2, error.line
214
+ assert_equal 5, error.column
215
+ assert_equal "record on line 2 had too many fields", error.message
216
+ end
217
+
218
+ test "parse helper with disallowed long records" do
219
+ error = nil
220
+ begin
221
+ CsvParser.parse(%{foo\nbar,baz}, :allow_uneven_records => false)
222
+ rescue CsvParser::ExtraFieldsError => error
223
+ assert_equal 2, error.line
224
+ assert_equal 5, error.column
225
+ assert_equal "record on line 2 had too many fields", error.message
226
+ end
227
+ assert error
228
+ end
229
+ =end
230
+ end
metadata ADDED
@@ -0,0 +1,115 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: eccsv
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jeremy Stephens
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-10-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: test-unit
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: racc
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: CSV library with advanced error reporting
70
+ email:
71
+ - jeremy.f.stephens@vanderbilt.edu
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - ".gitignore"
77
+ - Gemfile
78
+ - LICENSE.txt
79
+ - README.md
80
+ - Rakefile
81
+ - eccsv.gemspec
82
+ - lib/eccsv.rb
83
+ - lib/eccsv/errors.rb
84
+ - lib/eccsv/parser.rb
85
+ - lib/eccsv/parser.y
86
+ - lib/eccsv/version.rb
87
+ - test/helper.rb
88
+ - test/test_parser.rb
89
+ homepage: https://github.com/coupler/eccsv
90
+ licenses:
91
+ - MIT
92
+ metadata: {}
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.2.2
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: CSV library with advanced error reporting
113
+ test_files:
114
+ - test/helper.rb
115
+ - test/test_parser.rb