eccsv 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8dd728098ee2f3066326be199f2657c34b33facd
4
+ data.tar.gz: 0472be8f00b3f6c9079cc20808bfd89804ff2212
5
+ SHA512:
6
+ metadata.gz: 9238b550d38a2766e5c53ca152855dfc7f782cc79c98dad60030095138e4a40a68655b14f22c3e376214901294ad81ad3e835e9091c2ee9d810c302e3f744ff3
7
+ data.tar.gz: 12599cfb6ae428dc134dd67c64fac344fa2a0306ba28de4e5f7a183b2754780d5c3e865ead20f6e20e2979b076e49e13ca83bbd11eb51ddf03587839d648b693
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in eccsv.gemspec
4
+ gemspec
5
+
6
+ gem 'byebug'
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Vanderbilt University
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,113 @@
1
+ # ECCSV
2
+
3
+ ECCSV (error-correcting comma-separated values) is a CSV parsing library with
4
+ advanced error reporting and correcting.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ gem 'eccsv'
11
+
12
+ And then execute:
13
+
14
+ $ bundle
15
+
16
+ Or install it yourself as:
17
+
18
+ $ gem install eccsv
19
+
20
+ ## Basic Usage
21
+
22
+ ```ruby
23
+ require 'eccsv'
24
+
25
+ data = <<EOF
26
+ foo,bar
27
+ baz,qux
28
+ EOF
29
+
30
+ parser = ECCSV::Parser.new
31
+ parser.parse(data) #=> [["foo", "bar"], ["baz", "qux"]]
32
+ ```
33
+
34
+ ## Errors
35
+
36
+ One of the goals of this project is to give you descriptive error messages.
37
+
38
+ Each error type is a subclass of `ECCSV::Error` and contains the exact line
39
+ number (via `Error#line`) and column number (via `Error#col`) where the error
40
+ took place.
41
+
42
+ * missing closing quote (`ECCSV::UnmatchedQuoteError`)
43
+ * quote in the wrong place (`ECCSV::StrayQuoteError`)
44
+ * rows with not enough fields (`ECCSV::MissingFieldsError`)
45
+ * rows with too many fields (`ECCSV::ExtraFieldsError`)
46
+
47
+ Since missing/extra fields do not cause the CSV to be unparsable, they are
48
+ treated as warnings instead of errors (see the examples below).
49
+
50
+ ### Examples
51
+
52
+ #### Unmatched quote
53
+
54
+ If there was an error, `#parse` will return `nil` and set `#error`.
55
+
56
+ ```ruby
57
+ require 'eccsv'
58
+
59
+ data = <<EOF
60
+ foo,"bar
61
+ baz,qux
62
+ EOF
63
+
64
+ parser = ECCSV::Parser.new
65
+ parser.parse(data) #=> nil
66
+ parser.error #=> #<ECCSV::UnmatchedQuoteError: unmatched quote at line 1, column 5>
67
+ parser.error.line #=> 1
68
+ parser.error.col #=> 5
69
+ ```
70
+
71
+ #### Missing fields
72
+
73
+ If there was a warning, `#parse` will still return the records and append the warning to `#warnings`.
74
+
75
+ ```ruby
76
+ require 'eccsv'
77
+
78
+ data = <<EOF
79
+ foo,bar
80
+ baz
81
+ EOF
82
+
83
+ parser = ECCSV::Parser.new
84
+ parser.parse(data) #=> [["foo", "bar"], ["baz"]]
85
+ parser.warnings #=> [#<ECCSV::MissingFieldsError: expected 1 more fields on line 2>]
86
+ parser.warnings[0].line #=> 2
87
+ parser.warnings[0].col #=> 4
88
+ ```
89
+
90
+ #### Extra fields
91
+
92
+ ```ruby
93
+ require 'eccsv'
94
+
95
+ data = <<EOF
96
+ foo
97
+ bar,baz
98
+ EOF
99
+
100
+ parser = ECCSV::Parser.new
101
+ parser.parse(data) #=> [["foo"], ["bar", "baz"]]
102
+ parser.warnings #=> [#<ECCSV::ExtraFieldsError: 1 extra fields found on line 2, column 4>]
103
+ parser.warnings[0].line #=> 2
104
+ parser.warnings[0].col #=> 4
105
+ ```
106
+
107
+ ## Contributing
108
+
109
+ 1. Fork it
110
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
111
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
112
+ 4. Push to the branch (`git push origin my-new-feature`)
113
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,16 @@
1
# Build tasks: run the test suite (default) and regenerate the Racc parser.
require "bundler/gem_tasks"
require "rake/testtask"

Rake::TestTask.new do |t|
  t.libs << "test"
  t.pattern = 'test/**/test*.rb'
end

# The tests exercise the generated parser, so make sure it is up to date first.
task :test => :racc
task :default => :test

desc "Compile racc grammar"
task :racc => "lib/eccsv/parser.rb"

# Rebuild the parser whenever the grammar is newer than the generated file.
file "lib/eccsv/parser.rb" => "lib/eccsv/parser.y" do |t|
  # `sh` raises when racc exits non-zero, so a broken grammar stops the build
  # instead of letting the tests run against a stale or missing parser.
  sh "racc", "-v", "-o", t.name, t.source
end
data/eccsv.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'eccsv/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "eccsv"
8
+ spec.version = ECCSV::VERSION
9
+ spec.authors = ["Jeremy Stephens"]
10
+ spec.email = ["jeremy.f.stephens@vanderbilt.edu"]
11
+ spec.description = %q{CSV library with advanced error reporting}
12
+ spec.summary = %q{CSV library with advanced error reporting}
13
+ spec.homepage = "https://github.com/coupler/eccsv"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.3"
22
+ spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "test-unit"
24
+ spec.add_development_dependency "racc"
25
+ end
@@ -0,0 +1,16 @@
1
module ECCSV
  # Base class for every problem reported by the parser.
  #
  # Inherits from StandardError (rather than Exception) so that a plain
  # `rescue` / `rescue => e` handles it and it is not lumped together with
  # interpreter-level exceptions such as SignalException or SystemExit.
  class Error < StandardError
    # Line and column of the input associated with the problem
    # (nil when no position was supplied).
    attr_reader :line, :col

    # @param msg  [String, nil]  human-readable description
    # @param line [Integer, nil] line number the problem refers to
    # @param col  [Integer, nil] column number the problem refers to
    def initialize(msg = nil, line = nil, col = nil)
      super(msg)
      @line = line
      @col = col
    end
  end

  # A quoted field was opened but never closed.
  class UnmatchedQuoteError < Error; end

  # A quote character appeared where a quote is not allowed.
  class StrayQuoteError < Error; end

  # A record has fewer fields than expected.
  class MissingFieldsError < Error; end

  # A record has more fields than expected.
  class ExtraFieldsError < Error; end
end
@@ -0,0 +1,465 @@
1
+ #
2
+ # DO NOT MODIFY!!!!
3
+ # This file is automatically generated by Racc 1.4.12
4
+ # from Racc grammer file "".
5
+ #
6
+
7
+ require 'racc/parser.rb'
8
+
9
+ require 'strscan'
10
+
11
+ module ECCSV
12
+ class Parser < Racc::Parser
13
+
14
+ module_eval(<<'...end parser.y/module_eval...', 'parser.y', 36)
15
+ class Node
16
+ attr_reader :value, :token, :line, :col
17
+
18
+ def initialize(value = "", token = nil, line = nil, col = nil)
19
+ @value = value
20
+ @token = token
21
+ @line = line
22
+ @col = col
23
+ end
24
+ end
25
+
26
+ class ParentNode < Node
27
+ def initialize(children = [], line = nil, col = nil)
28
+ last = children.last
29
+ if last && last.is_a?(Node)
30
+ line = last.line
31
+ col = last.col
32
+ end
33
+ super(nil, nil, line, col)
34
+ @children = children
35
+ end
36
+ end
37
+
38
+ class QuotedTextNode < ParentNode
39
+ def value
40
+ @value ||= @children.collect(&:value).join
41
+ end
42
+ end
43
+
44
+ class FieldNode < ParentNode
45
+ def value
46
+ @value ||=
47
+ if @children[0].token == :TEXT
48
+ @children[0].value
49
+ else
50
+ # quoted text
51
+ @children[1].value
52
+ end
53
+ end
54
+ end
55
+
56
+ class DelimFieldNode < ParentNode
57
+ def value
58
+ @value ||= @children[0].value
59
+ end
60
+ end
61
+
62
+ class DelimFieldsNode < ParentNode
63
+ def value
64
+ @value ||=
65
+ if @children.empty?
66
+ []
67
+ else
68
+ @children[0].value + [@children[1].value]
69
+ end
70
+ end
71
+ end
72
+
73
+ class RecordNode < ParentNode
74
+ def value
75
+ # TODO: 'consume' children to produce value to reduce memory footprint
76
+ @value ||= @children[0].value + [@children[1].value]
77
+ end
78
+ end
79
+
80
+ class DelimRecordNode < ParentNode
81
+ def value
82
+ @value ||= @children.length == 1 ? [] : @children[0].value
83
+ end
84
+ end
85
+
86
+ class DelimRecordsNode < ParentNode
87
+ def value
88
+ if @value.nil?
89
+ if @children.empty?
90
+ @value = []
91
+ else
92
+ @value = @children[0].value
93
+ val = @children[1].value
94
+ if !val.empty?
95
+ @value += [val]
96
+ end
97
+ end
98
+ end
99
+ @value
100
+ end
101
+ end
102
+
103
+ class RootNode < ParentNode
104
+ def value
105
+ if @value.nil?
106
+ @value = @children[0].value
107
+ if @children[1]
108
+ @value += [@children[1].value]
109
+ end
110
+ end
111
+ @value
112
+ end
113
+ end
114
+
115
+ attr_reader :error
116
+
117
+ def parse(str)
118
+ @scanner = StringScanner.new(str)
119
+ @line = 1
120
+ @col = 1
121
+ do_parse
122
+ end
123
+
124
+ def next_token
125
+ until @scanner.empty?
126
+ next_line = @line
127
+ next_col = @col
128
+ case
129
+ when match = @scanner.scan(/,/)
130
+ token = :COMMA
131
+ when match = @scanner.scan(/"/)
132
+ token = :QUOTE
133
+ when match = @scanner.scan(/\n/)
134
+ token = :NEWLINE
135
+ next_line += 1
136
+ next_col = 0
137
+ when match = @scanner.scan(/[^,\n"]+/)
138
+ token = :TEXT
139
+ else
140
+ raise "can't recognize <#{@scanner.peek(5)}>"
141
+ end
142
+ next_col += match.length
143
+
144
+ value = node(match, token)
145
+ @line = next_line
146
+ @col = next_col
147
+
148
+ return [token, value]
149
+ end
150
+ end
151
+
152
+ def warnings
153
+ @warnings ||= []
154
+ end
155
+
156
+ private
157
+
158
+ def node(value = "", token = nil, line = @line, col = @col)
159
+ Node.new(value, token, line, col)
160
+ end
161
+
162
+ def quoted_text(children = [], line = @line, col = @col)
163
+ QuotedTextNode.new(children, line, col)
164
+ end
165
+
166
+ def field(children = [], line = @line, col = @col)
167
+ FieldNode.new(children, line, col)
168
+ end
169
+
170
+ def delim_field(children = [], line = @line, col = @col)
171
+ DelimFieldNode.new(children, line, col)
172
+ end
173
+
174
+ def delim_fields(children = [], line = @line, col = @col)
175
+ DelimFieldsNode.new(children, line, col)
176
+ end
177
+
178
+ def record(children = [], line = @line, col = @col)
179
+ record = RecordNode.new(children, line, col)
180
+ value = record.value
181
+ if defined? @num_fields
182
+ first = children[0]
183
+ line = first.line
184
+ col = first.col
185
+ if @num_fields > value.length
186
+ msg = "expected %d more fields on line %d" % [@num_fields - value.length, line]
187
+ self.warnings.push(MissingFieldsError.new(msg, line, col))
188
+ elsif @num_fields < value.length
189
+ msg = "%d extra fields found on line %d, column %d" % [value.length - @num_fields, line, col]
190
+ self.warnings.push(ExtraFieldsError.new(msg, line, col))
191
+ end
192
+ else
193
+ @num_fields = value.length
194
+ end
195
+
196
+ record
197
+ end
198
+
199
+ def delim_record(children = [], line = @line, col = @col)
200
+ DelimRecordNode.new(children, line, col)
201
+ end
202
+
203
+ def delim_records(children = [], line = @line, col = @col)
204
+ DelimRecordsNode.new(children, line, col)
205
+ end
206
+
207
+ def root(children = [], line = @line, col = @col)
208
+ RootNode.new(children, line, col)
209
+ end
210
+
211
+ def on_error(t, val, stack)
212
+ #pp t
213
+ #pp val
214
+ #pp stack
215
+
216
+ # figure out what error we have
217
+ if t == 0
218
+ # unexpected EOF
219
+ type = nil
220
+ stack.reverse_each do |node|
221
+ case node
222
+ when QuotedTextNode
223
+ type = :unmatched_quote
224
+ when Node
225
+ if type == :unmatched_quote && node.token == :QUOTE
226
+ line = node.line
227
+ col = node.col
228
+ @error = UnmatchedQuoteError.new("unmatched quote at line #{line}, column #{col}", line, col)
229
+ end
230
+ end
231
+ end
232
+
233
+ if @error.nil?
234
+ @error = Error.new("unexpected EOF")
235
+ end
236
+ elsif val.is_a?(Node) && val.token == :QUOTE
237
+ line = val.line
238
+ col = val.col
239
+ @error = StrayQuoteError.new("stray quote at line #{line}, column #{col}", line, col)
240
+ end
241
+ end
242
+ ...end parser.y/module_eval...
243
+ ##### State transition tables begin ###
244
+
245
+ racc_action_table = [
246
+ 18, 17, 19, 16, -1, 9, 6, 13, 12, 8,
247
+ 14, 3 ]
248
+
249
+ racc_action_check = [
250
+ 15, 15, 15, 15, 2, 4, 2, 7, 7, 3,
251
+ 10, 1 ]
252
+
253
+ racc_action_pointer = [
254
+ nil, 11, 4, 9, 3, nil, nil, 3, nil, nil,
255
+ 7, nil, nil, nil, nil, -2, nil, nil, nil, nil ]
256
+
257
+ racc_action_default = [
258
+ -3, -17, -8, -17, -2, -4, -5, -17, 20, -6,
259
+ -7, -9, -13, -12, -10, -17, -11, -14, -15, -16 ]
260
+
261
+ racc_goto_table = [
262
+ 1, 2, 4, 5, 7, 10, 11, 15 ]
263
+
264
+ racc_goto_check = [
265
+ 1, 2, 3, 4, 5, 6, 7, 8 ]
266
+
267
+ racc_goto_pointer = [
268
+ nil, 0, 1, 0, 1, 2, -2, -1, -5 ]
269
+
270
+ racc_goto_default = [
271
+ nil, nil, nil, nil, nil, nil, nil, nil, nil ]
272
+
273
+ racc_reduce_table = [
274
+ 0, 0, :racc_error,
275
+ 1, 7, :_reduce_1,
276
+ 2, 7, :_reduce_2,
277
+ 0, 8, :_reduce_3,
278
+ 2, 8, :_reduce_4,
279
+ 1, 10, :_reduce_5,
280
+ 2, 10, :_reduce_6,
281
+ 2, 9, :_reduce_7,
282
+ 0, 11, :_reduce_8,
283
+ 2, 11, :_reduce_9,
284
+ 2, 13, :_reduce_10,
285
+ 3, 12, :_reduce_11,
286
+ 1, 12, :_reduce_12,
287
+ 0, 14, :_reduce_13,
288
+ 2, 14, :_reduce_14,
289
+ 2, 14, :_reduce_15,
290
+ 2, 14, :_reduce_16 ]
291
+
292
+ racc_reduce_n = 17
293
+
294
+ racc_shift_n = 20
295
+
296
+ racc_token_table = {
297
+ false => 0,
298
+ :error => 1,
299
+ :NEWLINE => 2,
300
+ :COMMA => 3,
301
+ :TEXT => 4,
302
+ :QUOTE => 5 }
303
+
304
+ racc_nt_base = 6
305
+
306
+ racc_use_result_var = true
307
+
308
+ Racc_arg = [
309
+ racc_action_table,
310
+ racc_action_check,
311
+ racc_action_default,
312
+ racc_action_pointer,
313
+ racc_goto_table,
314
+ racc_goto_check,
315
+ racc_goto_default,
316
+ racc_goto_pointer,
317
+ racc_nt_base,
318
+ racc_reduce_table,
319
+ racc_token_table,
320
+ racc_shift_n,
321
+ racc_reduce_n,
322
+ racc_use_result_var ]
323
+
324
+ Racc_token_to_s_table = [
325
+ "$end",
326
+ "error",
327
+ "NEWLINE",
328
+ "COMMA",
329
+ "TEXT",
330
+ "QUOTE",
331
+ "$start",
332
+ "root",
333
+ "delim_records",
334
+ "record",
335
+ "delim_record",
336
+ "delim_fields",
337
+ "field",
338
+ "delim_field",
339
+ "quoted_text" ]
340
+
341
+ Racc_debug_parser = true
342
+
343
+ ##### State transition tables end #####
344
+
345
+ # reduce 0 omitted
346
+
347
+ module_eval(<<'.,.,', 'parser.y', 4)
348
+ def _reduce_1(val, _values, result)
349
+ result = root(val).value
350
+ result
351
+ end
352
+ .,.,
353
+
354
+ module_eval(<<'.,.,', 'parser.y', 5)
355
+ def _reduce_2(val, _values, result)
356
+ result = root(val).value
357
+ result
358
+ end
359
+ .,.,
360
+
361
+ module_eval(<<'.,.,', 'parser.y', 7)
362
+ def _reduce_3(val, _values, result)
363
+ result = delim_records
364
+ result
365
+ end
366
+ .,.,
367
+
368
+ module_eval(<<'.,.,', 'parser.y', 8)
369
+ def _reduce_4(val, _values, result)
370
+ result = delim_records(val)
371
+ result
372
+ end
373
+ .,.,
374
+
375
+ module_eval(<<'.,.,', 'parser.y', 10)
376
+ def _reduce_5(val, _values, result)
377
+ result = delim_record(val)
378
+ result
379
+ end
380
+ .,.,
381
+
382
+ module_eval(<<'.,.,', 'parser.y', 11)
383
+ def _reduce_6(val, _values, result)
384
+ result = delim_record(val)
385
+ result
386
+ end
387
+ .,.,
388
+
389
+ module_eval(<<'.,.,', 'parser.y', 14)
390
+ def _reduce_7(val, _values, result)
391
+ result = record(val)
392
+ result
393
+ end
394
+ .,.,
395
+
396
+ module_eval(<<'.,.,', 'parser.y', 16)
397
+ def _reduce_8(val, _values, result)
398
+ result = delim_fields
399
+ result
400
+ end
401
+ .,.,
402
+
403
+ module_eval(<<'.,.,', 'parser.y', 17)
404
+ def _reduce_9(val, _values, result)
405
+ result = delim_fields(val)
406
+ result
407
+ end
408
+ .,.,
409
+
410
+ module_eval(<<'.,.,', 'parser.y', 19)
411
+ def _reduce_10(val, _values, result)
412
+ result = delim_field(val)
413
+ result
414
+ end
415
+ .,.,
416
+
417
+ module_eval(<<'.,.,', 'parser.y', 21)
418
+ def _reduce_11(val, _values, result)
419
+ result = field(val)
420
+ result
421
+ end
422
+ .,.,
423
+
424
+ module_eval(<<'.,.,', 'parser.y', 22)
425
+ def _reduce_12(val, _values, result)
426
+ result = field(val)
427
+ result
428
+ end
429
+ .,.,
430
+
431
+ module_eval(<<'.,.,', 'parser.y', 24)
432
+ def _reduce_13(val, _values, result)
433
+ result = quoted_text
434
+ result
435
+ end
436
+ .,.,
437
+
438
+ module_eval(<<'.,.,', 'parser.y', 25)
439
+ def _reduce_14(val, _values, result)
440
+ result = quoted_text(val)
441
+ result
442
+ end
443
+ .,.,
444
+
445
+ module_eval(<<'.,.,', 'parser.y', 26)
446
+ def _reduce_15(val, _values, result)
447
+ result = quoted_text(val)
448
+ result
449
+ end
450
+ .,.,
451
+
452
+ module_eval(<<'.,.,', 'parser.y', 27)
453
+ def _reduce_16(val, _values, result)
454
+ result = quoted_text(val)
455
+ result
456
+ end
457
+ .,.,
458
+
459
+ def _reduce_none(val, _values, result)
460
+ val[0]
461
+ end
462
+
463
+ end # class Parser
464
+
465
+ end
@@ -0,0 +1,264 @@
1
+ class Parser
2
+ token NEWLINE COMMA TEXT QUOTE
3
+
4
+ rule
5
+ root: delim_records { result = root(val).value }
6
+ | delim_records record { result = root(val).value }
7
+
8
+ delim_records: { result = delim_records }
9
+ | delim_records delim_record { result = delim_records(val) }
10
+
11
+ delim_record: NEWLINE { result = delim_record(val) }
12
+ | record NEWLINE { result = delim_record(val) }
13
+
14
+ # TODO: reduce record nodes
15
+ record: delim_fields field { result = record(val) }
16
+
17
+ delim_fields: { result = delim_fields }
18
+ | delim_fields delim_field { result = delim_fields(val) }
19
+
20
+ delim_field: field COMMA { result = delim_field(val) }
21
+
22
+ field: QUOTE quoted_text QUOTE { result = field(val) }
23
+ | TEXT { result = field(val) }
24
+
25
+ quoted_text: { result = quoted_text }
26
+ | quoted_text COMMA { result = quoted_text(val) }
27
+ | quoted_text NEWLINE { result = quoted_text(val) }
28
+ | quoted_text TEXT { result = quoted_text(val) }
29
+ end
30
+
31
+ ---- header
32
+ require 'strscan'
33
+
34
+ module ECCSV
35
+ ---- inner
36
+ class Node
37
+ attr_reader :value, :token, :line, :col
38
+
39
+ def initialize(value = "", token = nil, line = nil, col = nil)
40
+ @value = value
41
+ @token = token
42
+ @line = line
43
+ @col = col
44
+ end
45
+ end
46
+
47
+ class ParentNode < Node
48
+ def initialize(children = [], line = nil, col = nil)
49
+ last = children.last
50
+ if last && last.is_a?(Node)
51
+ line = last.line
52
+ col = last.col
53
+ end
54
+ super(nil, nil, line, col)
55
+ @children = children
56
+ end
57
+ end
58
+
59
+ class QuotedTextNode < ParentNode
60
+ def value
61
+ @value ||= @children.collect(&:value).join
62
+ end
63
+ end
64
+
65
+ class FieldNode < ParentNode
66
+ def value
67
+ @value ||=
68
+ if @children[0].token == :TEXT
69
+ @children[0].value
70
+ else
71
+ # quoted text
72
+ @children[1].value
73
+ end
74
+ end
75
+ end
76
+
77
+ class DelimFieldNode < ParentNode
78
+ def value
79
+ @value ||= @children[0].value
80
+ end
81
+ end
82
+
83
+ class DelimFieldsNode < ParentNode
84
+ def value
85
+ @value ||=
86
+ if @children.empty?
87
+ []
88
+ else
89
+ @children[0].value + [@children[1].value]
90
+ end
91
+ end
92
+ end
93
+
94
+ class RecordNode < ParentNode
95
+ def value
96
+ # TODO: 'consume' children to produce value to reduce memory footprint
97
+ @value ||= @children[0].value + [@children[1].value]
98
+ end
99
+ end
100
+
101
+ class DelimRecordNode < ParentNode
102
+ def value
103
+ @value ||= @children.length == 1 ? [] : @children[0].value
104
+ end
105
+ end
106
+
107
+ class DelimRecordsNode < ParentNode
108
+ def value
109
+ if @value.nil?
110
+ if @children.empty?
111
+ @value = []
112
+ else
113
+ @value = @children[0].value
114
+ val = @children[1].value
115
+ if !val.empty?
116
+ @value += [val]
117
+ end
118
+ end
119
+ end
120
+ @value
121
+ end
122
+ end
123
+
124
+ class RootNode < ParentNode
125
+ def value
126
+ if @value.nil?
127
+ @value = @children[0].value
128
+ if @children[1]
129
+ @value += [@children[1].value]
130
+ end
131
+ end
132
+ @value
133
+ end
134
+ end
135
+
136
+ attr_reader :error
137
+
138
# Parses +str+ and returns whatever the Racc driver (+do_parse+) produces:
# the array of records on success.
#
# All per-parse state is reset first, so one Parser instance can be reused
# for several inputs without leaking the previous error, the previous
# warnings, or the field count learned from the previous first record.
#
# @param str [String] the CSV text to parse
# @return [Array<Array<String>>, nil] the parsed records, as returned by do_parse
def parse(str)
  @scanner = StringScanner.new(str)
  @line = 1
  @col = 1

  # Drop results of an earlier call on this instance.
  @error = nil
  @warnings = nil # #warnings lazily creates a fresh array
  # The field-count check tests `defined? @num_fields`, so the variable has
  # to be removed rather than set to nil.
  remove_instance_variable(:@num_fields) if instance_variable_defined?(:@num_fields)

  do_parse
end
144
+
145
# Lexer: returns the next +[token, node]+ pair from the scanner, or nil once
# the input is exhausted.
#
# Tokens:
#   :COMMA   - a single ","
#   :QUOTE   - a single '"'
#   :NEWLINE - "\n" or "\r\n" (CRLF is one line break, so Windows-style
#              files no longer leave a trailing "\r" on the last field)
#   :TEXT    - a run of any other characters; a "\r" that is not followed
#              by "\n" is ordinary text
#
# The node records the position where the token starts; @line/@col are
# advanced only after the node has been built.
#
# @return [Array(Symbol, Node), nil]
# @raise [RuntimeError] if the input at the current position matches no token
def next_token
  return nil if @scanner.eos?

  next_line = @line
  next_col = @col

  if (match = @scanner.scan(/,/))
    token = :COMMA
    next_col += match.length
  elsif (match = @scanner.scan(/"/))
    token = :QUOTE
    next_col += match.length
  elsif (match = @scanner.scan(/\r?\n/))
    token = :NEWLINE
    next_line += 1
    next_col = 1 # first column of the following line, whatever the break's length
  elsif (match = @scanner.scan(/(?:[^,\n"\r]|\r(?!\n))+/))
    token = :TEXT
    next_col += match.length
  else
    raise "can't recognize <#{@scanner.peek(5)}>"
  end

  # Build the node while @line/@col still point at the start of the token.
  value = node(match, token)
  @line = next_line
  @col = next_col

  [token, value]
end
172
+
173
+ def warnings
174
+ @warnings ||= []
175
+ end
176
+
177
+ private
178
+
179
+ def node(value = "", token = nil, line = @line, col = @col)
180
+ Node.new(value, token, line, col)
181
+ end
182
+
183
+ def quoted_text(children = [], line = @line, col = @col)
184
+ QuotedTextNode.new(children, line, col)
185
+ end
186
+
187
+ def field(children = [], line = @line, col = @col)
188
+ FieldNode.new(children, line, col)
189
+ end
190
+
191
+ def delim_field(children = [], line = @line, col = @col)
192
+ DelimFieldNode.new(children, line, col)
193
+ end
194
+
195
+ def delim_fields(children = [], line = @line, col = @col)
196
+ DelimFieldsNode.new(children, line, col)
197
+ end
198
+
199
+ def record(children = [], line = @line, col = @col)
200
+ record = RecordNode.new(children, line, col)
201
+ value = record.value
202
+ if defined? @num_fields
203
+ first = children[0]
204
+ line = first.line
205
+ col = first.col
206
+ if @num_fields > value.length
207
+ msg = "expected %d more fields on line %d" % [@num_fields - value.length, line]
208
+ self.warnings.push(MissingFieldsError.new(msg, line, col))
209
+ elsif @num_fields < value.length
210
+ msg = "%d extra fields found on line %d, column %d" % [value.length - @num_fields, line, col]
211
+ self.warnings.push(ExtraFieldsError.new(msg, line, col))
212
+ end
213
+ else
214
+ @num_fields = value.length
215
+ end
216
+
217
+ record
218
+ end
219
+
220
+ def delim_record(children = [], line = @line, col = @col)
221
+ DelimRecordNode.new(children, line, col)
222
+ end
223
+
224
+ def delim_records(children = [], line = @line, col = @col)
225
+ DelimRecordsNode.new(children, line, col)
226
+ end
227
+
228
+ def root(children = [], line = @line, col = @col)
229
+ RootNode.new(children, line, col)
230
+ end
231
+
232
# Racc error hook: classifies a syntax error and stores it in @error instead
# of raising.
#
# Every path now sets @error, so a failed parse is never reported without a
# reason (previously an unexpected token other than a quote, e.g. the second
# comma in "foo,,bar", left @error untouched).
#
# @param t     [Integer] internal id of the offending token; 0 means end of input
# @param val   [Object]  value of the offending token (a Node from #next_token)
# @param stack [Array]   the parser's value stack at the point of failure
# @return [void]
def on_error(t, val, stack)
  if t == 0
    # Unexpected end of input. If quoted text is still open on the stack,
    # the QUOTE token just below it is the quote that was never closed.
    found = nil
    inside_quoted_text = false
    stack.reverse_each do |node|
      case node
      when QuotedTextNode
        inside_quoted_text = true
      when Node
        next unless inside_quoted_text && node.token == :QUOTE

        line = node.line
        col = node.col
        found = UnmatchedQuoteError.new("unmatched quote at line #{line}, column #{col}", line, col)
        break
      end
    end

    # A local is used (not @error) so a leftover error cannot mask this one.
    @error = found || Error.new("unexpected EOF")
  elsif val.is_a?(Node) && val.token == :QUOTE
    line = val.line
    col = val.col
    @error = StrayQuoteError.new("stray quote at line #{line}, column #{col}", line, col)
  elsif val.is_a?(Node)
    # Any other token the grammar cannot accept here.
    line = val.line
    col = val.col
    @error = Error.new("unexpected #{val.token} at line #{line}, column #{col}", line, col)
  else
    @error = Error.new("unexpected token")
  end
end
263
+ ---- footer
264
+ end
@@ -0,0 +1,3 @@
1
+ module ECCSV
2
+ VERSION = "0.0.1"
3
+ end
data/lib/eccsv.rb ADDED
@@ -0,0 +1,6 @@
1
+ require 'eccsv/version'
2
+ require 'eccsv/errors'
3
+ require 'eccsv/parser'
4
+
5
+ module ECCSV
6
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,16 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+
11
+ require 'test/unit'
12
+ require 'byebug'
13
+
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
16
+ require 'eccsv'
@@ -0,0 +1,230 @@
1
+ require 'helper'
2
+
3
+ class TestParser < Test::Unit::TestCase
4
+ def parse(string)
5
+ parser = ECCSV::Parser.new
6
+ parser.parse(string)
7
+ end
8
+
9
+ test "one record with two fields" do
10
+ assert_equal [['foo', 'bar']], parse("foo,bar")
11
+ end
12
+
13
+ test "one record with one field" do
14
+ assert_equal [['foo']], parse("foo")
15
+ end
16
+
17
+ test "empty records" do
18
+ assert_equal [], parse("")
19
+ end
20
+
21
+ test "empty record is skipped by default" do
22
+ assert_equal [['foo'], ['bar']], parse("foo\n\nbar")
23
+ end
24
+
25
+ test "skipping empty record at end" do
26
+ assert_equal [['foo'], ['bar']], parse("foo\nbar\n")
27
+ end
28
+
29
+ =begin
30
+ test "not skipping an empty record" do
31
+ parser = ECCSV::Parser.new
32
+ parser.skip_empty_record = false
33
+ result = parser.parse("foo\n\nbar")
34
+ assert_equal [['foo'], [], ['bar']], result.value
35
+ end
36
+
37
+ test "not skipping empty record at end" do
38
+ parser = ECCSV::Parser.new
39
+ parser.skip_empty_record = false
40
+ result = parser.parse("foo\nbar\n")
41
+ assert_equal [['foo'], ['bar'], []], result.value
42
+ end
43
+ =end
44
+
45
+ test "two records" do
46
+ assert_equal [['foo', 'bar'], ['baz', 'qux']], parse("foo,bar\nbaz,qux")
47
+ end
48
+
49
+ test "quoted field" do
50
+ assert_equal [["foo,bar"]], parse(%{"foo,bar"})
51
+ end
52
+
53
+ test "missing closing quote" do
54
+ parser = ECCSV::Parser.new
55
+ result = parser.parse(%{foo,bar\n"foo})
56
+ assert !result
57
+ assert_kind_of ECCSV::UnmatchedQuoteError, parser.error
58
+ assert_equal 2, parser.error.line
59
+ assert_equal 1, parser.error.col
60
+ end
61
+
62
+ test "quote inside unquoted field" do
63
+ parser = ECCSV::Parser.new
64
+ result = parser.parse(%{f"oo})
65
+ assert !result
66
+ assert_kind_of ECCSV::StrayQuoteError, parser.error
67
+ assert_equal 1, parser.error.line
68
+ assert_equal 2, parser.error.col
69
+ end
70
+
71
+ test "missing fields gets warning by default" do
72
+ parser = ECCSV::Parser.new
73
+ result = parser.parse(%{foo,bar\nbaz})
74
+ assert_equal [['foo', 'bar'], ['baz']], result
75
+ assert_equal 1, parser.warnings.length
76
+ warning = parser.warnings[0]
77
+ assert_kind_of ECCSV::MissingFieldsError, warning
78
+ assert_equal 2, warning.line
79
+ assert_equal 4, warning.col
80
+ end
81
+
82
+ =begin
83
+ test "missing fields when disallowed" do
84
+ parser = ECCSV::Parser.new
85
+ parser.allow_uneven_records = false
86
+ result = parser.parse(%{foo,bar\nbaz})
87
+ assert !result
88
+ assert_equal :missing_fields, parser.failure_type
89
+ end
90
+ =end
91
+
92
+ test "extra fields gets warning by default" do
93
+ parser = ECCSV::Parser.new
94
+ result = parser.parse(%{foo\nbar,baz})
95
+ assert_equal [['foo'], ['bar', 'baz']], result
96
+ assert_equal 1, parser.warnings.length
97
+ warning = parser.warnings[0]
98
+ assert_kind_of ECCSV::ExtraFieldsError, warning
99
+ assert_equal 2, warning.line
100
+ assert_equal 4, warning.col
101
+ end
102
+
103
+ =begin
104
+ test "extra fields when disallowed" do
105
+ parser = ECCSV::Parser.new
106
+ parser.allow_uneven_records = false
107
+ result = parser.parse(%{foo\nbar,baz})
108
+ assert !result
109
+ assert_equal :extra_fields, parser.failure_type
110
+ end
111
+
112
+ test "single-character custom field separator" do
113
+ parser = ECCSV::Parser.new
114
+ parser.field_sep = "\t"
115
+ result = parser.parse("foo\tbar")
116
+ assert result, parser.failure_reason
117
+ assert_equal [['foo', 'bar']], result.value
118
+ end
119
+
120
+ test "multi-character custom field separator" do
121
+ parser = ECCSV::Parser.new
122
+ parser.field_sep = "foo"
123
+ result = parser.parse("bazfoobar")
124
+ assert result, parser.failure_reason
125
+ assert_equal [['baz', 'bar']], result.value
126
+ end
127
+
128
+ test "single-character custom record separator" do
129
+ parser = ECCSV::Parser.new
130
+ parser.record_sep = "x"
131
+ result = parser.parse("fooxbar")
132
+ assert result, parser.failure_reason
133
+ assert_equal [['foo'], ['bar']], result.value
134
+ end
135
+
136
+ test "multi-character custom record separator" do
137
+ parser = ECCSV::Parser.new
138
+ parser.record_sep = "foo"
139
+ result = parser.parse("barfoobaz")
140
+ assert result, parser.failure_reason
141
+ assert_equal [['bar'], ['baz']], result.value
142
+ end
143
+
144
+ test "custom quote character" do
145
+ parser = ECCSV::Parser.new
146
+ parser.quote_char = "'"
147
+ result = parser.parse("'foo,bar'")
148
+ assert result, parser.failure_reason
149
+ assert_equal [['foo,bar']], result.value
150
+ end
151
+
152
+ test "parse helper" do
153
+ result = CsvParser.parse("foo,bar")
154
+ assert_equal [['foo', 'bar']], result.data
155
+ end
156
+
157
+ test "parse helper with options" do
158
+ result = CsvParser.parse("foo\tbar", :field_sep => "\t")
159
+ assert_equal [['foo', 'bar']], result.data
160
+ end
161
+
162
+ test "parse helper with missing closing quote" do
163
+ error = nil
164
+ begin
165
+ CsvParser.parse(%{"foo})
166
+ rescue CsvParser::MissingQuoteError => error
167
+ assert_equal 1, error.line
168
+ assert_equal 1, error.column
169
+ assert_equal "no ending quote found for quote on line 1, column 1", error.message
170
+ end
171
+ assert error
172
+ end
173
+
174
+ test "parse helper with stray quote" do
175
+ error = nil
176
+ begin
177
+ CsvParser.parse(%{f"oo})
178
+ rescue CsvParser::StrayQuoteError => error
179
+ assert_equal 1, error.line
180
+ assert_equal 2, error.column
181
+ assert_equal "invalid quote found on line 1, column 2", error.message
182
+ end
183
+ assert error
184
+ end
185
+
186
+ test "parse helper with allowed short records" do
187
+ result = CsvParser.parse(%{foo,bar\nbaz})
188
+ assert_equal 1, result.warnings.length
189
+ assert_kind_of CsvParser::MissingFieldsError, result.warnings[0]
190
+ error = result.warnings[0]
191
+ assert_equal 2, error.line
192
+ assert_equal 4, error.column
193
+ assert_equal "record on line 2 had too few fields", error.message
194
+ end
195
+
196
+ test "parse helper with disallowed short records" do
197
+ error = nil
198
+ begin
199
+ CsvParser.parse(%{foo,bar\nbaz}, :allow_uneven_records => false)
200
+ rescue CsvParser::MissingFieldsError => error
201
+ assert_equal 2, error.line
202
+ assert_equal 4, error.column
203
+ assert_equal "record on line 2 had too few fields", error.message
204
+ end
205
+ assert error
206
+ end
207
+
208
+ test "parse helper with allowed long records" do
209
+ result = CsvParser.parse(%{foo\nbar,baz})
210
+ assert_equal 1, result.warnings.length
211
+ assert_kind_of CsvParser::ExtraFieldsError, result.warnings[0]
212
+ error = result.warnings[0]
213
+ assert_equal 2, error.line
214
+ assert_equal 5, error.column
215
+ assert_equal "record on line 2 had too many fields", error.message
216
+ end
217
+
218
+ test "parse helper with disallowed long records" do
219
+ error = nil
220
+ begin
221
+ CsvParser.parse(%{foo\nbar,baz}, :allow_uneven_records => false)
222
+ rescue CsvParser::ExtraFieldsError => error
223
+ assert_equal 2, error.line
224
+ assert_equal 5, error.column
225
+ assert_equal "record on line 2 had too many fields", error.message
226
+ end
227
+ assert error
228
+ end
229
+ =end
230
+ end
metadata ADDED
@@ -0,0 +1,115 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: eccsv
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jeremy Stephens
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-10-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: test-unit
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: racc
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: CSV library with advanced error reporting
70
+ email:
71
+ - jeremy.f.stephens@vanderbilt.edu
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - ".gitignore"
77
+ - Gemfile
78
+ - LICENSE.txt
79
+ - README.md
80
+ - Rakefile
81
+ - eccsv.gemspec
82
+ - lib/eccsv.rb
83
+ - lib/eccsv/errors.rb
84
+ - lib/eccsv/parser.rb
85
+ - lib/eccsv/parser.y
86
+ - lib/eccsv/version.rb
87
+ - test/helper.rb
88
+ - test/test_parser.rb
89
+ homepage: https://github.com/coupler/eccsv
90
+ licenses:
91
+ - MIT
92
+ metadata: {}
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.2.2
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: CSV library with advanced error reporting
113
+ test_files:
114
+ - test/helper.rb
115
+ - test/test_parser.rb