csv_parser 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +22 -0
- data/README.md +64 -0
- data/Rakefile +16 -0
- data/csv_parser.gemspec +25 -0
- data/lib/csv_parser.rb +60 -0
- data/lib/csv_parser.treetop +232 -0
- data/lib/csv_parser/csv_parser.rb +1342 -0
- data/lib/csv_parser/parser_extensions.rb +75 -0
- data/lib/csv_parser/result.rb +9 -0
- data/lib/csv_parser/version.rb +3 -0
- data/test/helper.rb +16 -0
- data/test/test_csv_parser.rb +226 -0
- metadata +116 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 0182c5c8d43682eaf967338eab5d249c5036ed54
|
4
|
+
data.tar.gz: 249c47d4015daba1d0def6019449d400b1c8bb2c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 29835c6d4346804386d2882ee8141207a24eef5fc1fd4163b7bb01d0582b63fa091c9968e934e6dff4a425763393f953a40a0e787054c9a69b75c88a056323d0
|
7
|
+
data.tar.gz: 2ce0c0176e2611e7e569c295cf2b511d2266cc52bbb371ab23422186b3b1a11a023508649d922fc7ee75c237e6567300fb7d002b776a38fcf358c70b752e5519
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Vanderbilt University
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# CsvParser
|
2
|
+
|
3
|
+
CsvParser is a CSV parser that focuses on identifying errors.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'csv_parser'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install csv_parser
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
CsvParser's goal is to give you descriptive error messages.
|
22
|
+
|
23
|
+
### Error types
|
24
|
+
|
25
|
+
* missing closing quote (`CsvParser::MissingQuoteError`)
|
26
|
+
* quote in the wrong place (`CsvParser::StrayQuoteError`)
|
27
|
+
* rows with not enough fields (`CsvParser::MissingFieldsError`)
|
28
|
+
* rows with too many fields (`CsvParser::ExtraFieldsError`)
|
29
|
+
|
30
|
+
### Options
|
31
|
+
|
32
|
+
You can pass in an options hash to the `CsvParser.parse` method
|
33
|
+
that contains one or more of the following options:
|
34
|
+
|
35
|
+
* `:field_sep` - specify field separator (default is `","`)
|
36
|
+
* `:record_sep` - specify record separator (default is `"\n"`)
|
37
|
+
* `:quote_char` - specify quote character (default is `"\""`)
|
38
|
+
* `:allow_empty_record` - specify whether empty records are allowed (default is `true`)
|
39
|
+
* `:skip_empty_record` - specify whether empty records are skipped (default is `true`)
|
40
|
+
* `:allow_uneven_records` - specify whether records with different field lengths are allowed (default is `true`)
|
41
|
+
|
42
|
+
### Example
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
require 'csv_parser'
|
46
|
+
|
47
|
+
data = <<EOF
|
48
|
+
foo,"bar
|
49
|
+
baz,quz
|
50
|
+
EOF
|
51
|
+
begin
|
52
|
+
result = CsvParser.parse(data)
|
53
|
+
rescue CsvParser::Error => e
|
54
|
+
# e is a CsvParser::MissingQuoteError
|
55
|
+
end
|
56
|
+
```
|
57
|
+
|
58
|
+
## Contributing
|
59
|
+
|
60
|
+
1. Fork it
|
61
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
62
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
63
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
64
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require "bundler/gem_tasks"
require "rake/testtask"

# Test task: adds "test" to the load path and runs every file matching
# test/**/test*.rb.
Rake::TestTask.new do |t|
  t.libs << "test"
  t.pattern = 'test/**/test*.rb'
end

# The generated parser must be up to date before the tests run.
task :test => :treetop
task :default => :test

desc "Compile treetop grammar"
task :treetop => "lib/csv_parser/csv_parser.rb"

# Regenerates the parser from the grammar whenever the grammar file is newer
# than the generated file.
#
# `sh` is used instead of Kernel#system so that a missing `tt` executable or a
# grammar compile error aborts the build, rather than silently letting the
# tests run against a stale (or absent) parser.
file "lib/csv_parser/csv_parser.rb" => "lib/csv_parser.treetop" do
  sh "tt lib/csv_parser.treetop -o lib/csv_parser/csv_parser.rb"
end
|
data/csv_parser.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'csv_parser/version'
|
5
|
+
|
6
|
+
# Gem specification for csv_parser. The file list is taken from git, so this
# must be evaluated inside a checkout of the repository.
Gem::Specification.new do |gem|
  # Identity
  gem.name     = "csv_parser"
  gem.version  = CsvParser::VERSION
  gem.license  = "MIT"
  gem.homepage = "https://github.com/coupler/csv_parser"

  # Authorship
  gem.authors = ["Jeremy Stephens"]
  gem.email   = ["jeremy.f.stephens@vanderbilt.edu"]

  # Summary and description intentionally carry the same text.
  gem.summary     = "CSV parser with advanced error reporting"
  gem.description = "CSV parser with advanced error reporting"

  # Packaged files: everything tracked by git, split on the record separator.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependency first, then development-only dependencies.
  gem.add_dependency 'treetop'
  gem.add_development_dependency "bundler", "~> 1.3"
  gem.add_development_dependency "rake"
  gem.add_development_dependency "test-unit"
end
|
data/lib/csv_parser.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'treetop'
|
2
|
+
|
3
|
+
require 'csv_parser/version'
|
4
|
+
require 'csv_parser/parser_extensions'
|
5
|
+
require 'csv_parser/result'
|
6
|
+
|
7
|
+
#Treetop.load(File.join(File.dirname(__FILE__), 'csv_parser.treetop'))
|
8
|
+
require 'csv_parser/csv_parser'
|
9
|
+
|
10
|
+
# Public entry point of the library: parses CSV text and reports problems as
# typed errors that carry the position at which they were detected.
module CsvParser
  # Base class for every error produced by CsvParser.
  #
  # Inherits from StandardError (rather than Exception) so that a plain
  # `rescue` catches it; `rescue CsvParser::Error` keeps working unchanged.
  class Error < StandardError
    # Position of the problem, exactly as supplied to the constructor.
    attr_reader :line, :column

    # msg    - human readable description (may be nil)
    # line   - line associated with the problem
    # column - column associated with the problem
    def initialize(msg, line, column)
      super(msg)
      @line = line
      @column = column
    end
  end

  class MissingQuoteError < Error; end
  class StrayQuoteError < Error; end
  class MissingFieldsError < Error; end
  class ExtraFieldsError < Error; end

  # Parses +data+ and returns a Result built from the parsed value and the
  # list of warnings (each converted to an Error instance, not raised).
  #
  # data    - the CSV text handed to the generated parser
  # options - Hash of parser settings; every key is applied by calling the
  #           matching "<key>=" writer on the parser
  #
  # Raises a CsvParser::Error (or subclass) when the parser returns a falsy
  # result; the error is built from the parser's failure_type, failure_line,
  # failure_column and failure_reason.
  def self.parse(data, options = {})
    parser = ::CsvParser::CsvParser.new
    options.each_pair do |key, value|
      parser.send("#{key}=", value)
    end

    result = parser.parse(data)
    unless result
      raise error(parser.failure_type, parser.failure_line,
                  parser.failure_column, parser.failure_reason)
    end

    warnings = parser.warnings.map do |(desc, line, col)|
      error(desc, line, col)
    end
    Result.new(result.value, warnings)
  end

  # Builds (does not raise) the error object for a failure or warning type.
  #
  # type   - :missing_quote, :stray_quote, :missing_fields, :extra_fields, or
  #          anything else for a generic Error
  # line   - line of the problem
  # column - column of the problem
  # msg    - message used only for the generic Error case; the four known
  #          types always generate their own message
  #
  # Returns an instance of Error or one of its subclasses.
  def self.error(type, line, column, msg = nil)
    klass, msg =
      case type
      when :missing_quote
        [MissingQuoteError, "no ending quote found for quote on line #{line}, column #{column}"]
      when :stray_quote
        [StrayQuoteError, "invalid quote found on line #{line}, column #{column}"]
      when :missing_fields
        [MissingFieldsError, "record on line #{line} had too few fields"]
      when :extra_fields
        [ExtraFieldsError, "record on line #{line} had too many fields"]
      else
        # Must be a two-element array: destructuring a bare class would
        # reset msg to nil and drop the caller-supplied reason.
        [Error, msg]
      end

    klass.new(msg, line, column)
  end
end
|
@@ -0,0 +1,232 @@
|
|
1
|
+
module CsvParser
|
2
|
+
grammar Csv
|
3
|
+
include ParserExtensions
|
4
|
+
|
5
|
+
# Entry rule of the grammar: an ordered choice that tries non_empty_records
# first and falls back to empty_records.
rule records
|
6
|
+
non_empty_records / empty_records
|
7
|
+
end
|
8
|
+
|
9
|
+
# A first_record followed by other_records.
# value: an array beginning with first_record's value, with the values from
# other_records appended when other_records yields a non-nil value.
rule non_empty_records
|
10
|
+
first_record
|
11
|
+
other_records
|
12
|
+
{
|
13
|
+
def value
|
14
|
+
arr = [first_record.value]
|
15
|
+
rest = other_records.value
|
16
|
+
# other_records.value is nil when no further records were collected
|
17
|
+
if rest
|
18
|
+
arr.push(*rest)
|
19
|
+
end
|
20
|
+
arr
|
21
|
+
end
|
22
|
+
}
|
23
|
+
end
|
23
|
+
|
24
|
+
# Matches the empty string; its value is an empty array (no records).
rule empty_records
|
25
|
+
''
|
26
|
+
{
|
27
|
+
def value
|
28
|
+
[]
|
29
|
+
end
|
30
|
+
}
|
31
|
+
end
|
32
|
+
|
33
|
+
# The first record of the input. Sets @first_record = true before parsing a
# non_empty_record, then copies @record_length into @first_record_length;
# non_empty_record compares later records against that length.
# value: the value of the wrapped non_empty_record.
rule first_record
|
34
|
+
# both predicates always return true; they exist only for their assignments
'' &{ |s| @first_record = true; true }
|
35
|
+
non_empty_record
|
36
|
+
&{ |s| @first_record_length = @record_length; true }
|
37
|
+
{
|
38
|
+
def value
|
39
|
+
non_empty_record.value
|
40
|
+
end
|
41
|
+
}
|
42
|
+
end
|
43
|
+
|
44
|
+
# Every record after the first. Sets @first_record = false, then picks one of
# two alternatives depending on skip_empty_record? (defined outside this
# grammar file, presumably in ParserExtensions):
#   * skipping:     zero or more groups of record_sep, any number of
#                   "empty_record record_sep" pairs, then a non_empty_record;
#                   optionally followed by trailing "record_sep empty_record"
#                   pairs. Only the non_empty_record values are collected.
#   * not skipping: zero or more "record_sep record" pairs; every record's
#                   value is collected.
# value: delegates to the chosen alternative (elements[2]); each alternative
# returns nil instead of an empty array.
rule other_records
|
45
|
+
'' &{ |s| @first_record = false; true }
|
46
|
+
(
|
47
|
+
(
|
48
|
+
&{ |s| skip_empty_record? }
|
49
|
+
(
|
50
|
+
record_sep
|
51
|
+
( empty_record record_sep )*
|
52
|
+
non_empty_record
|
53
|
+
)*
|
54
|
+
(
|
55
|
+
&{ |s| skip_empty_record? }
|
56
|
+
( record_sep empty_record )+
|
57
|
+
)?
|
58
|
+
{
|
59
|
+
def value
|
60
|
+
# elements[1] is the repetition of record groups in this alternative
val = elements[1].elements.collect { |elt| elt.non_empty_record.value }
|
61
|
+
val.empty? ? nil : val
|
62
|
+
end
|
63
|
+
}
|
64
|
+
)
|
65
|
+
/
|
66
|
+
(
|
67
|
+
&{ |s| !skip_empty_record? }
|
68
|
+
( record_sep record )*
|
69
|
+
{
|
70
|
+
def value
|
71
|
+
val = elements[1].elements.collect { |elt| elt.record.value }
|
72
|
+
val.empty? ? nil : val
|
73
|
+
end
|
74
|
+
}
|
75
|
+
)
|
76
|
+
)
|
77
|
+
{
|
78
|
+
def value
|
79
|
+
elements[2].value
|
80
|
+
end
|
81
|
+
}
|
82
|
+
end
|
83
|
+
|
84
|
+
# A single record: ordered choice of non_empty_record, then empty_record.
rule record
|
85
|
+
non_empty_record / empty_record
|
86
|
+
end
|
87
|
+
|
88
|
+
# One field followed by zero or more "field_sep field" pairs, with the field
# count checked against the first record's length.
#
# State used: @record_length (fields seen so far in this record), @warning
# (pending :extra_fields warning), @first_record / @first_record_length (set
# by first_record and other_records), @failure_type, and `warnings`.
# allow_uneven_records? and `warnings` are defined outside this grammar file
# (presumably in ParserExtensions).
#
# value: array of the field values, in order.
rule non_empty_record
|
89
|
+
first:field
|
90
|
+
&{ |s| @record_length = 1; @warning = nil; true }
|
91
|
+
rest:(
|
92
|
+
# Guard evaluated before each additional field: a field beyond the first
# record's length either records one :extra_fields warning (||= keeps only
# the first) or, when uneven records are not allowed, marks the failure and
# stops the repetition.
&{ |s|
|
93
|
+
if @first_record || @record_length < @first_record_length
|
94
|
+
true
|
95
|
+
else
|
96
|
+
if allow_uneven_records?
|
97
|
+
@warning ||= [:extra_fields, input.line_of(index + 1), input.column_of(index + 1)]
|
98
|
+
true
|
99
|
+
else
|
100
|
+
@failure_type = :extra_fields
|
101
|
+
false
|
102
|
+
end
|
103
|
+
end
|
104
|
+
}
|
105
|
+
field_sep
|
106
|
+
field
|
107
|
+
&{ |s| @record_length += 1; true }
|
108
|
+
)*
|
109
|
+
# Check after the last field: publish any pending :extra_fields warning, or
# handle a record shorter than the first one (:missing_fields warning when
# uneven records are allowed, failure otherwise).
&{ |s|
|
110
|
+
if @first_record || @record_length >= @first_record_length
|
111
|
+
if @warning
|
112
|
+
warnings << @warning
|
113
|
+
end
|
114
|
+
true
|
115
|
+
else
|
116
|
+
if allow_uneven_records?
|
117
|
+
warnings << [:missing_fields, input.line_of(index), input.column_of(index)]
|
118
|
+
true
|
119
|
+
else
|
120
|
+
@failure_type = :missing_fields
|
121
|
+
false
|
122
|
+
end
|
123
|
+
end
|
124
|
+
}
|
125
|
+
{
|
126
|
+
def value
|
127
|
+
arr = [first.value]
|
128
|
+
rest.elements.each do |elt|
|
129
|
+
arr << elt.field.value
|
130
|
+
end
|
131
|
+
arr
|
132
|
+
end
|
133
|
+
}
|
134
|
+
end
|
135
|
+
|
136
|
+
# A record with no content. Matches the empty string, but only when
# allow_empty_record? is true; otherwise it sets @failure_type to
# :missing_fields and does not match.
# value: an empty array.
rule empty_record
|
137
|
+
''
|
138
|
+
&{ |s|
|
139
|
+
if allow_empty_record?
|
140
|
+
true
|
141
|
+
else
|
142
|
+
@failure_type = :missing_fields
|
143
|
+
false
|
144
|
+
end
|
145
|
+
}
|
146
|
+
{
|
147
|
+
def value
|
148
|
+
[]
|
149
|
+
end
|
150
|
+
}
|
151
|
+
end
|
152
|
+
|
153
|
+
# A single field: unquoted_text is tried first, then quoted_text.
# value: the joined text of the matched elements; for quoted text the first
# and last elements (the opening quote and the closing part) are dropped.
rule field
|
154
|
+
unquoted_text
|
155
|
+
{
|
156
|
+
def value
|
157
|
+
elements.map(&:text_value).join
|
158
|
+
end
|
159
|
+
}
|
160
|
+
/
|
161
|
+
quoted_text
|
162
|
+
{
|
163
|
+
def value
|
164
|
+
# 1..-2 skips the surrounding quote elements
elements[1..-2].map(&:text_value).join
|
165
|
+
end
|
166
|
+
}
|
167
|
+
end
|
168
|
+
|
169
|
+
# An opening quote, one or more characters that do not start a quote, then a
# closing quote.
# When no closing quote follows, the second alternative records
# @failure_type = :missing_quote and @failure_index = start_index. Its block
# returns true under a negated `!{ }` predicate, so (assuming Treetop's
# negative-predicate semantics) that alternative never matches; it exists
# only to record the failure.
rule quoted_text
|
170
|
+
quote
|
171
|
+
( !quote . )+
|
172
|
+
(
|
173
|
+
quote
|
174
|
+
/
|
175
|
+
'' !{ |s| @failure_type = :missing_quote; @failure_index = start_index; true }
|
176
|
+
)
|
177
|
+
end
|
178
|
+
|
179
|
+
# One or more characters, each of which must not begin a field_sep, a
# record_sep or a quote.
# When a quote is found, the second inner alternative sets
# @failure_type = :stray_quote. Its block returns true under a negated
# `!{ }` predicate, so (assuming Treetop's negative-predicate semantics) it
# does not match and the character is rejected.
rule unquoted_text
|
180
|
+
(
|
181
|
+
!field_sep
|
182
|
+
!record_sep
|
183
|
+
(
|
184
|
+
!quote
|
185
|
+
/
|
186
|
+
'' !{ |s| @failure_type = :stray_quote; true }
|
187
|
+
)
|
188
|
+
.
|
189
|
+
)+
|
190
|
+
end
|
191
|
+
|
192
|
+
# Matches the field separator one character at a time.
# @field_sep_index is reset to 0, then each consumed character (which must
# not begin a record_sep or a quote) is compared with the character at that
# index of `field_sep`; the index advances on every match. A final predicate
# requires the joined text of the matched elements to equal `field_sep`.
# NOTE(review): inside the Ruby blocks `field_sep` is presumably the
# configured separator string exposed by ParserExtensions, not this rule;
# confirm against parser_extensions.rb.
rule field_sep
|
193
|
+
&{ |s| @field_sep_index = 0; true }
|
194
|
+
(
|
195
|
+
!record_sep
|
196
|
+
!quote
|
197
|
+
.
|
198
|
+
&{ |s|
|
199
|
+
if @field_sep_index < field_sep.length &&
|
200
|
+
# s[2] is the character just consumed by `.` (third element of the sequence)
s[2].text_value == field_sep[@field_sep_index]
|
201
|
+
@field_sep_index += 1
|
202
|
+
true
|
203
|
+
else
|
204
|
+
false
|
205
|
+
end
|
206
|
+
}
|
207
|
+
)+
|
208
|
+
&{ |s| s.map(&:text_value).join == field_sep }
|
209
|
+
end
|
210
|
+
|
211
|
+
# Matches the record separator one character at a time.
# @record_sep_index is reset to 0, then each consumed character is compared
# with the character at that index of `record_sep`; the index advances on
# every match. A final predicate requires the joined text of the matched
# elements to equal `record_sep`.
# NOTE(review): inside the Ruby blocks `record_sep` is presumably the
# configured separator string exposed by ParserExtensions, not this rule;
# confirm against parser_extensions.rb.
rule record_sep
|
212
|
+
&{ |s| @record_sep_index = 0; true }
|
213
|
+
(
|
214
|
+
.
|
215
|
+
&{ |s|
|
216
|
+
if @record_sep_index < record_sep.length &&
|
217
|
+
# s[0] is the character just consumed by `.` (first element of the sequence)
s[0].text_value == record_sep[@record_sep_index]
|
218
|
+
@record_sep_index += 1
|
219
|
+
true
|
220
|
+
else
|
221
|
+
false
|
222
|
+
end
|
223
|
+
}
|
224
|
+
)+
|
225
|
+
&{ |s| s.map(&:text_value).join == record_sep }
|
226
|
+
end
|
227
|
+
|
228
|
+
# Matches one character that is not a backslash and whose first character
# equals the first character of quote_char (defined outside this grammar
# file, presumably in ParserExtensions).
rule quote
|
229
|
+
!"\\" . &{ |s| s[1].text_value[0] == quote_char[0] }
|
230
|
+
end
|
231
|
+
end
|
232
|
+
end
|