wtf_csv 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +4 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +31 -0
- data/README.md +44 -0
- data/Rakefile +19 -0
- data/lib/wtf_csv/version.rb +3 -0
- data/lib/wtf_csv/wtf_csv.rb +206 -0
- data/lib/wtf_csv.rb +3 -0
- data/spec/fixtures/check_column_count_fixed.csv +5 -0
- data/spec/fixtures/check_column_count_smart_threshold.csv +5 -0
- data/spec/fixtures/encoding.csv +4 -0
- data/spec/fixtures/escape_char.csv +4 -0
- data/spec/fixtures/escape_char_equals_quote_char.csv +4 -0
- data/spec/fixtures/ignore_string.csv +6 -0
- data/spec/fixtures/max_chars_in_field_n.csv +4 -0
- data/spec/fixtures/max_chars_in_field_rn.csv +4 -0
- data/spec/fixtures/pipe_delimited.csv +4 -0
- data/spec/fixtures/quote_char.csv +3 -0
- data/spec/fixtures/quoted_fields_n.csv +4 -0
- data/spec/fixtures/quoted_fields_r.csv +1 -0
- data/spec/fixtures/quoted_fields_rn.csv +4 -0
- data/spec/fixtures/quoted_newlines.csv +4 -0
- data/spec/spec_helper.rb +12 -0
- data/spec/wtf_csv/test_spec.rb +248 -0
- data/wtf_csv.gemspec +21 -0
- metadata +100 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 94b4fdd4e3201ed522d4867b95a5d93d4327b00e
|
4
|
+
data.tar.gz: 6782a9fe9282463989816d7532dbd6f5862412bd
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d28528baac8f1d4dec98c5085a363595ad6930710d28d46f64dda87398b3ec872149c0bab36a8f0fe761953d751e00097632b5a62e23b5e7553127c26715b8a8
|
7
|
+
data.tar.gz: 77ab306c01fc249efe80b2f27976475da2c111348d490257ec2a2188ae5404b2f2bcb320ff3f36a0f5b6e071523055becaac752212015d055e460ecb3a4d825b
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
wtf_csv (0.0.0)
|
5
|
+
smarter_csv
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: https://rubygems.org/
|
9
|
+
specs:
|
10
|
+
diff-lcs (1.2.5)
|
11
|
+
rspec (3.3.0)
|
12
|
+
rspec-core (~> 3.3.0)
|
13
|
+
rspec-expectations (~> 3.3.0)
|
14
|
+
rspec-mocks (~> 3.3.0)
|
15
|
+
rspec-core (3.3.2)
|
16
|
+
rspec-support (~> 3.3.0)
|
17
|
+
rspec-expectations (3.3.1)
|
18
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
19
|
+
rspec-support (~> 3.3.0)
|
20
|
+
rspec-mocks (3.3.2)
|
21
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
22
|
+
rspec-support (~> 3.3.0)
|
23
|
+
rspec-support (3.3.0)
|
24
|
+
smarter_csv (1.1.0)
|
25
|
+
|
26
|
+
PLATFORMS
|
27
|
+
ruby
|
28
|
+
|
29
|
+
DEPENDENCIES
|
30
|
+
rspec
|
31
|
+
wtf_csv!
|
data/README.md
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
# wtf_csv
|
2
|
+
Ruby gem to detect formatting issues in a CSV
|
3
|
+
|
4
|
+
The CSV file format is meant to be an easy way to transport data. Anyone who has had to maintain an import process, however, knows that it's easy to mess up. Usually the entire landscape looks like this:
|
5
|
+
1. An importer expects CSV files to be provided in some specific format
|
6
|
+
2. The files are given in a different format
|
7
|
+
3. The import fails; or even worse, the import succeeds but the data is mangled
|
8
|
+
4. Some poor souls must dig through the CSV file to figure out what happened. Usually issues are related to bad cell quoting, inconsistent column counts, etc.
|
9
|
+
|
10
|
+
This gem seeks to make this process less terrible by providing a way to easily surface common formatting issues on a CSV file.
|
11
|
+
|
12
|
+
`WtfCSV.scan` has the following options:
|
13
|
+
```
|
14
|
+
|---------------------------------|----------|--------------------------------------------------------------------------------------|
|
15
|
+
| Option | Default | Explanation |
|
16
|
+
|---------------------------------|----------|--------------------------------------------------------------------------------------|
|
17
|
+
| :col_sep | ',' | Column separator |
|
18
|
+
| :row_sep | $/ ,"\n" | Row separator - defaults to system's $/ , which defaults to "\n" |
|
19
|
+
| | | This can also be set to :auto, but will process the whole csv file first (slow!) |
|
20
|
+
| :quote_char | '"' | Quotation character |
|
21
|
+
| :escape_char | '\' | Character to escape quotes |
|
22
|
+
|---------------------------------|----------|--------------------------------------------------------------------------------------|
|
23
|
+
| :check_col_count | true | If set, checks for issues in the number of columns that are present |
|
24
|
+
| :num_cols | 0 | If :check_col_count is set and this value is non-zero, will return errors if any |
|
25
|
+
| | | line does not have this number of columns |
|
26
|
+
| :col_threshold | 80 | If :check_col_count is set, this is the percentage of rows that must have a column |
|
27
|
+
| | | count in order for the module to assume this is the target number of columns. |
|
28
|
+
| | | For example, if there are 10 lines in the file, and this value is set to 80, then |
|
29
|
+
| | | at least 8 lines must have a certain number of columns for the module to assume |
|
30
|
+
| | | this is the number of columns that rows are supposed to have |
|
31
|
+
|---------------------------------|----------|--------------------------------------------------------------------------------------|
|
32
|
+
| :ignore_string | nil | If a line is equal to this string, the line will not be checked for issues |
|
33
|
+
|---------------------------------|----------|--------------------------------------------------------------------------------------|
|
34
|
+
| :allow_row_sep_in_quoted_fields | false | Allows :row_sep characters to be present in quoted fields. Otherwise if there are |
|
35
|
+
| | | line ending characters in a field, they will be treated as sequential lines and you'll |
|
36
|
+
| | | likely receive column count errors (if you're checking for them) |
|
37
|
+
|---------------------------------|----------|--------------------------------------------------------------------------------------|
|
38
|
+
| :max_chars_in_field | nil | Ensures that fields have less than or equal to the provided number of characters |
|
39
|
+
|---------------------------------|----------|--------------------------------------------------------------------------------------|
|
40
|
+
| :file_encoding | 'utf-8' | Set the file encoding |
|
41
|
+
|---------------------------------|----------|--------------------------------------------------------------------------------------|
|
42
|
+
```
|
43
|
+
|
44
|
+
If you happen upon this, know that this is in development - though it should be very stable at this point. Soon this will be available on https://rubygems.org, where you'll be able to install it with `gem install wtf_csv` or by adding `gem 'wtf_csv'` to your Gemfile. Until then, feel free to install from source and bundle it into a gem yourself - just give credit where it is due.
|
data/Rakefile
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'rake'
|
6
|
+
|
7
|
+
require 'rspec/core/rake_task'
|
8
|
+
|
9
|
+
desc "Run RSpec"
|
10
|
+
RSpec::Core::RakeTask.new do |t|
|
11
|
+
t.verbose = false
|
12
|
+
end
|
13
|
+
|
14
|
+
desc "Run specs for all test cases"
|
15
|
+
task :spec_all do
|
16
|
+
system "rake spec"
|
17
|
+
end
|
18
|
+
|
19
|
+
task :default => :spec
|
@@ -0,0 +1,206 @@
|
|
1
|
+
module WtfCSV
  # Scans a delimited text file and reports formatting problems without
  # loading the whole file into memory.
  #
  # Recognised options (all optional):
  #   :col_sep                        - column separator (default ',')
  #   :row_sep                        - row separator (default $/); :auto asks
  #                                     SmarterCSV.guess_line_ending to detect it
  #   :quote_char                     - quotation character (default '"')
  #   :escape_char                    - character that escapes a quote (default '\')
  #   :check_col_count                - collect column-count errors (default true)
  #   :num_cols                       - exact number of columns expected; 0 means
  #                                     "infer it using :col_threshold"
  #   :col_threshold                  - percentage of rows that must share a column
  #                                     count for it to be treated as the target
  #   :ignore_string                  - a line equal to this string is not checked
  #   :allow_row_sep_in_quoted_fields - let quoted fields span several lines
  #   :max_chars_in_field             - report fields longer than this
  #   :file_encoding                  - encoding used to open the file
  #
  # If a block is given it is called with an Integer percentage each time the
  # progress through the file grows by at least one percentage point.
  #
  # Returns a Hash with these keys:
  #   :quote_errors    - [line, column, field_text]
  #   :encoding_errors - [line, column]
  #   :column_errors   - [line, columns_found, columns_expected], or
  #                      [column_count, number_of_rows] when no target column
  #                      count could be determined
  #   :length_errors   - [line, column, field_length]
  # Line and column numbers are 1-based.
  def self.scan(file, options = {}, &block)
    default_options = {
      :col_sep => ',',
      :row_sep => $/,
      :quote_char => '"',
      :escape_char => '\\',
      :check_col_count => true,
      :col_threshold => 80,
      :num_cols => 0,
      :ignore_string => nil,
      :allow_row_sep_in_quoted_fields => false,
      :max_chars_in_field => nil,
      :file_encoding => 'utf-8',
    }
    options = default_options.merge(options)

    col_sep         = options[:col_sep]
    quote_char      = options[:quote_char]
    escape_opt      = options[:escape_char]
    ignore_string   = options[:ignore_string]
    max_chars       = options[:max_chars_in_field]
    allow_multiline = options[:allow_row_sep_in_quoted_fields]
    check_cols      = options[:check_col_count]
    check_length    = !max_chars.nil?

    quote_errors    = []
    encoding_errors = []
    column_errors   = []
    length_errors   = []

    # column count => list of 1-based line numbers having that many columns
    rows_by_col_count = {}
    line_number = 0

    # Block form guarantees the handle is closed even if scanning raises.
    File.open(file, "r:#{options[:file_encoding]}") do |f|
      if options[:row_sep] == :auto
        options[:row_sep] = SmarterCSV.guess_line_ending(f, options)
        f.rewind
      end
      row_sep = options[:row_sep]

      # Progress needs the total number of physical lines. Count them with the
      # actual row separator instead of shelling out to `wc -l`, which only
      # counts "\n" and required interpolating the path into a shell command.
      total_lines = 0
      if block_given?
        total_lines = f.each_line(row_sep).count
        f.rewind
      end

      percent_done = 0
      col_number = 0
      field_length = 0
      previous_line = ''          # text of a field carried over from earlier lines
      last_line_ended_quoted = false

      # Per-row parser state. Declared here so it survives across physical
      # lines when a quoted field spans several of them.
      is_quoted = false
      new_col = true
      quote_has_ended = false
      quote_error = false
      escape_pending = false
      pos_start = 0

      until f.eof?
        # Pass the separator explicitly rather than temporarily overwriting
        # the global $/.
        line = f.readline(row_sep)

        # Reported outside the per-line rescue below so that an error raised
        # by the caller's block is not silently discarded.
        if block_given? && total_lines > 0
          current_percent = ((line_number.to_f / total_lines) * 100).to_i
          if current_percent > percent_done
            percent_done = current_percent
            yield percent_done
          end
        end

        begin
          line.chomp!(row_sep)

          next if !ignore_string.nil? && line == ignore_string

          if allow_multiline && last_line_ended_quoted
            # This physical line continues the previous logical row: undo the
            # line increment and keep the parser state as it was.
            line_number -= 1
            last_line_ended_quoted = false
            field_length += row_sep.length if check_length
          else
            is_quoted = false
            new_col = true
            quote_has_ended = false
            quote_error = false
            escape_pending = false
            col_number = 0
          end
          pos_start = 0

          line.each_char.with_index do |char, position|
            # A character that is invalid in the file's encoding is recorded
            # and otherwise ignored by the parser.
            unless char.valid_encoding?
              encoding_errors.push([line_number + 1, col_number + 1])
              next
            end

            field_length += 1 if check_length

            # When the escape and quote characters are the same, a pending
            # "escape" followed by anything but another quote was really a
            # quote character, so apply it as one now.
            if escape_pending && escape_opt == quote_char && char != quote_char
              escape_pending = false
              is_quoted = !is_quoted
              if !is_quoted
                quote_has_ended = true
              elsif !new_col
                quote_error = true
                is_quoted = false
              end
            end

            if char != quote_char && char != col_sep && char != escape_opt
              new_col = false
              # ordinary text after a closing quote is malformed
              quote_error = true if quote_has_ended
            elsif char == quote_char && escape_pending
              escape_pending = false
            elsif char == escape_opt
              escape_pending = true
            elsif char == quote_char && is_quoted
              quote_has_ended = true
              is_quoted = false
            elsif char == quote_char
              if new_col
                is_quoted = true
                new_col = false
              else
                # an opening quote in the middle of an unquoted field
                quote_error = true
              end
            elsif char == col_sep && !is_quoted
              if quote_error
                # Exclusive range: yields '' rather than the whole line when
                # the field is empty at position 0.
                quote_errors.push([line_number + 1, col_number + 1,
                                   "#{previous_line}#{line[pos_start...position]}"])
                quote_error = false
              end
              if check_length
                # field_length already counted the separator itself, hence - 1
                length_errors.push([line_number + 1, col_number + 1, field_length - 1]) if (field_length - 1) > max_chars
                field_length = 0
              end
              new_col = true
              quote_has_ended = false
              previous_line = ''
              pos_start = position + 1
              col_number += 1
            end
          end

          # A trailing quote that was held back as a possible escape.
          if escape_pending && escape_opt == quote_char
            if !new_col && !is_quoted
              quote_error = true
            else
              is_quoted = !is_quoted
            end
          end

          if is_quoted
            if allow_multiline
              # The field continues on the next physical line; remember what
              # we have seen of it so far.
              last_line_ended_quoted = true
              previous_line = "#{previous_line}#{line[pos_start...line.length]}#{row_sep}"
              next
            else
              quote_error = true
            end
          end

          if quote_error
            # Include text carried over from earlier lines, matching the
            # mid-line report above.
            quote_errors.push([line_number + 1, col_number + 1,
                               "#{previous_line}#{line[pos_start..-1]}"])
          end
          # The row is complete: carried-over text must not leak into the next row.
          previous_line = ''

          if check_length
            length_errors.push([line_number + 1, col_number + 1, field_length]) if field_length > max_chars
            field_length = 0
          end

          if check_cols
            # Always store the 1-based line number.
            (rows_by_col_count[col_number + 1] ||= []).push(line_number + 1)
          end
        rescue StandardError
          # Best effort: a line that cannot be analysed is skipped. Only
          # StandardError is rescued so signals and exit requests still propagate.
        ensure
          line_number += 1
        end
      end
    end

    if check_cols
      # [[column_count, [line, ...]], ...] ordered from most to least common
      column_counts = rows_by_col_count.to_a
      column_counts.sort_by! { |val| val[1].length }
      column_counts.reverse!

      if options[:num_cols] != 0
        # an absolute number of columns was requested
        column_counts.each do |val|
          next if val[0] == options[:num_cols]
          val[1].each { |row| column_errors.push([row, val[0], options[:num_cols]]) }
        end
      elsif column_counts.length > 1
        # otherwise try to infer the target column count via :col_threshold
        if column_counts[0][1].length >= line_number * (options[:col_threshold].to_f / 100)
          target = column_counts[0][0]
          column_counts.drop(1).each do |val|
            val[1].each { |row| column_errors.push([row, val[0], target]) }
          end
        else
          # no dominant column count: report how many rows have each count
          column_counts.each { |val| column_errors.push([val[0], val[1].length]) }
        end
      end
    end

    {
      quote_errors: quote_errors,
      encoding_errors: encoding_errors,
      column_errors: column_errors,
      length_errors: length_errors
    }
  end
end
|
data/lib/wtf_csv.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
"animal","sound",color,size
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,248 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
fixture_path = 'spec/fixtures'
|
4
|
+
# writing some files with special characters:
|
5
|
+
# File.write("#{fixture_path}/quoted_fields_r.csv", "\"animal\",\"sound\",color,size\r\"cat\",meow,\"calico\",small\rdog,\"bark\",\"golden\",medium\rhorse,whinny,\"brown\",\"large\"")
|
6
|
+
# File.write("#{fixture_path}/max_chars_in_field_rn.csv", "\"animal\",\"sound\",color,\"\\\"size\r\ncat\",meow,\"calico\",small_and_this_is_a_long_field_with_44_chars\r\ndog,\"field_with_26_characters\",\"golden,medium\r\nhorse,whinny\\\"\",\"brown\",\"large\"")
|
7
|
+
# File.write("#{fixture_path}/encoding.csv", "\"animal\",\"sound\",co\255lor,\"\\\"size\ncat\",meow,\"calico\",small_and_this_is_a_long_field_with_44_chars\ndog,\"field_with_26_charact\255ers\"\255,\"golden,medium\nhorse,whinny\\\"\",\"brown\",\"large\"")
|
8
|
+
|
9
|
+
describe 'a file with \n line endings' do
|
10
|
+
it 'should have no errors' do
|
11
|
+
options = {:row_sep => "\n"}
|
12
|
+
output = WtfCSV.scan("#{fixture_path}/quoted_fields_n.csv", options)
|
13
|
+
output[:quote_errors].length.should be == 0
|
14
|
+
output[:column_errors].length.should be == 0
|
15
|
+
output[:length_errors].length.should be == 0
|
16
|
+
output[:encoding_errors].length.should be == 0
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
describe 'a file with \r line endings' do
|
21
|
+
it 'should have no errors' do
|
22
|
+
options = {:row_sep => "\r"}
|
23
|
+
output = WtfCSV.scan("#{fixture_path}/quoted_fields_r.csv", options)
|
24
|
+
output[:quote_errors].length.should be == 0
|
25
|
+
output[:column_errors].length.should be == 0
|
26
|
+
output[:length_errors].length.should be == 0
|
27
|
+
output[:encoding_errors].length.should be == 0
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
describe 'a file with \r\n line endings' do
|
32
|
+
it 'should have no errors' do
|
33
|
+
options = {:row_sep => "\r\n"}
|
34
|
+
output = WtfCSV.scan("#{fixture_path}/quoted_fields_rn.csv", options)
|
35
|
+
output[:quote_errors].length.should be == 0
|
36
|
+
output[:column_errors].length.should be == 0
|
37
|
+
output[:length_errors].length.should be == 0
|
38
|
+
output[:encoding_errors].length.should be == 0
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
describe 'a file with pipe delimiters' do
|
43
|
+
it 'should have no errors' do
|
44
|
+
options = {:col_sep => '|'}
|
45
|
+
output = WtfCSV.scan("#{fixture_path}/pipe_delimited.csv", options)
|
46
|
+
output[:quote_errors].length.should be == 0
|
47
|
+
output[:column_errors].length.should be == 0
|
48
|
+
output[:length_errors].length.should be == 0
|
49
|
+
output[:encoding_errors].length.should be == 0
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
describe 'a file with custom quote characters' do
|
54
|
+
it 'should behave correctly' do
|
55
|
+
options = {:quote_char => '+'}
|
56
|
+
output = WtfCSV.scan("#{fixture_path}/quote_char.csv", options)
|
57
|
+
output[:quote_errors].length.should be == 2
|
58
|
+
if output[:quote_errors].length == 2
|
59
|
+
output[:quote_errors].should include [2,1,"++cat++"]
|
60
|
+
output[:quote_errors].should include [3,4,"+medium+a"]
|
61
|
+
end
|
62
|
+
output[:column_errors].length.should be == 0
|
63
|
+
output[:length_errors].length.should be == 0
|
64
|
+
output[:encoding_errors].length.should be == 0
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
describe 'a file with custom escape character' do
|
69
|
+
it 'should behave correctly' do
|
70
|
+
options = {:escape_char => '='}
|
71
|
+
output = WtfCSV.scan("#{fixture_path}/escape_char.csv", options)
|
72
|
+
output[:quote_errors].length.should be == 0
|
73
|
+
output[:column_errors].length.should be == 0
|
74
|
+
output[:length_errors].length.should be == 0
|
75
|
+
output[:encoding_errors].length.should be == 0
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
describe 'a file with the same escape and quote characters' do
|
80
|
+
it 'should behave correctly' do
|
81
|
+
options = {:escape_char => '"', :quote_char => '"'}
|
82
|
+
output = WtfCSV.scan("#{fixture_path}/escape_char_equals_quote_char.csv", options)
|
83
|
+
output[:quote_errors].length.should be == 1
|
84
|
+
if output[:quote_errors].length == 1
|
85
|
+
output[:quote_errors].should include [2,4,'"""small""']
|
86
|
+
end
|
87
|
+
output[:column_errors].length.should be == 0
|
88
|
+
output[:length_errors].length.should be == 0
|
89
|
+
output[:encoding_errors].length.should be == 0
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
describe 'checking a file for a fixed number of columns in each row' do
|
94
|
+
it 'should catch errors' do
|
95
|
+
options = {:num_cols => 4}
|
96
|
+
output = WtfCSV.scan("#{fixture_path}/check_column_count_fixed.csv", options)
|
97
|
+
output[:quote_errors].length.should be == 0
|
98
|
+
output[:column_errors].length.should be == 2
|
99
|
+
if output[:column_errors].length == 2
|
100
|
+
output[:column_errors].should include [2,5,4]
|
101
|
+
output[:column_errors].should include [5,3,4]
|
102
|
+
end
|
103
|
+
output[:length_errors].length.should be == 0
|
104
|
+
output[:encoding_errors].length.should be == 0
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
describe 'checking a file for column counts and determining the number of columns with a threshold' do
|
109
|
+
it 'should only return the counts of how many rows have different numbers of columns if under the threshold' do
|
110
|
+
options = {:col_threshold => 70}
|
111
|
+
output = WtfCSV.scan("#{fixture_path}/check_column_count_smart_threshold.csv", options)
|
112
|
+
output[:quote_errors].length.should be == 0
|
113
|
+
output[:column_errors].length.should be == 3
|
114
|
+
if output[:column_errors].length == 3
|
115
|
+
output[:column_errors].should include [4,3]
|
116
|
+
output[:column_errors].should include [5,1]
|
117
|
+
output[:column_errors].should include [3,1]
|
118
|
+
end
|
119
|
+
output[:length_errors].length.should be == 0
|
120
|
+
output[:encoding_errors].length.should be == 0
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
describe 'checking a file for column counts and determining the number of columns with a threshold' do
|
125
|
+
it 'should determine the target number of columns using a threshold' do
|
126
|
+
options = {:col_threshold => 60}
|
127
|
+
output = WtfCSV.scan("#{fixture_path}/check_column_count_smart_threshold.csv", options)
|
128
|
+
output[:quote_errors].length.should be == 0
|
129
|
+
output[:column_errors].length.should be == 2
|
130
|
+
if output[:column_errors].length == 2
|
131
|
+
output[:column_errors].should include [2,5,4]
|
132
|
+
output[:column_errors].should include [5,3,4]
|
133
|
+
end
|
134
|
+
output[:length_errors].length.should be == 0
|
135
|
+
output[:encoding_errors].length.should be == 0
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
describe 'using the :ignore_string option' do
|
140
|
+
it 'should skip the line that matches this string' do
|
141
|
+
options = {:ignore_string => 'ignore_this_line',
|
142
|
+
:col_threshold => 60}
|
143
|
+
output = WtfCSV.scan("#{fixture_path}/ignore_string.csv", options)
|
144
|
+
output[:quote_errors].length.should be == 0
|
145
|
+
output[:column_errors].length.should be == 1
|
146
|
+
if output[:column_errors].length == 1
|
147
|
+
output[:column_errors].should include [5,1,4]
|
148
|
+
end
|
149
|
+
output[:length_errors].length.should be == 0
|
150
|
+
output[:encoding_errors].length.should be == 0
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
describe 'a file with quoted newlines' do
|
155
|
+
it 'should provide accurate errors if :allow_row_sep_in_quoted_fields is not set to true' do
|
156
|
+
options = {:col_threshold => 70}
|
157
|
+
output = WtfCSV.scan("#{fixture_path}/quoted_newlines.csv", options)
|
158
|
+
output[:quote_errors].length.should be == 4
|
159
|
+
if output[:quote_errors].length == 4
|
160
|
+
output[:quote_errors].should include [1,4,'"\"size']
|
161
|
+
output[:quote_errors].should include [2,1,'cat"']
|
162
|
+
output[:quote_errors].should include [3,3,'"golden,medium']
|
163
|
+
output[:quote_errors].should include [4,2,'whinny"']
|
164
|
+
end
|
165
|
+
output[:column_errors].length.should be == 1
|
166
|
+
if output[:column_errors].length == 1
|
167
|
+
output[:column_errors].should include [3,3,4]
|
168
|
+
end
|
169
|
+
output[:length_errors].length.should be == 0
|
170
|
+
output[:encoding_errors].length.should be == 0
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
174
|
+
describe 'a file with quoted newlines' do
|
175
|
+
it 'should allow the quote newlines if :allow_row_sep_in_quoted_fields is set to true' do
|
176
|
+
options = {:allow_row_sep_in_quoted_fields => true}
|
177
|
+
output = WtfCSV.scan("#{fixture_path}/quoted_newlines.csv", options)
|
178
|
+
output[:quote_errors].length.should be == 0
|
179
|
+
output[:column_errors].length.should be == 2
|
180
|
+
if output[:column_errors].length == 2
|
181
|
+
output[:column_errors].should include [7,1]
|
182
|
+
output[:column_errors].should include [5,1]
|
183
|
+
end
|
184
|
+
output[:length_errors].length.should be == 0
|
185
|
+
output[:encoding_errors].length.should be == 0
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
describe 'a file that has greater than the number of characters in :max_chars_in_field in a single field' do
|
190
|
+
it 'should give appropriate errors, and should count a \n in the field as a single character' do
|
191
|
+
options = {:max_chars_in_field => 15,
|
192
|
+
:allow_row_sep_in_quoted_fields => true,
|
193
|
+
:check_col_count => false}
|
194
|
+
output = WtfCSV.scan("#{fixture_path}/max_chars_in_field_n.csv", options)
|
195
|
+
output[:quote_errors].length.should be == 0
|
196
|
+
output[:column_errors].length.should be == 0
|
197
|
+
output[:length_errors].length.should be == 3
|
198
|
+
if output[:length_errors].length == 3
|
199
|
+
output[:length_errors].should include [1,7,44]
|
200
|
+
output[:length_errors].should include [2,2,26]
|
201
|
+
output[:length_errors].should include [2,3,30]
|
202
|
+
end
|
203
|
+
output[:encoding_errors].length.should be == 0
|
204
|
+
end
|
205
|
+
end
|
206
|
+
|
207
|
+
describe 'a file that has greater than the number of characters in :max_chars_in_field in a single field' do
|
208
|
+
it 'should give appropriate errors, and should count a \r\n in the field as two characters' do
|
209
|
+
options = {:max_chars_in_field => 15,
|
210
|
+
:allow_row_sep_in_quoted_fields => true,
|
211
|
+
:row_sep => "\r\n",
|
212
|
+
:check_col_count => false}
|
213
|
+
output = WtfCSV.scan("#{fixture_path}/max_chars_in_field_rn.csv", options)
|
214
|
+
output[:quote_errors].length.should be == 0
|
215
|
+
output[:column_errors].length.should be == 0
|
216
|
+
output[:length_errors].length.should be == 3
|
217
|
+
if output[:length_errors].length == 3
|
218
|
+
output[:length_errors].should include [1,7,44]
|
219
|
+
output[:length_errors].should include [2,2,26]
|
220
|
+
output[:length_errors].should include [2,3,31]
|
221
|
+
end
|
222
|
+
output[:encoding_errors].length.should be == 0
|
223
|
+
end
|
224
|
+
end
|
225
|
+
|
226
|
+
describe 'a file that has encoding errors' do
|
227
|
+
it 'should give appropriate errors' do
|
228
|
+
options = {:allow_row_sep_in_quoted_fields => true,
|
229
|
+
:check_col_count => false}
|
230
|
+
output = WtfCSV.scan("#{fixture_path}/encoding.csv", options)
|
231
|
+
output[:encoding_errors]
|
232
|
+
output[:quote_errors].length.should be == 0
|
233
|
+
output[:column_errors].length.should be == 0
|
234
|
+
output[:length_errors].length.should be == 0
|
235
|
+
output[:encoding_errors].length.should be == 3
|
236
|
+
if output[:encoding_errors].length == 3
|
237
|
+
output[:encoding_errors].should include [1,3]
|
238
|
+
output[:encoding_errors].should include [2,2]
|
239
|
+
output[:encoding_errors].each_with_index do |err, idx|
|
240
|
+
if err == [2,2]
|
241
|
+
output[:encoding_errors].delete_at(idx)
|
242
|
+
break
|
243
|
+
end
|
244
|
+
end
|
245
|
+
output[:encoding_errors].should include [2,2]
|
246
|
+
end
|
247
|
+
end
|
248
|
+
end
|
data/wtf_csv.gemspec
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/wtf_csv/version', __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |s|
|
5
|
+
s.name = 'wtf_csv'
|
6
|
+
s.version = WtfCSV::VERSION
|
7
|
+
s.date = '2015-09-11'
|
8
|
+
s.summary = %q{Ruby gem to detect formatting issues in a CSV}
|
9
|
+
s.description = %q{Ruby gem to detect formatting issues in a CSV. Can find quoting issues, incorrect column counts, and can properly handle quote-escaped line endings.}
|
10
|
+
s.authors = ["Greg Merritt"]
|
11
|
+
s.email = ["greg@evertrue.com"]
|
12
|
+
s.homepage = 'https://github.com/gremerritt/wtf_csv'
|
13
|
+
s.files = `git ls-files`.split($\)
|
14
|
+
s.require_paths = ["lib"]
|
15
|
+
s.licenses = ['MIT']
|
16
|
+
|
17
|
+
s.add_runtime_dependency 'smarter_csv'
|
18
|
+
s.add_development_dependency 'rspec'
|
19
|
+
|
20
|
+
s.requirements << 'smarter_csv'
|
21
|
+
end
|
metadata
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wtf_csv
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Greg Merritt
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-09-11 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: smarter_csv
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: Ruby gem to detect formatting issues in a CSV. Can find quoting issues,
|
42
|
+
incorrect column counts, and can properly handle quote-escaped line endings.
|
43
|
+
email:
|
44
|
+
- greg@evertrue.com
|
45
|
+
executables: []
|
46
|
+
extensions: []
|
47
|
+
extra_rdoc_files: []
|
48
|
+
files:
|
49
|
+
- ".gitignore"
|
50
|
+
- ".rspec"
|
51
|
+
- Gemfile
|
52
|
+
- Gemfile.lock
|
53
|
+
- README.md
|
54
|
+
- Rakefile
|
55
|
+
- lib/wtf_csv.rb
|
56
|
+
- lib/wtf_csv/version.rb
|
57
|
+
- lib/wtf_csv/wtf_csv.rb
|
58
|
+
- spec/fixtures/check_column_count_fixed.csv
|
59
|
+
- spec/fixtures/check_column_count_smart_threshold.csv
|
60
|
+
- spec/fixtures/encoding.csv
|
61
|
+
- spec/fixtures/escape_char.csv
|
62
|
+
- spec/fixtures/escape_char_equals_quote_char.csv
|
63
|
+
- spec/fixtures/ignore_string.csv
|
64
|
+
- spec/fixtures/max_chars_in_field_n.csv
|
65
|
+
- spec/fixtures/max_chars_in_field_rn.csv
|
66
|
+
- spec/fixtures/pipe_delimited.csv
|
67
|
+
- spec/fixtures/quote_char.csv
|
68
|
+
- spec/fixtures/quoted_fields_n.csv
|
69
|
+
- spec/fixtures/quoted_fields_r.csv
|
70
|
+
- spec/fixtures/quoted_fields_rn.csv
|
71
|
+
- spec/fixtures/quoted_newlines.csv
|
72
|
+
- spec/spec_helper.rb
|
73
|
+
- spec/wtf_csv/test_spec.rb
|
74
|
+
- wtf_csv.gemspec
|
75
|
+
homepage: https://github.com/gremerritt/wtf_csv
|
76
|
+
licenses:
|
77
|
+
- MIT
|
78
|
+
metadata: {}
|
79
|
+
post_install_message:
|
80
|
+
rdoc_options: []
|
81
|
+
require_paths:
|
82
|
+
- lib
|
83
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: '0'
|
93
|
+
requirements:
|
94
|
+
- smarter_csv
|
95
|
+
rubyforge_project:
|
96
|
+
rubygems_version: 2.0.3
|
97
|
+
signing_key:
|
98
|
+
specification_version: 4
|
99
|
+
summary: Ruby gem to detect formatting issues in a CSV
|
100
|
+
test_files: []
|