wtf_csv 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 94b4fdd4e3201ed522d4867b95a5d93d4327b00e
4
+ data.tar.gz: 6782a9fe9282463989816d7532dbd6f5862412bd
5
+ SHA512:
6
+ metadata.gz: d28528baac8f1d4dec98c5085a363595ad6930710d28d46f64dda87398b3ec872149c0bab36a8f0fe761953d751e00097632b5a62e23b5e7553127c26715b8a8
7
+ data.tar.gz: 77ab306c01fc249efe80b2f27976475da2c111348d490257ec2a2188ae5404b2f2bcb320ff3f36a0f5b6e071523055becaac752212015d055e460ecb3a4d825b
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.json
2
+ lib/*.csv
3
+ *.py
4
+ *.gem
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+ # required gems in .gemspec
3
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,31 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ wtf_csv (0.0.0)
5
+ smarter_csv
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ diff-lcs (1.2.5)
11
+ rspec (3.3.0)
12
+ rspec-core (~> 3.3.0)
13
+ rspec-expectations (~> 3.3.0)
14
+ rspec-mocks (~> 3.3.0)
15
+ rspec-core (3.3.2)
16
+ rspec-support (~> 3.3.0)
17
+ rspec-expectations (3.3.1)
18
+ diff-lcs (>= 1.2.0, < 2.0)
19
+ rspec-support (~> 3.3.0)
20
+ rspec-mocks (3.3.2)
21
+ diff-lcs (>= 1.2.0, < 2.0)
22
+ rspec-support (~> 3.3.0)
23
+ rspec-support (3.3.0)
24
+ smarter_csv (1.1.0)
25
+
26
+ PLATFORMS
27
+ ruby
28
+
29
+ DEPENDENCIES
30
+ rspec
31
+ wtf_csv!
data/README.md ADDED
@@ -0,0 +1,44 @@
1
+ # wtf_csv
2
+ Ruby gem to detect formatting issues in a CSV
3
+
4
+ The CSV file format is meant to be an easy way to transport data. Anyone who has had to maintain an import process, however, knows that it's easy to mess up. Usually the entire landscape looks like this:
5
+ 1. An importer expects CSV files to be provided in some specific format
6
+ 2. The files are given in a different format
7
+ 3. The import fails; or even worse, the import succeeds but the data is mangled
8
+ 4. Some poor souls must dig through the CSV file to figure out what happened. Usually issues are related to bad cell quoting, inconsistent column counts, etc.
9
+
10
+ This gem seeks to make this process less terrible by providing a way to easily surface common formatting issues on a CSV file.
11
+
12
+ `WtfCSV.scan` has the following options:
13
+ ```
14
+ |---------------------------------|----------|--------------------------------------------------------------------------------------|
15
+ | Option | Default | Explanation |
16
+ |---------------------------------|----------|--------------------------------------------------------------------------------------|
17
+ | :col_sep | ',' | Column separator |
18
+ | :row_sep | $/ ,"\n" | Row separator - defaults to system's $/ , which defaults to "\n" |
19
+ | | | This can also be set to :auto, but will process the whole csv file first (slow!) |
20
+ | :quote_char | '"' | Quotation character |
21
+ | :escape_char | '\' | Character to escape quotes |
22
+ |---------------------------------|----------|--------------------------------------------------------------------------------------|
23
+ | :check_col_count | true | If set, checks for issues in the number of columns that are present |
24
+ | :num_cols | 0 | If :check_col_count is set and this value is non-zero, will return errors if any |
25
+ | | | line does not have this number of columns |
26
+ | :col_threshold | 80 | If :check_col_count is set, this is the percentage of rows that must have a column |
27
+ | | | count in order for the module to assume this is the target number of columns. |
28
+ | | | For example, if there are 10 lines in the file, and this value is set to 80, then |
29
+ | | | at least 8 lines must have a certain number of columns for the module to assume |
30
+ | | | this is the number of columns that rows are supposed to have |
31
+ |---------------------------------|----------|--------------------------------------------------------------------------------------|
32
+ | :ignore_string | nil | If a line is equal to this string, the line will not be checked for issues |
33
+ |---------------------------------|----------|--------------------------------------------------------------------------------------|
34
+ | :allow_row_sep_in_quoted_fields | false | Allows :row_sep characters to be present in quoted fields. Otherwise if there are |
35
+ | | | line ending characters in a field, they will be treated as sequential lines and you'll |
36
+ | | | likely receive column count errors (if you're checking for them) |
37
+ |---------------------------------|----------|--------------------------------------------------------------------------------------|
38
+ | :max_chars_in_field | nil | Ensures that fields have less than or equal to the provided number of characters |
39
+ |---------------------------------|----------|--------------------------------------------------------------------------------------|
40
+ | :file_encoding | 'utf-8' | Set the file encoding |
41
+ |---------------------------------|----------|--------------------------------------------------------------------------------------|
42
+ ```
43
+
44
+ If you happen upon this, know that this is in development - though it should be very stable at this point. Soon this will be available on https://rubygems.org, where you'll be able to install with `gem install wtf_csv` or by putting `gem 'wtf_csv'` in your Gemfile. Until then, feel free to install from source and bundle it into a gem yourself - just give credit where it is due.
data/Rakefile ADDED
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ require 'rubygems'
5
+ require 'rake'
6
+
7
+ require 'rspec/core/rake_task'
8
+
9
+ desc "Run RSpec"
10
+ RSpec::Core::RakeTask.new do |t|
11
+ t.verbose = false
12
+ end
13
+
14
+ desc "Run specs for all test cases"
15
+ task :spec_all do
16
+ system "rake spec"
17
+ end
18
+
19
+ task :default => :spec
@@ -0,0 +1,3 @@
module WtfCSV
  # Version string of the gem (wtf_csv.gemspec reads it as the gem version).
  # Frozen so the shared constant cannot be mutated in place.
  VERSION = "1.0.0".freeze
end
@@ -0,0 +1,206 @@
module WtfCSV
  # Scans a delimited text file line by line (the file is never loaded into
  # memory as a whole) and collects formatting problems.
  #
  # @param file [String] path of the file to scan
  # @param options [Hash] overrides for the defaults in +default_options+:
  #   :col_sep, :row_sep (a String, or :auto to let SmarterCSV guess it),
  #   :quote_char, :escape_char, :check_col_count, :col_threshold, :num_cols,
  #   :ignore_string, :allow_row_sep_in_quoted_fields, :max_chars_in_field,
  #   :file_encoding
  # @yield [Integer] when a block is given, the percentage of physical lines
  #   read so far, yielded each time the integer percentage increases
  # @return [Hash] with the keys
  #   :quote_errors    => [[row, column, field_text], ...]
  #   :encoding_errors => [[row, column], ...]
  #   :length_errors   => [[row, column, field_length], ...]
  #   :column_errors   => [[row, actual_columns, expected_columns], ...] when a
  #                       target column count is known, otherwise
  #                       [[column_count, number_of_rows], ...]
  #   Rows and columns are 1-based.
  # @raise [SystemCallError] if the file cannot be opened
  def WtfCSV.scan(file, options = {}, &block)
    default_options = {
      :col_sep => ',',
      :row_sep => $/,
      :quote_char => '"',
      :escape_char => '\\',
      :check_col_count => true,
      :col_threshold => 80,
      :num_cols => 0,
      :ignore_string => nil,
      :allow_row_sep_in_quoted_fields => false,
      :max_chars_in_field => nil,
      :file_encoding => 'utf-8',
    }
    options = default_options.merge(options)

    col_sep         = options[:col_sep]
    quote_char      = options[:quote_char]
    escape_char     = options[:escape_char]
    ignore_string   = options[:ignore_string]
    allow_multiline = options[:allow_row_sep_in_quoted_fields]
    max_chars       = options[:max_chars_in_field]
    check_length    = !max_chars.nil?

    quote_errors    = []
    encoding_errors = []
    length_errors   = []
    column_counts   = {} # column count => 1-based rows having that many columns

    line_number   = 0  # 0-based index of the logical row being scanned
    lines_read    = 0  # physical lines read so far (drives progress only)
    col_number    = 0
    percent_done  = 0
    field_length  = 0
    previous_line = "" # text of the current field carried over from earlier lines
    last_line_ended_quoted = false

    # Parser state. Declared outside the loop because it must survive from one
    # physical line to the next while a quoted field spans several lines.
    is_quoted       = false
    new_col         = true
    quote_has_ended = false
    quote_error     = false
    escape_pending  = false # previous character was the escape character

    f = File.open(file, "r:#{options[:file_encoding]}")
    begin
      if options[:row_sep] == :auto
        options[:row_sep] = SmarterCSV.guess_line_ending(f, options)
        f.rewind
      end
      row_sep = options[:row_sep]

      # Count the lines in Ruby, using the effective row separator, instead of
      # shelling out to `wc -l` (not portable, unsafe with odd file names, and
      # always 0 for files that do not use "\n").
      total_lines = 0
      if block_given?
        total_lines = f.each_line(row_sep).count
        f.rewind
      end

      # The separator is passed explicitly so the global $/ is left untouched.
      until f.eof?
        line = f.readline(row_sep)

        # Progress is reported outside the per-line rescue so that an error
        # raised by the caller's block is not silently discarded.
        if block_given? && total_lines > 0
          percent = ((lines_read.to_f / total_lines) * 100).to_i
          if percent > percent_done
            percent_done = percent
            yield percent_done
          end
        end
        lines_read += 1

        begin
          line.chomp!(row_sep)

          next if !ignore_string.nil? && line == ignore_string

          if allow_multiline && last_line_ended_quoted
            # Continuation of the previous row: undo the row increment done for
            # the previous physical line and keep the parser state.
            line_number -= 1
            last_line_ended_quoted = false
            field_length += row_sep.length if check_length
          else
            is_quoted       = false
            new_col         = true
            quote_has_ended = false
            quote_error     = false
            escape_pending  = false
            col_number      = 0
            previous_line   = "" # never leak a finished row's text into this one
          end
          pos_start = 0

          line.each_char.with_index do |char, position|
            # A character with invalid bytes is reported and otherwise ignored.
            unless char.valid_encoding?
              encoding_errors.push([line_number + 1, col_number + 1])
              next
            end

            field_length += 1 if check_length

            # When the quote character doubles as the escape character, a lone
            # quote can only be classified once the following character is seen.
            if escape_pending && escape_char == quote_char && char != quote_char
              escape_pending = false
              is_quoted = !is_quoted
              if !is_quoted
                quote_has_ended = true
              elsif !new_col
                quote_error = true
                is_quoted = false
              end
            end

            # NOTE(review): escape_pending is only cleared by a following quote
            # character, so an escape character followed by ordinary text stays
            # pending until the next quote - confirm whether that is intended.
            if char != quote_char && char != col_sep && char != escape_char
              new_col = false
              quote_error = true if quote_has_ended # text after a closing quote
            elsif char == quote_char && escape_pending
              escape_pending = false
            elsif char == escape_char
              escape_pending = true
            elsif char == quote_char && is_quoted
              quote_has_ended = true
              is_quoted = false
            elsif char == quote_char
              if new_col
                is_quoted = true
                new_col = false
              else
                quote_error = true # quote opened in the middle of a field
              end
            elsif char == col_sep && !is_quoted
              if quote_error
                # Exclusive range: an empty slice when the separator is the
                # first character of the line (an inclusive 0..-1 would return
                # the whole line).
                quote_errors.push([line_number + 1, col_number + 1, "#{previous_line}#{line[pos_start...position]}"])
                quote_error = false
              end
              if check_length
                # field_length already counted the separator itself
                length_errors.push([line_number + 1, col_number + 1, field_length - 1]) if (field_length - 1) > max_chars
                field_length = 0
              end
              new_col = true
              quote_has_ended = false
              previous_line = ""
              pos_start = position + 1
              col_number += 1
            end
          end

          # Resolve a quote that was still pending when the line ended.
          if escape_pending && escape_char == quote_char
            if !new_col && !is_quoted
              quote_error = true
            else
              is_quoted = !is_quoted
            end
            # The pending quote is consumed here; leaving the flag set would
            # toggle the quoting state a second time on a continuation line.
            escape_pending = false
          end

          if is_quoted
            if allow_multiline
              last_line_ended_quoted = true
              previous_line = "#{previous_line}#{line[pos_start...line.length]}#{row_sep}"
              next
            else
              quote_error = true
            end
          end

          # Include the carried-over text so a multi-line field is reported in
          # full, as it is when the field ends at a column separator.
          quote_errors.push([line_number + 1, col_number + 1, "#{previous_line}#{line[pos_start..-1]}"]) if quote_error

          if check_length
            length_errors.push([line_number + 1, col_number + 1, field_length]) if field_length > max_chars
            field_length = 0
          end

          if options[:check_col_count]
            # Rows are always stored 1-based (later rows used to be stored
            # 0-based, producing off-by-one row numbers in :column_errors).
            (column_counts[col_number + 1] ||= []).push(line_number + 1)
          end
        rescue StandardError
          # Best effort: a line that cannot be analysed is skipped instead of
          # aborting the scan. Interrupts and exit requests are not swallowed.
        ensure
          line_number += 1
        end
      end
    ensure
      f.close
    end

    column_errors = options[:check_col_count] ? column_count_errors(column_counts, line_number, options) : []

    {quote_errors: quote_errors,
     encoding_errors: encoding_errors,
     column_errors: column_errors,
     length_errors: length_errors}
  end

  # Turns the per-column-count row lists gathered by scan into :column_errors.
  # total_rows is the number of rows counted by scan (ignored lines included).
  def WtfCSV.column_count_errors(column_counts, total_rows, options)
    ranked = column_counts.sort_by { |_count, rows| rows.length }.reverse

    if options[:num_cols] != 0
      # an absolute number of columns was requested
      expected = options[:num_cols]
      ranked.reject { |count, _rows| count == expected }
            .flat_map { |count, rows| rows.map { |row| [row, count, expected] } }
    elsif ranked.length > 1
      # otherwise the most common count is the target if enough rows share it
      target, target_rows = ranked.first
      if target_rows.length >= total_rows * (options[:col_threshold].to_f / 100)
        ranked.drop(1).flat_map { |count, rows| rows.map { |row| [row, count, target] } }
      else
        ranked.map { |count, rows| [count, rows.length] }
      end
    else
      []
    end
  end
  private_class_method :column_count_errors
end
data/lib/wtf_csv.rb ADDED
@@ -0,0 +1,3 @@
1
+ require "wtf_csv/wtf_csv.rb"
2
+ require "wtf_csv/version.rb"
3
+ require 'smarter_csv'
@@ -0,0 +1,5 @@
1
+ "animal","sound",color,size
2
+ "cat",meow,"calico",small,extra
3
+ dog,"bark","golden",medium
4
+ ,,,
5
+ horse,whinny,"brown"
@@ -0,0 +1,5 @@
1
+ "animal","sound",color,size
2
+ "cat",meow,"calico",small,extra
3
+ dog,"bark","golden",medium
4
+ ,,,
5
+ horse,whinny,"brown"
@@ -0,0 +1,4 @@
1
+ "animal","sound",co�lor,"\"size
2
+ cat",meow,"calico",small_and_this_is_a_long_field_with_44_chars
3
+ dog,"field_with_26_charact�ers"�,"golden,medium
4
+ horse,whinny\"","brown","large"
@@ -0,0 +1,4 @@
1
+ "="animal="","sound",c="olor,size
2
+ "cat",="meow=","="calico="",small
3
+ dog,"bark","="golden",medium="
4
+ horse,="="whinny="=","brown","large"
@@ -0,0 +1,4 @@
1
+ ""animal"","""sound""","color","size"
2
+ """cat"",""""","meow","calico","""small""
3
+ "dog","""bark""","""""golden""",""medium""
4
+ "horse"," whinny ","brown",",,large"""
@@ -0,0 +1,6 @@
1
+ "animal","sound",color,size
2
+ "cat",meow,"calico",small
3
+ dog,"bark","golden",medium
4
+ ignore_this_line
5
+ do not ignore this one
6
+ ,,,
@@ -0,0 +1,4 @@
1
+ "animal","sound",color,"\"size
2
+ cat",meow,"calico",small_and_this_is_a_long_field_with_44_chars
3
+ dog,"field_with_26_characters","golden,medium
4
+ horse,whinny\"","brown","large"
@@ -0,0 +1,4 @@
1
+ "animal","sound",color,"\"size
2
+ cat",meow,"calico",small_and_this_is_a_long_field_with_44_chars
3
+ dog,"field_with_26_characters","golden,medium
4
+ horse,whinny\"","brown","large"
@@ -0,0 +1,4 @@
1
+ "animal"|"sound"|color|size
2
+ "cat"|meow|"calico"|small
3
+ dog|"bark"|"golden"|medium
4
+ horse|whinny|"brown"|"large"
@@ -0,0 +1,3 @@
1
+ +animal+,"sound",+color+,size
2
+ ++cat++,meow,+calico+,+small+
3
+ +dog+,woof,"golden",+medium+a
@@ -0,0 +1,4 @@
1
+ "animal","sound",color,size
2
+ "cat",meow,"calico",small
3
+ dog,"bark","golden",medium
4
+ horse,whinny,"brown","large"
@@ -0,0 +1 @@
1
+ "animal","sound",color,size
@@ -0,0 +1,4 @@
1
+ "animal","sound",color,size
2
+ "cat",meow,"calico",small
3
+ dog,"bark","golden",medium
4
+ horse,whinny,"brown","large"
@@ -0,0 +1,4 @@
1
+ "animal","sound",color,"\"size
2
+ cat",meow,"calico",small
3
+ dog,"bark","golden,medium
4
+ horse,whinny","brown","large"
@@ -0,0 +1,12 @@
require 'rubygems'
require 'bundler/setup'

# Load the gems of the Gemfile's default group before the library under test.
Bundler.require(:default)

require 'wtf_csv'

# The examples use the `should` expectation syntax, so only that syntax is enabled.
RSpec.configure do |config|
  config.expect_with(:rspec) { |expectations| expectations.syntax = :should }
end
@@ -0,0 +1,248 @@
require 'spec_helper'

fixture_path = 'spec/fixtures'
# writing some files with special characters:
# File.write("#{fixture_path}/quoted_fields_r.csv", "\"animal\",\"sound\",color,size\r\"cat\",meow,\"calico\",small\rdog,\"bark\",\"golden\",medium\rhorse,whinny,\"brown\",\"large\"")
# File.write("#{fixture_path}/max_chars_in_field_rn.csv", "\"animal\",\"sound\",color,\"\\\"size\r\ncat\",meow,\"calico\",small_and_this_is_a_long_field_with_44_chars\r\ndog,\"field_with_26_characters\",\"golden,medium\r\nhorse,whinny\\\"\",\"brown\",\"large\"")
# File.write("#{fixture_path}/encoding.csv", "\"animal\",\"sound\",co\255lor,\"\\\"size\ncat\",meow,\"calico\",small_and_this_is_a_long_field_with_44_chars\ndog,\"field_with_26_charact\255ers\"\255,\"golden,medium\nhorse,whinny\\\"\",\"brown\",\"large\"")

# Each example scans one fixture and pins the complete content of all four
# error lists. `match_array` checks size and members at once (duplicates
# included), so no separate length assertion or guard is needed.

describe 'a file with \n line endings' do
  it 'should have no errors' do
    output = WtfCSV.scan("#{fixture_path}/quoted_fields_n.csv", :row_sep => "\n")
    output[:quote_errors].should be_empty
    output[:column_errors].should be_empty
    output[:length_errors].should be_empty
    output[:encoding_errors].should be_empty
  end
end

describe 'a file with \r line endings' do
  it 'should have no errors' do
    output = WtfCSV.scan("#{fixture_path}/quoted_fields_r.csv", :row_sep => "\r")
    output[:quote_errors].should be_empty
    output[:column_errors].should be_empty
    output[:length_errors].should be_empty
    output[:encoding_errors].should be_empty
  end
end

describe 'a file with \r\n line endings' do
  it 'should have no errors' do
    output = WtfCSV.scan("#{fixture_path}/quoted_fields_rn.csv", :row_sep => "\r\n")
    output[:quote_errors].should be_empty
    output[:column_errors].should be_empty
    output[:length_errors].should be_empty
    output[:encoding_errors].should be_empty
  end
end

describe 'a file with pipe delimiters' do
  it 'should have no errors' do
    output = WtfCSV.scan("#{fixture_path}/pipe_delimited.csv", :col_sep => '|')
    output[:quote_errors].should be_empty
    output[:column_errors].should be_empty
    output[:length_errors].should be_empty
    output[:encoding_errors].should be_empty
  end
end

describe 'a file with custom quote characters' do
  it 'should behave correctly' do
    output = WtfCSV.scan("#{fixture_path}/quote_char.csv", :quote_char => '+')
    output[:quote_errors].should match_array [[2,1,"++cat++"], [3,4,"+medium+a"]]
    output[:column_errors].should be_empty
    output[:length_errors].should be_empty
    output[:encoding_errors].should be_empty
  end
end

describe 'a file with custom escape character' do
  it 'should behave correctly' do
    output = WtfCSV.scan("#{fixture_path}/escape_char.csv", :escape_char => '=')
    output[:quote_errors].should be_empty
    output[:column_errors].should be_empty
    output[:length_errors].should be_empty
    output[:encoding_errors].should be_empty
  end
end

describe 'a file with the same escape and quote characters' do
  it 'should behave correctly' do
    options = {:escape_char => '"', :quote_char => '"'}
    output = WtfCSV.scan("#{fixture_path}/escape_char_equals_quote_char.csv", options)
    output[:quote_errors].should match_array [[2,4,'"""small""']]
    output[:column_errors].should be_empty
    output[:length_errors].should be_empty
    output[:encoding_errors].should be_empty
  end
end

describe 'checking a file for a fixed number of columns in each row' do
  it 'should catch errors' do
    output = WtfCSV.scan("#{fixture_path}/check_column_count_fixed.csv", :num_cols => 4)
    output[:quote_errors].should be_empty
    output[:column_errors].should match_array [[2,5,4], [5,3,4]]
    output[:length_errors].should be_empty
    output[:encoding_errors].should be_empty
  end
end

describe 'checking a file for column counts and determining the number of columns with a threshold' do
  it 'should only return the counts of how many rows have different numbers of columns if under the threshold' do
    output = WtfCSV.scan("#{fixture_path}/check_column_count_smart_threshold.csv", :col_threshold => 70)
    output[:quote_errors].should be_empty
    output[:column_errors].should match_array [[4,3], [5,1], [3,1]]
    output[:length_errors].should be_empty
    output[:encoding_errors].should be_empty
  end

  it 'should determine the target number of columns using a threshold' do
    output = WtfCSV.scan("#{fixture_path}/check_column_count_smart_threshold.csv", :col_threshold => 60)
    output[:quote_errors].should be_empty
    output[:column_errors].should match_array [[2,5,4], [5,3,4]]
    output[:length_errors].should be_empty
    output[:encoding_errors].should be_empty
  end
end

describe 'using the :ignore_string option' do
  it 'should skip the line that matches this string' do
    options = {:ignore_string => 'ignore_this_line',
               :col_threshold => 60}
    output = WtfCSV.scan("#{fixture_path}/ignore_string.csv", options)
    output[:quote_errors].should be_empty
    output[:column_errors].should match_array [[5,1,4]]
    output[:length_errors].should be_empty
    output[:encoding_errors].should be_empty
  end
end

describe 'a file with quoted newlines' do
  it 'should provide accurate errors if :allow_row_sep_in_quoted_fields is not set to true' do
    output = WtfCSV.scan("#{fixture_path}/quoted_newlines.csv", :col_threshold => 70)
    output[:quote_errors].should match_array [[1,4,'"\"size'],
                                              [2,1,'cat"'],
                                              [3,3,'"golden,medium'],
                                              [4,2,'whinny"']]
    output[:column_errors].should match_array [[3,3,4]]
    output[:length_errors].should be_empty
    output[:encoding_errors].should be_empty
  end

  it 'should allow the quote newlines if :allow_row_sep_in_quoted_fields is set to true' do
    output = WtfCSV.scan("#{fixture_path}/quoted_newlines.csv", :allow_row_sep_in_quoted_fields => true)
    output[:quote_errors].should be_empty
    output[:column_errors].should match_array [[7,1], [5,1]]
    output[:length_errors].should be_empty
    output[:encoding_errors].should be_empty
  end
end

describe 'a file that has greater than the number of characters in :max_chars_in_field in a single field' do
  it 'should give appropriate errors, and should count a \n in the field as a single character' do
    options = {:max_chars_in_field => 15,
               :allow_row_sep_in_quoted_fields => true,
               :check_col_count => false}
    output = WtfCSV.scan("#{fixture_path}/max_chars_in_field_n.csv", options)
    output[:quote_errors].should be_empty
    output[:column_errors].should be_empty
    output[:length_errors].should match_array [[1,7,44], [2,2,26], [2,3,30]]
    output[:encoding_errors].should be_empty
  end

  it 'should give appropriate errors, and should count a \r\n in the field as two characters' do
    options = {:max_chars_in_field => 15,
               :allow_row_sep_in_quoted_fields => true,
               :row_sep => "\r\n",
               :check_col_count => false}
    output = WtfCSV.scan("#{fixture_path}/max_chars_in_field_rn.csv", options)
    output[:quote_errors].should be_empty
    output[:column_errors].should be_empty
    output[:length_errors].should match_array [[1,7,44], [2,2,26], [2,3,31]]
    output[:encoding_errors].should be_empty
  end
end

describe 'a file that has encoding errors' do
  it 'should give appropriate errors' do
    options = {:allow_row_sep_in_quoted_fields => true,
               :check_col_count => false}
    output = WtfCSV.scan("#{fixture_path}/encoding.csv", options)
    output[:quote_errors].should be_empty
    output[:column_errors].should be_empty
    output[:length_errors].should be_empty
    # row 2, column 2 contains two invalid bytes, so [2,2] must appear twice
    output[:encoding_errors].should match_array [[1,3], [2,2], [2,2]]
  end
end
data/wtf_csv.gemspec ADDED
@@ -0,0 +1,21 @@
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/wtf_csv/version', __FILE__)

# Gem specification for wtf_csv. The version comes from WtfCSV::VERSION and the
# packaged file list from the files tracked by git.
Gem::Specification.new do |spec|
  # identity
  spec.name     = 'wtf_csv'
  spec.version  = WtfCSV::VERSION
  spec.date     = '2015-09-11'
  spec.licenses = ['MIT']
  spec.homepage = 'https://github.com/gremerritt/wtf_csv'

  # people
  spec.authors = ['Greg Merritt']
  spec.email   = ['greg@evertrue.com']

  # descriptions
  spec.summary     = 'Ruby gem to detect formatting issues in a CSV'
  spec.description = 'Ruby gem to detect formatting issues in a CSV. Can find quoting issues, incorrect column counts, and can properly handle quote-escaped line endings.'

  # contents
  spec.files         = `git ls-files`.split($\)
  spec.require_paths = ['lib']

  # dependencies
  spec.add_runtime_dependency('smarter_csv')
  spec.add_development_dependency('rspec')

  spec.requirements << 'smarter_csv'
end
metadata ADDED
@@ -0,0 +1,100 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wtf_csv
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Greg Merritt
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-09-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: smarter_csv
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: Ruby gem to detect formatting issues in a CSV. Can find quoting issues,
42
+ incorrect column counts, and can properly handle quote-escaped line endings.
43
+ email:
44
+ - greg@evertrue.com
45
+ executables: []
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - ".gitignore"
50
+ - ".rspec"
51
+ - Gemfile
52
+ - Gemfile.lock
53
+ - README.md
54
+ - Rakefile
55
+ - lib/wtf_csv.rb
56
+ - lib/wtf_csv/version.rb
57
+ - lib/wtf_csv/wtf_csv.rb
58
+ - spec/fixtures/check_column_count_fixed.csv
59
+ - spec/fixtures/check_column_count_smart_threshold.csv
60
+ - spec/fixtures/encoding.csv
61
+ - spec/fixtures/escape_char.csv
62
+ - spec/fixtures/escape_char_equals_quote_char.csv
63
+ - spec/fixtures/ignore_string.csv
64
+ - spec/fixtures/max_chars_in_field_n.csv
65
+ - spec/fixtures/max_chars_in_field_rn.csv
66
+ - spec/fixtures/pipe_delimited.csv
67
+ - spec/fixtures/quote_char.csv
68
+ - spec/fixtures/quoted_fields_n.csv
69
+ - spec/fixtures/quoted_fields_r.csv
70
+ - spec/fixtures/quoted_fields_rn.csv
71
+ - spec/fixtures/quoted_newlines.csv
72
+ - spec/spec_helper.rb
73
+ - spec/wtf_csv/test_spec.rb
74
+ - wtf_csv.gemspec
75
+ homepage: https://github.com/gremerritt/wtf_csv
76
+ licenses:
77
+ - MIT
78
+ metadata: {}
79
+ post_install_message:
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ requirements:
94
+ - smarter_csv
95
+ rubyforge_project:
96
+ rubygems_version: 2.0.3
97
+ signing_key:
98
+ specification_version: 4
99
+ summary: Ruby gem to detect formatting issues in a CSV
100
+ test_files: []