csv_sniffer 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/LICENSE +21 -21
- data/README.md +67 -67
- data/Rakefile +8 -8
- data/csv_sniffer.gemspec +15 -15
- data/lib/csv_sniffer.rb +245 -233
- data/test/test_csv_sniffer.rb +86 -78
- metadata +8 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 94eaa256ae4c389193159b7e22d4620a7fcd26a0
|
4
|
+
data.tar.gz: fb03dbd72b05013f2615f80053c577148959f83a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ce26eb29a3e246b597932fe3c66fca84fa5763a261a6c3e87e6af4565ced8b8178a439fe8b006b9435620caa72f992bd122423977097f9efea396569abffaa6
|
7
|
+
data.tar.gz: 63d1385bd21fc1362f6c4081c85857f22e1b397b33df92665c34f0c7c229f57b31e58d0dd4c4a1e93bdffe85c3ebe0b5dee3144620e185d4da88b759306d27b5
|
data/.gitignore
ADDED
data/LICENSE
CHANGED
@@ -1,21 +1,21 @@
|
|
1
|
-
The MIT License (MIT)
|
2
|
-
|
3
|
-
Copyright (c) 2015 Tim Ojo
|
4
|
-
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
7
|
-
in the Software without restriction, including without limitation the rights
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
10
|
-
furnished to do so, subject to the following conditions:
|
11
|
-
|
12
|
-
The above copyright notice and this permission notice shall be included in
|
13
|
-
all copies or substantial portions of the Software.
|
14
|
-
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
-
THE SOFTWARE.
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 Tim Ojo
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
CHANGED
@@ -1,67 +1,67 @@
|
|
1
|
-
# CSV Sniffer
|
2
|
-
|
3
|
-
CSV Sniffer is a set of functions that allow a user heuristically detect the delimiter character in use, whether the values in the CSV file are quote enclosed, whether the file contains a header, and more. The library is intended to detect information to be used as configuration inputs for CSV parsers. For delimiter detection the following delimiters are currently supported `[",", "\t", "|", ";"]`
|
4
|
-
|
5
|
-
To ensure high performance and a low memory footprint, the library uses as little information as needed to make accurate decisions. Contributors are welcome to
|
6
|
-
improve the algorithms in use.
|
7
|
-
|
8
|
-
|
9
|
-
## Installation
|
10
|
-
|
11
|
-
```
|
12
|
-
$ gem install csv_sniffer
|
13
|
-
```
|
14
|
-
|
15
|
-
## Usage
|
16
|
-
|
17
|
-
Given a `some_file.csv` file:
|
18
|
-
|
19
|
-
```csv
|
20
|
-
Name;Phone
|
21
|
-
John Doe ;555-481-2345
|
22
|
-
Jane C. Doe;555-123-4567
|
23
|
-
```
|
24
|
-
|
25
|
-
Detection usage is as follows:
|
26
|
-
|
27
|
-
```rb
|
28
|
-
require "csv_sniffer"
|
29
|
-
|
30
|
-
delim = CsvSniffer.detect_delimiter("/path/to/some_file.csv") #=> ";"
|
31
|
-
is_quote_enclosed = CsvSniffer.is_quote_enclosed?("/path/to/some_file.csv") #=> false
|
32
|
-
has_header = CsvSniffer.has_header?("/path/to/some_file.csv") #=> true
|
33
|
-
```
|
34
|
-
|
35
|
-
See [`test_csv_sniffer.rb`](test/test_csv_sniffer.rb) for more examples.
|
36
|
-
|
37
|
-
|
38
|
-
## Tests
|
39
|
-
|
40
|
-
```
|
41
|
-
$ rake test
|
42
|
-
```
|
43
|
-
|
44
|
-
|
45
|
-
## License
|
46
|
-
|
47
|
-
The MIT License (MIT)
|
48
|
-
|
49
|
-
Copyright © 2015 Tim Ojo
|
50
|
-
|
51
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
52
|
-
of this software and associated documentation files (the "Software"), to deal
|
53
|
-
in the Software without restriction, including without limitation the rights
|
54
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
55
|
-
copies of the Software, and to permit persons to whom the Software is
|
56
|
-
furnished to do so, subject to the following conditions:
|
57
|
-
|
58
|
-
The above copyright notice and this permission notice shall be included in
|
59
|
-
all copies or substantial portions of the Software.
|
60
|
-
|
61
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
62
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
63
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
64
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
65
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
66
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
67
|
-
THE SOFTWARE.
|
1
|
+
# CSV Sniffer
|
2
|
+
|
3
|
+
CSV Sniffer is a set of functions that allow a user heuristically detect the delimiter character in use, whether the values in the CSV file are quote enclosed, whether the file contains a header, and more. The library is intended to detect information to be used as configuration inputs for CSV parsers. For delimiter detection the following delimiters are currently supported `[",", "\t", "|", ";"]`
|
4
|
+
|
5
|
+
To ensure high performance and a low memory footprint, the library uses as little information as needed to make accurate decisions. Contributors are welcome to
|
6
|
+
improve the algorithms in use.
|
7
|
+
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
```
|
12
|
+
$ gem install csv_sniffer
|
13
|
+
```
|
14
|
+
|
15
|
+
## Usage
|
16
|
+
|
17
|
+
Given a `some_file.csv` file:
|
18
|
+
|
19
|
+
```csv
|
20
|
+
Name;Phone
|
21
|
+
John Doe ;555-481-2345
|
22
|
+
Jane C. Doe;555-123-4567
|
23
|
+
```
|
24
|
+
|
25
|
+
Detection usage is as follows:
|
26
|
+
|
27
|
+
```rb
|
28
|
+
require "csv_sniffer"
|
29
|
+
|
30
|
+
delim = CsvSniffer.detect_delimiter("/path/to/some_file.csv") #=> ";"
|
31
|
+
is_quote_enclosed = CsvSniffer.is_quote_enclosed?("/path/to/some_file.csv") #=> false
|
32
|
+
has_header = CsvSniffer.has_header?("/path/to/some_file.csv") #=> true
|
33
|
+
```
|
34
|
+
|
35
|
+
See [`test_csv_sniffer.rb`](test/test_csv_sniffer.rb) for more examples.
|
36
|
+
|
37
|
+
|
38
|
+
## Tests
|
39
|
+
|
40
|
+
```
|
41
|
+
$ rake test
|
42
|
+
```
|
43
|
+
|
44
|
+
|
45
|
+
## License
|
46
|
+
|
47
|
+
The MIT License (MIT)
|
48
|
+
|
49
|
+
Copyright © 2015 Tim Ojo
|
50
|
+
|
51
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
52
|
+
of this software and associated documentation files (the "Software"), to deal
|
53
|
+
in the Software without restriction, including without limitation the rights
|
54
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
55
|
+
copies of the Software, and to permit persons to whom the Software is
|
56
|
+
furnished to do so, subject to the following conditions:
|
57
|
+
|
58
|
+
The above copyright notice and this permission notice shall be included in
|
59
|
+
all copies or substantial portions of the Software.
|
60
|
+
|
61
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
62
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
63
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
64
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
65
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
66
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
67
|
+
THE SOFTWARE.
|
data/Rakefile
CHANGED
@@ -1,8 +1,8 @@
|
|
1
|
-
require 'rake/testtask'
|
2
|
-
|
3
|
-
Rake::TestTask.new do |t|
|
4
|
-
t.libs << 'test'
|
5
|
-
end
|
6
|
-
|
7
|
-
desc "Run tests"
|
8
|
-
task :default => :test
|
1
|
+
require 'rake/testtask'
|
2
|
+
|
3
|
+
Rake::TestTask.new do |t|
|
4
|
+
t.libs << 'test'
|
5
|
+
end
|
6
|
+
|
7
|
+
desc "Run tests"
|
8
|
+
task :default => :test
|
data/csv_sniffer.gemspec
CHANGED
@@ -1,15 +1,15 @@
|
|
1
|
-
Gem::Specification.new do |s|
|
2
|
-
s.name = 'csv_sniffer'
|
3
|
-
s.version = '0.1.
|
4
|
-
s.date = '2015-
|
5
|
-
s.summary = "CSV library for heuristic detection of CSV properties"
|
6
|
-
s.description = "CSV Sniffer is a set of functions that allow a user detect the delimiter character in use, whether the values in the CSV file are quote enclosed, whether the file contains a header, and more. The library is intended to detect information to be used as configuration inputs for CSV parsers."
|
7
|
-
s.authors = ["Tim Ojo"]
|
8
|
-
s.email = 'ojo.tim@gmail.com'
|
9
|
-
s.homepage = 'https://github.com/tim-ojo/csv_sniffer'
|
10
|
-
s.license = 'MIT'
|
11
|
-
|
12
|
-
s.files = `git ls-files`.split($/)
|
13
|
-
s.test_files = s.files.grep(/^test/)
|
14
|
-
s.add_development_dependency
|
15
|
-
end
|
1
|
+
Gem::Specification.new do |s|
|
2
|
+
s.name = 'csv_sniffer'
|
3
|
+
s.version = '0.1.2'
|
4
|
+
s.date = '2015-12-28'
|
5
|
+
s.summary = "CSV library for heuristic detection of CSV properties"
|
6
|
+
s.description = "CSV Sniffer is a set of functions that allow a user detect the delimiter character in use, whether the values in the CSV file are quote enclosed, whether the file contains a header, and more. The library is intended to detect information to be used as configuration inputs for CSV parsers."
|
7
|
+
s.authors = ["Tim Ojo"]
|
8
|
+
s.email = 'ojo.tim@gmail.com'
|
9
|
+
s.homepage = 'https://github.com/tim-ojo/csv_sniffer'
|
10
|
+
s.license = 'MIT'
|
11
|
+
|
12
|
+
s.files = `git ls-files`.split($/)
|
13
|
+
s.test_files = s.files.grep(/^test/)
|
14
|
+
s.add_development_dependency 'test-unit', '~> 0'
|
15
|
+
end
|
data/lib/csv_sniffer.rb
CHANGED
@@ -1,233 +1,245 @@
|
|
1
|
-
# This class contains functions to heuristically decipher certain information from a CSV file
|
2
|
-
class CsvSniffer
|
3
|
-
|
4
|
-
# Reads the first line of the csv and returns true if the line starts and ends with " or '
|
5
|
-
#
|
6
|
-
# Example:
|
7
|
-
# CsvSniffer.is_quote_enclosed?("path/to/file")
|
8
|
-
# => true
|
9
|
-
#
|
10
|
-
# Arguments:
|
11
|
-
# filepath: (String)
|
12
|
-
|
13
|
-
def self.is_quote_enclosed?(filepath)
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
#
|
25
|
-
#
|
26
|
-
#
|
27
|
-
#
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
if
|
70
|
-
return
|
71
|
-
end
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
row
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
1
|
+
# This class contains functions to heuristically decipher certain information from a CSV file
|
2
|
+
class CsvSniffer
|
3
|
+
|
4
|
+
# Reads the first line of the csv and returns true if the line starts and ends with " or '
|
5
|
+
#
|
6
|
+
# Example:
|
7
|
+
# CsvSniffer.is_quote_enclosed?("path/to/file")
|
8
|
+
# => true
|
9
|
+
#
|
10
|
+
# Arguments:
|
11
|
+
# filepath: (String)
|
12
|
+
|
13
|
+
def self.is_quote_enclosed?(filepath)
|
14
|
+
begin
|
15
|
+
line = File.open(filepath, &:readline)
|
16
|
+
line.chomp!.strip!
|
17
|
+
return line.start_with?('"') && line.end_with?('"') || line.start_with?("'") && line.end_with?("'")
|
18
|
+
rescue EOFError
|
19
|
+
false
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
|
24
|
+
# Gets the quote character in use in the file if one exists. Returns "'", """ or nil
|
25
|
+
#
|
26
|
+
# Example:
|
27
|
+
# CsvSniffer.get_quote_char("path/to/file")
|
28
|
+
# => "
|
29
|
+
#
|
30
|
+
# Arguments:
|
31
|
+
# filepath: (String)
|
32
|
+
|
33
|
+
def self.get_quote_char(filepath)
|
34
|
+
begin
|
35
|
+
if is_quote_enclosed?(filepath)
|
36
|
+
line = File.open(filepath, &:readline)
|
37
|
+
line.chomp!.strip!
|
38
|
+
return line[0]
|
39
|
+
else
|
40
|
+
return nil
|
41
|
+
end
|
42
|
+
rescue EOFError
|
43
|
+
nil
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
|
48
|
+
# Heuristically detects the delimiter used in the CSV file and returns it
|
49
|
+
#
|
50
|
+
# Example:
|
51
|
+
# CsvSniffer.detect_delimiter("path/to/file")
|
52
|
+
# => "|"
|
53
|
+
#
|
54
|
+
# Arguments:
|
55
|
+
# filepath: (String)
|
56
|
+
|
57
|
+
def self.detect_delimiter (filepath)
|
58
|
+
# If the csv is quote enclosed then just get the delimiter after the first cell. Otherwise...
|
59
|
+
# Get the first line and count how many of the possible delimiters are present. If there is >1 of one of the
|
60
|
+
# delimiters and 0 of the others then, then we pick the max. If there are more than 0 of any of the others then
|
61
|
+
# we repeat the counting procedure for the next 50 lines until the condition is satisfied.
|
62
|
+
# If the condition is never satisfied then we simply pick the delimiter that occurs the most frequently, defaulting
|
63
|
+
# to the comma. Unless that delimeter's count is equal to the tab or pipe delimiter's count. In that case we return \t or |
|
64
|
+
|
65
|
+
if is_quote_enclosed?(filepath)
|
66
|
+
line = File.open(filepath, &:readline)
|
67
|
+
line.chomp!.strip!
|
68
|
+
m = /["'].+?["']([,|;\t])/.match(line)
|
69
|
+
if (m)
|
70
|
+
return m[1]
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
lineCount = 0
|
75
|
+
File.foreach(filepath) do |line|
|
76
|
+
detectedDelim = max_delim_when_others_are_zero(line)
|
77
|
+
if detectedDelim != '0' #=> '0' is a sentinel value that indicates no delim found
|
78
|
+
return detectedDelim
|
79
|
+
end
|
80
|
+
|
81
|
+
lineCount += 1;
|
82
|
+
break if lineCount == 50
|
83
|
+
end
|
84
|
+
|
85
|
+
# If I got here I'm going to pick the default by counting the delimiters on the first line and returning the max
|
86
|
+
begin
|
87
|
+
line = File.open(filepath, &:readline)
|
88
|
+
freqOfPossibleDelims = get_freq_of_possible_delims(line)
|
89
|
+
rescue EOFError
|
90
|
+
freqOfPossibleDelims = [0,-1,-1,-1]
|
91
|
+
end
|
92
|
+
|
93
|
+
maxFreq = 0
|
94
|
+
maxFreqIndex = 0
|
95
|
+
freqOfPossibleDelims.each_with_index do |delimFreq, i|
|
96
|
+
if (delimFreq > maxFreq)
|
97
|
+
maxFreq = delimFreq
|
98
|
+
maxFreqIndex = i
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
# Favor "\t" and "|" over ","
|
103
|
+
if (maxFreq == freqOfPossibleDelims[1])
|
104
|
+
return '\t'
|
105
|
+
elsif (maxFreq == freqOfPossibleDelims[3])
|
106
|
+
return "|"
|
107
|
+
else
|
108
|
+
return [",", '\t', ";", "|"][maxFreqIndex]
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
# Heuristically detects whether or not the csv file uses the first line as a header
|
113
|
+
#
|
114
|
+
# Example:
|
115
|
+
# CsvSniffer.has_header?("path/to/file")
|
116
|
+
# => false
|
117
|
+
#
|
118
|
+
# Arguments:
|
119
|
+
# filepath: (String)
|
120
|
+
|
121
|
+
def self.has_header?(filepath)
|
122
|
+
# Creates a dictionary of types of data in each column. If any
|
123
|
+
# column is of a single type (say, integers), *except* for the first
|
124
|
+
# row, then the first row is presumed to be labels. If the type
|
125
|
+
# can't be determined, it is assumed to be a string in which case
|
126
|
+
# the length of the string is the determining factor: if all of the
|
127
|
+
# rows except for the first are the same length, it's a header.
|
128
|
+
# Finally, a 'vote' is taken at the end for each column, adding or
|
129
|
+
# subtracting from the likelihood of the first row being a header.
|
130
|
+
delim = detect_delimiter(filepath)
|
131
|
+
if (delim == "\\t")
|
132
|
+
delim = "\t"
|
133
|
+
end
|
134
|
+
|
135
|
+
headerRow = nil
|
136
|
+
lineCount = 0
|
137
|
+
columnTypes = Hash.new
|
138
|
+
File.foreach(filepath) do |line|
|
139
|
+
if (!headerRow) # assume the first row is a header
|
140
|
+
headerRow = line.split(delim)
|
141
|
+
|
142
|
+
headerRow.each_index do |colIndex|
|
143
|
+
columnTypes[colIndex] = nil
|
144
|
+
end
|
145
|
+
next
|
146
|
+
end
|
147
|
+
|
148
|
+
lineCount += 1
|
149
|
+
break if lineCount == 50
|
150
|
+
|
151
|
+
row = line.split(delim)
|
152
|
+
columnTypes.each_key do |colIndex|
|
153
|
+
thisColType = nil
|
154
|
+
if (row[colIndex].strip.to_i.to_s == row[colIndex])
|
155
|
+
thisColType = Integer
|
156
|
+
elsif (row[colIndex].strip.to_f.to_s == row[colIndex])
|
157
|
+
thisColType = Float
|
158
|
+
else
|
159
|
+
# fallback to the length of the string
|
160
|
+
thisColType = row[colIndex].strip.length
|
161
|
+
end
|
162
|
+
|
163
|
+
if (thisColType != columnTypes[colIndex])
|
164
|
+
if (columnTypes[colIndex] == nil)
|
165
|
+
# add new column type
|
166
|
+
columnTypes[colIndex] = thisColType
|
167
|
+
else
|
168
|
+
# type is inconsistent, remove from consideration
|
169
|
+
columnTypes[colIndex] = nil
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
173
|
+
end # end iterate through each row column to determine columnType
|
174
|
+
end # end iterate through each row
|
175
|
+
|
176
|
+
# finally, compare results against first row and "vote" on whether its a header
|
177
|
+
hasHeader = 0
|
178
|
+
columnTypes.each do |colIndex, colVal|
|
179
|
+
if colVal.class == NilClass
|
180
|
+
# ignore
|
181
|
+
elsif (colVal.class != Class) # it's a length
|
182
|
+
if (headerRow[colIndex].strip.length != colVal)
|
183
|
+
hasHeader += 1
|
184
|
+
else
|
185
|
+
hasHeader -= 1
|
186
|
+
end
|
187
|
+
else
|
188
|
+
# determine the type of the header and compare it to the type in the Hash
|
189
|
+
# if the type is the same then vote down otherwise vote up
|
190
|
+
if headerRow[colIndex].strip.to_i.to_s == headerRow[colIndex]
|
191
|
+
if colVal == Integer
|
192
|
+
hasHeader -= 1
|
193
|
+
else
|
194
|
+
hasHeader += 1
|
195
|
+
end
|
196
|
+
elsif headerRow[colIndex].strip.to_f.to_s == headerRow[colIndex]
|
197
|
+
if colVal == Float
|
198
|
+
hasHeader -= 1
|
199
|
+
else
|
200
|
+
hasHeader += 1
|
201
|
+
end
|
202
|
+
end
|
203
|
+
end # end type comparison voting branch
|
204
|
+
end # end voting loop
|
205
|
+
|
206
|
+
return hasHeader > 0
|
207
|
+
end
|
208
|
+
|
209
|
+
def self.max_delim_when_others_are_zero (line)
|
210
|
+
freqOfPossibleDelims = get_freq_of_possible_delims(line)
|
211
|
+
|
212
|
+
maxFreq = 0
|
213
|
+
maxFreqIndex = 0
|
214
|
+
zeroCount = 0
|
215
|
+
freqOfPossibleDelims.each_with_index do |delimFreq, i|
|
216
|
+
if (delimFreq > maxFreq)
|
217
|
+
maxFreq = delimFreq
|
218
|
+
maxFreqIndex = i
|
219
|
+
end
|
220
|
+
zeroCount += 1 if delimFreq == 0
|
221
|
+
end
|
222
|
+
|
223
|
+
if zeroCount >= 3
|
224
|
+
return [',', '\t', ';', '|'][maxFreqIndex]
|
225
|
+
else
|
226
|
+
return '0' #=> '0' is a sentinel value that indicates no delim found
|
227
|
+
end
|
228
|
+
end
|
229
|
+
|
230
|
+
|
231
|
+
def self.get_freq_of_possible_delims (line)
|
232
|
+
freqOfPossibleDelims = Array.new(4) #=> [0 = ','] [1 = '\t'] [2 = ';'] [3 = '|']
|
233
|
+
freqOfPossibleDelims[0] = line.count ","
|
234
|
+
freqOfPossibleDelims[1] = line.count "\t"
|
235
|
+
freqOfPossibleDelims[2] = line.count ";"
|
236
|
+
freqOfPossibleDelims[3] = line.count "|"
|
237
|
+
|
238
|
+
return freqOfPossibleDelims
|
239
|
+
end
|
240
|
+
|
241
|
+
|
242
|
+
private_class_method :max_delim_when_others_are_zero
|
243
|
+
private_class_method :get_freq_of_possible_delims
|
244
|
+
|
245
|
+
end
|
data/test/test_csv_sniffer.rb
CHANGED
@@ -1,78 +1,86 @@
|
|
1
|
-
require 'minitest/autorun'
|
2
|
-
require 'tempfile'
|
3
|
-
require 'csv_sniffer'
|
4
|
-
|
5
|
-
class CsvSnifferTest < Minitest::Test
|
6
|
-
|
7
|
-
@@file1 = Tempfile.new('file1')
|
8
|
-
@@file1.puts "Name,Number"
|
9
|
-
@@file1.puts "John Doe,555-123-4567"
|
10
|
-
@@file1.puts "Jane C. Doe,555-000-1234"
|
11
|
-
@@file1.rewind
|
12
|
-
|
13
|
-
@@file2 = Tempfile.new('file2')
|
14
|
-
@@file2.puts "'Name' |'Number'\t"
|
15
|
-
@@file2.puts "'John Doe'|'555-123-4567'"
|
16
|
-
@@file2.puts "'Jane C. Doe'|'555-000-1234'"
|
17
|
-
@@file2.rewind
|
18
|
-
|
19
|
-
@@file3 = Tempfile.new('file3')
|
20
|
-
@@file3.puts "John Doe;555-123-4567;Good\tdude"
|
21
|
-
@@file3.puts "Jane C. Doe;555-000-1234 ; Great gal"
|
22
|
-
@@file3.puts "John Smith;555-999-1234;Don't know about him"
|
23
|
-
@@file3.rewind
|
24
|
-
|
25
|
-
@@file4 = Tempfile.new('file4')
|
26
|
-
@@file4.puts "Doe, John\t555-123-4567"
|
27
|
-
@@file4.puts "Jane C. Doe\t555-000-1234\t"
|
28
|
-
@@file4.rewind
|
29
|
-
|
30
|
-
@@file5 = Tempfile.new('file5')
|
31
|
-
@@file5.puts '"Doe,,,,,, John"|"555-123-4567"'
|
32
|
-
@@file5.puts '"Jane C. Doe"|"555-000-1234\t"'
|
33
|
-
@@file5.rewind
|
34
|
-
|
35
|
-
@@file6 = Tempfile.new('file6')
|
36
|
-
@@file6.puts 'Name|Phone No.|Age'
|
37
|
-
@@file6.puts 'Doe, John|555-123-4567|31'
|
38
|
-
@@file6.puts 'Doe, Jane C. |555-000-1234|30'
|
39
|
-
@@file6.rewind
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
assert_equal
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
assert_equal
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'tempfile'
|
3
|
+
require 'csv_sniffer'
|
4
|
+
|
5
|
+
class CsvSnifferTest < Minitest::Test
|
6
|
+
|
7
|
+
@@file1 = Tempfile.new('file1')
|
8
|
+
@@file1.puts "Name,Number"
|
9
|
+
@@file1.puts "John Doe,555-123-4567"
|
10
|
+
@@file1.puts "Jane C. Doe,555-000-1234"
|
11
|
+
@@file1.rewind
|
12
|
+
|
13
|
+
@@file2 = Tempfile.new('file2')
|
14
|
+
@@file2.puts "'Name' |'Number'\t"
|
15
|
+
@@file2.puts "'John Doe'|'555-123-4567'"
|
16
|
+
@@file2.puts "'Jane C. Doe'|'555-000-1234'"
|
17
|
+
@@file2.rewind
|
18
|
+
|
19
|
+
@@file3 = Tempfile.new('file3')
|
20
|
+
@@file3.puts "John Doe;555-123-4567;Good\tdude"
|
21
|
+
@@file3.puts "Jane C. Doe;555-000-1234 ; Great gal"
|
22
|
+
@@file3.puts "John Smith;555-999-1234;Don't know about him"
|
23
|
+
@@file3.rewind
|
24
|
+
|
25
|
+
@@file4 = Tempfile.new('file4')
|
26
|
+
@@file4.puts "Doe, John\t555-123-4567"
|
27
|
+
@@file4.puts "Jane C. Doe\t555-000-1234\t"
|
28
|
+
@@file4.rewind
|
29
|
+
|
30
|
+
@@file5 = Tempfile.new('file5')
|
31
|
+
@@file5.puts '"Doe,,,,,, John"|"555-123-4567"'
|
32
|
+
@@file5.puts '"Jane C. Doe"|"555-000-1234\t"'
|
33
|
+
@@file5.rewind
|
34
|
+
|
35
|
+
@@file6 = Tempfile.new('file6')
|
36
|
+
@@file6.puts 'Name|Phone No.|Age'
|
37
|
+
@@file6.puts 'Doe, John|555-123-4567|31'
|
38
|
+
@@file6.puts 'Doe, Jane C. |555-000-1234|30'
|
39
|
+
@@file6.rewind
|
40
|
+
|
41
|
+
@@file7 = Tempfile.new('file7')
|
42
|
+
@@file7.rewind
|
43
|
+
|
44
|
+
def test_file1
|
45
|
+
assert_equal ",", CsvSniffer.detect_delimiter(@@file1.path)
|
46
|
+
assert_equal false, CsvSniffer.is_quote_enclosed?(@@file1.path)
|
47
|
+
assert_equal nil, CsvSniffer.get_quote_char(@@file1.path)
|
48
|
+
assert_equal true, CsvSniffer.has_header?(@@file1.path)
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_file2
|
52
|
+
assert_equal "|", CsvSniffer.detect_delimiter(@@file2.path)
|
53
|
+
assert_equal true, CsvSniffer.is_quote_enclosed?(@@file2.path)
|
54
|
+
assert_equal "'", CsvSniffer.get_quote_char(@@file2.path)
|
55
|
+
assert_equal true, CsvSniffer.has_header?(@@file2.path)
|
56
|
+
end
|
57
|
+
|
58
|
+
def test_file3
|
59
|
+
assert_equal ";", CsvSniffer.detect_delimiter(@@file3.path)
|
60
|
+
assert_equal false, CsvSniffer.is_quote_enclosed?(@@file3.path)
|
61
|
+
assert_equal false, CsvSniffer.has_header?(@@file3.path)
|
62
|
+
end
|
63
|
+
|
64
|
+
def test_file4
|
65
|
+
assert_equal '\t', CsvSniffer.detect_delimiter(@@file4.path)
|
66
|
+
assert_equal nil, CsvSniffer.get_quote_char(@@file4.path)
|
67
|
+
assert_equal false, CsvSniffer.has_header?(@@file4.path)
|
68
|
+
end
|
69
|
+
|
70
|
+
def test_file5
|
71
|
+
assert_equal "|", CsvSniffer.detect_delimiter(@@file5.path)
|
72
|
+
assert_equal true, CsvSniffer.is_quote_enclosed?(@@file5.path)
|
73
|
+
assert_equal '"', CsvSniffer.get_quote_char(@@file5.path)
|
74
|
+
end
|
75
|
+
|
76
|
+
def test_file6
|
77
|
+
assert_equal "|", CsvSniffer.detect_delimiter(@@file6.path)
|
78
|
+
assert_equal true, CsvSniffer.has_header?(@@file6.path)
|
79
|
+
end
|
80
|
+
|
81
|
+
def test_file7
|
82
|
+
assert_equal false, CsvSniffer.has_header?(@@file7.path)
|
83
|
+
assert_equal nil, CsvSniffer.get_quote_char(@@file7.path)
|
84
|
+
assert_equal ",", CsvSniffer.detect_delimiter(@@file7.path)
|
85
|
+
end
|
86
|
+
end
|
metadata
CHANGED
@@ -1,27 +1,27 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv_sniffer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Ojo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-12-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: test-unit
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
27
|
description: CSV Sniffer is a set of functions that allow a user detect the delimiter
|
@@ -33,6 +33,7 @@ executables: []
|
|
33
33
|
extensions: []
|
34
34
|
extra_rdoc_files: []
|
35
35
|
files:
|
36
|
+
- ".gitignore"
|
36
37
|
- LICENSE
|
37
38
|
- README.md
|
38
39
|
- Rakefile
|
@@ -49,17 +50,17 @@ require_paths:
|
|
49
50
|
- lib
|
50
51
|
required_ruby_version: !ruby/object:Gem::Requirement
|
51
52
|
requirements:
|
52
|
-
- -
|
53
|
+
- - ">="
|
53
54
|
- !ruby/object:Gem::Version
|
54
55
|
version: '0'
|
55
56
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
57
|
requirements:
|
57
|
-
- -
|
58
|
+
- - ">="
|
58
59
|
- !ruby/object:Gem::Version
|
59
60
|
version: '0'
|
60
61
|
requirements: []
|
61
62
|
rubyforge_project:
|
62
|
-
rubygems_version: 2.
|
63
|
+
rubygems_version: 2.4.7
|
63
64
|
signing_key:
|
64
65
|
specification_version: 4
|
65
66
|
summary: CSV library for heuristic detection of CSV properties
|