csv_sniffer 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ad85431534479a065a967c635b40262075688649
4
- data.tar.gz: 1bf4f25c6628a1a83c06f36319d9178517d2f6c5
3
+ metadata.gz: 94eaa256ae4c389193159b7e22d4620a7fcd26a0
4
+ data.tar.gz: fb03dbd72b05013f2615f80053c577148959f83a
5
5
  SHA512:
6
- metadata.gz: 75ab53bcd1b9db8a182ff93b09dbdbf59ac7a934fef724905248ead35b4fe3f2e0d07c88314a001d7145fa0ca29f3f65c22bcde5d88447e6fd7f34164711e892
7
- data.tar.gz: ed4b9829deac6773b429d5217e5778df3581e8048b46e19170de1ad91b8c41b3bc5677d2f7548213c0c907dac90169e5fc9b22d6e4f754a2d93950517ab49092
6
+ metadata.gz: 7ce26eb29a3e246b597932fe3c66fca84fa5763a261a6c3e87e6af4565ced8b8178a439fe8b006b9435620caa72f992bd122423977097f9efea396569abffaa6
7
+ data.tar.gz: 63d1385bd21fc1362f6c4081c85857f22e1b397b33df92665c34f0c7c229f57b31e58d0dd4c4a1e93bdffe85c3ebe0b5dee3144620e185d4da88b759306d27b5
@@ -0,0 +1,2 @@
1
+ *.gem
2
+
data/LICENSE CHANGED
@@ -1,21 +1,21 @@
1
- The MIT License (MIT)
2
-
3
- Copyright (c) 2015 Tim Ojo
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in
13
- all copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
- THE SOFTWARE.
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Tim Ojo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md CHANGED
@@ -1,67 +1,67 @@
1
- # CSV Sniffer
2
-
3
- CSV Sniffer is a set of functions that allow a user heuristically detect the delimiter character in use, whether the values in the CSV file are quote enclosed, whether the file contains a header, and more. The library is intended to detect information to be used as configuration inputs for CSV parsers. For delimiter detection the following delimiters are currently supported `[",", "\t", "|", ";"]`
4
-
5
- To ensure high performance and a low memory footprint, the library uses as little information as needed to make accurate decisions. Contributors are welcome to
6
- improve the algorithms in use.
7
-
8
-
9
- ## Installation
10
-
11
- ```
12
- $ gem install csv_sniffer
13
- ```
14
-
15
- ## Usage
16
-
17
- Given a `some_file.csv` file:
18
-
19
- ```csv
20
- Name;Phone
21
- John Doe ;555-481-2345
22
- Jane C. Doe;555-123-4567
23
- ```
24
-
25
- Detection usage is as follows:
26
-
27
- ```rb
28
- require "csv_sniffer"
29
-
30
- delim = CsvSniffer.detect_delimiter("/path/to/some_file.csv") #=> ";"
31
- is_quote_enclosed = CsvSniffer.is_quote_enclosed?("/path/to/some_file.csv") #=> false
32
- has_header = CsvSniffer.has_header?("/path/to/some_file.csv") #=> true
33
- ```
34
-
35
- See [`test_csv_sniffer.rb`](test/test_csv_sniffer.rb) for more examples.
36
-
37
-
38
- ## Tests
39
-
40
- ```
41
- $ rake test
42
- ```
43
-
44
-
45
- ## License
46
-
47
- The MIT License (MIT)
48
-
49
- Copyright © 2015 Tim Ojo
50
-
51
- Permission is hereby granted, free of charge, to any person obtaining a copy
52
- of this software and associated documentation files (the "Software"), to deal
53
- in the Software without restriction, including without limitation the rights
54
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
55
- copies of the Software, and to permit persons to whom the Software is
56
- furnished to do so, subject to the following conditions:
57
-
58
- The above copyright notice and this permission notice shall be included in
59
- all copies or substantial portions of the Software.
60
-
61
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
62
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
63
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
64
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
65
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
66
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
67
- THE SOFTWARE.
1
+ # CSV Sniffer
2
+
3
+ CSV Sniffer is a set of functions that allow a user to heuristically detect the delimiter character in use, whether the values in the CSV file are quote-enclosed, whether the file contains a header, and more. The library is intended to detect information to be used as configuration inputs for CSV parsers. For delimiter detection, the following delimiters are currently supported: `[",", "\t", "|", ";"]`.
4
+
5
+ To ensure high performance and a low memory footprint, the library uses as little information as needed to make accurate decisions. Contributors are welcome to
6
+ improve the algorithms in use.
7
+
8
+
9
+ ## Installation
10
+
11
+ ```
12
+ $ gem install csv_sniffer
13
+ ```
14
+
15
+ ## Usage
16
+
17
+ Given a `some_file.csv` file:
18
+
19
+ ```csv
20
+ Name;Phone
21
+ John Doe ;555-481-2345
22
+ Jane C. Doe;555-123-4567
23
+ ```
24
+
25
+ Detection usage is as follows:
26
+
27
+ ```rb
28
+ require "csv_sniffer"
29
+
30
+ delim = CsvSniffer.detect_delimiter("/path/to/some_file.csv") #=> ";"
31
+ is_quote_enclosed = CsvSniffer.is_quote_enclosed?("/path/to/some_file.csv") #=> false
32
+ has_header = CsvSniffer.has_header?("/path/to/some_file.csv") #=> true
33
+ ```
34
+
35
+ See [`test_csv_sniffer.rb`](test/test_csv_sniffer.rb) for more examples.
36
+
37
+
38
+ ## Tests
39
+
40
+ ```
41
+ $ rake test
42
+ ```
43
+
44
+
45
+ ## License
46
+
47
+ The MIT License (MIT)
48
+
49
+ Copyright © 2015 Tim Ojo
50
+
51
+ Permission is hereby granted, free of charge, to any person obtaining a copy
52
+ of this software and associated documentation files (the "Software"), to deal
53
+ in the Software without restriction, including without limitation the rights
54
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
55
+ copies of the Software, and to permit persons to whom the Software is
56
+ furnished to do so, subject to the following conditions:
57
+
58
+ The above copyright notice and this permission notice shall be included in
59
+ all copies or substantial portions of the Software.
60
+
61
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
62
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
63
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
64
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
65
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
66
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
67
+ THE SOFTWARE.
data/Rakefile CHANGED
@@ -1,8 +1,8 @@
1
- require 'rake/testtask'
2
-
3
- Rake::TestTask.new do |t|
4
- t.libs << 'test'
5
- end
6
-
7
- desc "Run tests"
8
- task :default => :test
1
+ require 'rake/testtask'
2
+
3
+ Rake::TestTask.new do |t|
4
+ t.libs << 'test'
5
+ end
6
+
7
+ desc "Run tests"
8
+ task :default => :test
@@ -1,15 +1,15 @@
1
- Gem::Specification.new do |s|
2
- s.name = 'csv_sniffer'
3
- s.version = '0.1.1'
4
- s.date = '2015-10-16'
5
- s.summary = "CSV library for heuristic detection of CSV properties"
6
- s.description = "CSV Sniffer is a set of functions that allow a user detect the delimiter character in use, whether the values in the CSV file are quote enclosed, whether the file contains a header, and more. The library is intended to detect information to be used as configuration inputs for CSV parsers."
7
- s.authors = ["Tim Ojo"]
8
- s.email = 'ojo.tim@gmail.com'
9
- s.homepage = 'https://github.com/tim-ojo/csv_sniffer'
10
- s.license = 'MIT'
11
-
12
- s.files = `git ls-files`.split($/)
13
- s.test_files = s.files.grep(/^test/)
14
- s.add_development_dependency "test-unit"
15
- end
1
+ Gem::Specification.new do |s|
2
+ s.name = 'csv_sniffer'
3
+ s.version = '0.1.2'
4
+ s.date = '2015-12-28'
5
+ s.summary = "CSV library for heuristic detection of CSV properties"
6
+ s.description = "CSV Sniffer is a set of functions that allow a user detect the delimiter character in use, whether the values in the CSV file are quote enclosed, whether the file contains a header, and more. The library is intended to detect information to be used as configuration inputs for CSV parsers."
7
+ s.authors = ["Tim Ojo"]
8
+ s.email = 'ojo.tim@gmail.com'
9
+ s.homepage = 'https://github.com/tim-ojo/csv_sniffer'
10
+ s.license = 'MIT'
11
+
12
+ s.files = `git ls-files`.split($/)
13
+ s.test_files = s.files.grep(/^test/)
14
+ s.add_development_dependency 'test-unit', '~> 0'
15
+ end
@@ -1,233 +1,245 @@
1
- # This class contains functions to heuristically decipher certain information from a CSV file
2
- class CsvSniffer
3
-
4
- # Reads the first line of the csv and returns true if the line starts and ends with " or '
5
- #
6
- # Example:
7
- # CsvSniffer.is_quote_enclosed?("path/to/file")
8
- # => true
9
- #
10
- # Arguments:
11
- # filepath: (String)
12
-
13
- def self.is_quote_enclosed?(filepath)
14
- line = File.open(filepath, &:readline)
15
- line.chomp!.strip!
16
- return line.start_with?('"') && line.end_with?('"') || line.start_with?("'") && line.end_with?("'")
17
- end
18
-
19
-
20
- # Gets the quote character in use in the file if one exists. Returns "'", """ or nil
21
- #
22
- # Example:
23
- # CsvSniffer.get_quote_char("path/to/file")
24
- # => "
25
- #
26
- # Arguments:
27
- # filepath: (String)
28
-
29
- def self.get_quote_char(filepath)
30
- if is_quote_enclosed?(filepath)
31
- line = File.open(filepath, &:readline)
32
- line.chomp!.strip!
33
- return line[0]
34
- else
35
- return nil
36
- end
37
- end
38
-
39
-
40
- # Heuristically detects the delimiter used in the CSV file and returns it
41
- #
42
- # Example:
43
- # CsvSniffer.detect_delimiter("path/to/file")
44
- # => "|"
45
- #
46
- # Arguments:
47
- # filepath: (String)
48
-
49
- def self.detect_delimiter (filepath)
50
- # If the csv is quote enclosed then just get the delimiter after the first cell. Otherwise...
51
- # Get the first line and count how many of the possible delimiters are present. If there is >1 of one of the
52
- # delimiters and 0 of the others then, then we pick the max. If there are more than 0 of any of the others then
53
- # we repeat the counting procedure for the next 50 lines until the condition is satisfied.
54
- # If the condition is never satisfied then we simply pick the delimiter that occurs the most frequently, defaulting
55
- # to the comma. Unless that delimeter's count is equal to the tab or pipe delimiter's count. In that case we return \t or |
56
-
57
- if is_quote_enclosed?(filepath)
58
- line = File.open(filepath, &:readline)
59
- line.chomp!.strip!
60
- m = /["'].+?["']([,|;\t])/.match(line)
61
- if (m)
62
- return m[1]
63
- end
64
- end
65
-
66
- lineCount = 0
67
- File.foreach(filepath) do |line|
68
- detectedDelim = max_delim_when_others_are_zero(line)
69
- if detectedDelim != '0' #=> '0' is a sentinel value that indicates no delim found
70
- return detectedDelim
71
- end
72
-
73
- lineCount += 1;
74
- break if lineCount == 50
75
- end
76
-
77
- # If I got here I'm going to pick the default by counting the delimiters on the first line and returning the max
78
- line = File.open(filepath, &:readline)
79
- freqOfPossibleDelims = get_freq_of_possible_delims(line)
80
-
81
- maxFreq = 0
82
- maxFreqIndex = 0
83
- freqOfPossibleDelims.each_with_index do |delimFreq, i|
84
- if (delimFreq > maxFreq)
85
- maxFreq = delimFreq
86
- maxFreqIndex = i
87
- end
88
- end
89
-
90
- # Favor "\t" and "|" over ","
91
- if (maxFreq == freqOfPossibleDelims[1])
92
- return '\t'
93
- elsif (maxFreq == freqOfPossibleDelims[3])
94
- return "|"
95
- else
96
- return [",", '\t', ";", "|"][maxFreqIndex]
97
- end
98
- end
99
-
100
- # Heuristically detects whether or not the csv file uses the first line as a header
101
- #
102
- # Example:
103
- # CsvSniffer.has_header?("path/to/file")
104
- # => false
105
- #
106
- # Arguments:
107
- # filepath: (String)
108
-
109
- def self.has_header?(filepath)
110
- # Creates a dictionary of types of data in each column. If any
111
- # column is of a single type (say, integers), *except* for the first
112
- # row, then the first row is presumed to be labels. If the type
113
- # can't be determined, it is assumed to be a string in which case
114
- # the length of the string is the determining factor: if all of the
115
- # rows except for the first are the same length, it's a header.
116
- # Finally, a 'vote' is taken at the end for each column, adding or
117
- # subtracting from the likelihood of the first row being a header.
118
- delim = detect_delimiter(filepath)
119
- if (delim == "\\t")
120
- delim = "\t"
121
- end
122
-
123
- headerRow = nil
124
- lineCount = 0
125
- columnTypes = Hash.new
126
- File.foreach(filepath) do |line|
127
- if (!headerRow) # assume the first row is a header
128
- headerRow = line.split(delim)
129
-
130
- headerRow.each_index do |colIndex|
131
- columnTypes[colIndex] = nil
132
- end
133
- next
134
- end
135
-
136
- lineCount += 1
137
- break if lineCount == 50
138
-
139
- row = line.split(delim)
140
- columnTypes.each_key do |colIndex|
141
- thisColType = nil
142
- if (row[colIndex].strip.to_i.to_s == row[colIndex])
143
- thisColType = Integer
144
- elsif (row[colIndex].strip.to_f.to_s == row[colIndex])
145
- thisColType = Float
146
- else
147
- # fallback to the length of the string
148
- thisColType = row[colIndex].strip.length
149
- end
150
-
151
- if (thisColType != columnTypes[colIndex])
152
- if (columnTypes[colIndex] == nil)
153
- # add new column type
154
- columnTypes[colIndex] = thisColType
155
- else
156
- # type is inconsistent, remove from consideration
157
- columnTypes[colIndex] = nil
158
- end
159
- end
160
-
161
- end # end iterate through each row column to determine columnType
162
- end # end iterate through each row
163
-
164
- # finally, compare results against first row and "vote" on whether its a header
165
- hasHeader = 0
166
- columnTypes.each do |colIndex, colVal|
167
- if colVal.class == NilClass
168
- # ignore
169
- elsif (colVal.class != Class) # it's a length
170
- if (headerRow[colIndex].strip.length != colVal)
171
- hasHeader += 1
172
- else
173
- hasHeader -= 1
174
- end
175
- else
176
- # determine the type of the header and compare it to the type in the Hash
177
- # if the type is the same then vote down otherwise vote up
178
- if headerRow[colIndex].strip.to_i.to_s == headerRow[colIndex]
179
- if colVal == Integer
180
- hasHeader -= 1
181
- else
182
- hasHeader += 1
183
- end
184
- elsif headerRow[colIndex].strip.to_f.to_s == headerRow[colIndex]
185
- if colVal == Float
186
- hasHeader -= 1
187
- else
188
- hasHeader += 1
189
- end
190
- end
191
- end # end type comparison voting branch
192
- end # end voting loop
193
-
194
- return hasHeader > 0
195
- end
196
-
197
- def self.max_delim_when_others_are_zero (line)
198
- freqOfPossibleDelims = get_freq_of_possible_delims(line)
199
-
200
- maxFreq = 0
201
- maxFreqIndex = 0
202
- zeroCount = 0
203
- freqOfPossibleDelims.each_with_index do |delimFreq, i|
204
- if (delimFreq > maxFreq)
205
- maxFreq = delimFreq
206
- maxFreqIndex = i
207
- end
208
- zeroCount += 1 if delimFreq == 0
209
- end
210
-
211
- if zeroCount >= 3
212
- return [',', '\t', ';', '|'][maxFreqIndex]
213
- else
214
- return '0' #=> '0' is a sentinel value that indicates no delim found
215
- end
216
- end
217
-
218
-
219
- def self.get_freq_of_possible_delims (line)
220
- freqOfPossibleDelims = Array.new(4) #=> [0 = ','] [1 = '\t'] [2 = ';'] [3 = '|']
221
- freqOfPossibleDelims[0] = line.count ","
222
- freqOfPossibleDelims[1] = line.count "\t"
223
- freqOfPossibleDelims[2] = line.count ";"
224
- freqOfPossibleDelims[3] = line.count "|"
225
-
226
- return freqOfPossibleDelims
227
- end
228
-
229
-
230
- private_class_method :max_delim_when_others_are_zero
231
- private_class_method :get_freq_of_possible_delims
232
-
233
- end
1
+ # This class contains functions to heuristically decipher certain information from a CSV file
2
+ class CsvSniffer
3
+
4
+ # Reads the first line of the csv and returns true if the line starts and ends with " or '
5
+ #
6
+ # Example:
7
+ # CsvSniffer.is_quote_enclosed?("path/to/file")
8
+ # => true
9
+ #
10
+ # Arguments:
11
+ # filepath: (String)
12
+
13
+ def self.is_quote_enclosed?(filepath)
14
+ begin
15
+ line = File.open(filepath, &:readline)
16
+ line.chomp!.strip!
17
+ return line.start_with?('"') && line.end_with?('"') || line.start_with?("'") && line.end_with?("'")
18
+ rescue EOFError
19
+ false
20
+ end
21
+ end
22
+
23
+
24
+ # Gets the quote character in use in the file if one exists. Returns "'", """ or nil
25
+ #
26
+ # Example:
27
+ # CsvSniffer.get_quote_char("path/to/file")
28
+ # => "
29
+ #
30
+ # Arguments:
31
+ # filepath: (String)
32
+
33
+ def self.get_quote_char(filepath)
34
+ begin
35
+ if is_quote_enclosed?(filepath)
36
+ line = File.open(filepath, &:readline)
37
+ line.chomp!.strip!
38
+ return line[0]
39
+ else
40
+ return nil
41
+ end
42
+ rescue EOFError
43
+ nil
44
+ end
45
+ end
46
+
47
+
48
+ # Heuristically detects the delimiter used in the CSV file and returns it
49
+ #
50
+ # Example:
51
+ # CsvSniffer.detect_delimiter("path/to/file")
52
+ # => "|"
53
+ #
54
+ # Arguments:
55
+ # filepath: (String)
56
+
57
+ def self.detect_delimiter (filepath)
58
+ # If the csv is quote enclosed then just get the delimiter after the first cell. Otherwise...
59
+ # Get the first line and count how many of the possible delimiters are present. If there is >1 of one of the
60
+ # delimiters and 0 of the others, then we pick the max. If there are more than 0 of any of the others then
61
+ # we repeat the counting procedure for the next 50 lines until the condition is satisfied.
62
+ # If the condition is never satisfied then we simply pick the delimiter that occurs the most frequently, defaulting
63
+ # to the comma, unless that delimiter's count is equal to the tab or pipe delimiter's count. In that case we return \t or |
64
+
65
+ if is_quote_enclosed?(filepath)
66
+ line = File.open(filepath, &:readline)
67
+ line.chomp!.strip!
68
+ m = /["'].+?["']([,|;\t])/.match(line)
69
+ if (m)
70
+ return m[1]
71
+ end
72
+ end
73
+
74
+ lineCount = 0
75
+ File.foreach(filepath) do |line|
76
+ detectedDelim = max_delim_when_others_are_zero(line)
77
+ if detectedDelim != '0' #=> '0' is a sentinel value that indicates no delim found
78
+ return detectedDelim
79
+ end
80
+
81
+ lineCount += 1;
82
+ break if lineCount == 50
83
+ end
84
+
85
+ # If I got here I'm going to pick the default by counting the delimiters on the first line and returning the max
86
+ begin
87
+ line = File.open(filepath, &:readline)
88
+ freqOfPossibleDelims = get_freq_of_possible_delims(line)
89
+ rescue EOFError
90
+ freqOfPossibleDelims = [0,-1,-1,-1]
91
+ end
92
+
93
+ maxFreq = 0
94
+ maxFreqIndex = 0
95
+ freqOfPossibleDelims.each_with_index do |delimFreq, i|
96
+ if (delimFreq > maxFreq)
97
+ maxFreq = delimFreq
98
+ maxFreqIndex = i
99
+ end
100
+ end
101
+
102
+ # Favor "\t" and "|" over ","
103
+ if (maxFreq == freqOfPossibleDelims[1])
104
+ return '\t'
105
+ elsif (maxFreq == freqOfPossibleDelims[3])
106
+ return "|"
107
+ else
108
+ return [",", '\t', ";", "|"][maxFreqIndex]
109
+ end
110
+ end
111
+
112
+ # Heuristically detects whether or not the csv file uses the first line as a header
113
+ #
114
+ # Example:
115
+ # CsvSniffer.has_header?("path/to/file")
116
+ # => false
117
+ #
118
+ # Arguments:
119
+ # filepath: (String)
120
+
121
+ def self.has_header?(filepath)
122
+ # Creates a dictionary of types of data in each column. If any
123
+ # column is of a single type (say, integers), *except* for the first
124
+ # row, then the first row is presumed to be labels. If the type
125
+ # can't be determined, it is assumed to be a string in which case
126
+ # the length of the string is the determining factor: if all of the
127
+ # rows except for the first are the same length, it's a header.
128
+ # Finally, a 'vote' is taken at the end for each column, adding or
129
+ # subtracting from the likelihood of the first row being a header.
130
+ delim = detect_delimiter(filepath)
131
+ if (delim == "\\t")
132
+ delim = "\t"
133
+ end
134
+
135
+ headerRow = nil
136
+ lineCount = 0
137
+ columnTypes = Hash.new
138
+ File.foreach(filepath) do |line|
139
+ if (!headerRow) # assume the first row is a header
140
+ headerRow = line.split(delim)
141
+
142
+ headerRow.each_index do |colIndex|
143
+ columnTypes[colIndex] = nil
144
+ end
145
+ next
146
+ end
147
+
148
+ lineCount += 1
149
+ break if lineCount == 50
150
+
151
+ row = line.split(delim)
152
+ columnTypes.each_key do |colIndex|
153
+ thisColType = nil
154
+ if (row[colIndex].strip.to_i.to_s == row[colIndex])
155
+ thisColType = Integer
156
+ elsif (row[colIndex].strip.to_f.to_s == row[colIndex])
157
+ thisColType = Float
158
+ else
159
+ # fallback to the length of the string
160
+ thisColType = row[colIndex].strip.length
161
+ end
162
+
163
+ if (thisColType != columnTypes[colIndex])
164
+ if (columnTypes[colIndex] == nil)
165
+ # add new column type
166
+ columnTypes[colIndex] = thisColType
167
+ else
168
+ # type is inconsistent, remove from consideration
169
+ columnTypes[colIndex] = nil
170
+ end
171
+ end
172
+
173
+ end # end iterate through each row column to determine columnType
174
+ end # end iterate through each row
175
+
176
+ # finally, compare results against first row and "vote" on whether it's a header
177
+ hasHeader = 0
178
+ columnTypes.each do |colIndex, colVal|
179
+ if colVal.class == NilClass
180
+ # ignore
181
+ elsif (colVal.class != Class) # it's a length
182
+ if (headerRow[colIndex].strip.length != colVal)
183
+ hasHeader += 1
184
+ else
185
+ hasHeader -= 1
186
+ end
187
+ else
188
+ # determine the type of the header and compare it to the type in the Hash
189
+ # if the type is the same then vote down otherwise vote up
190
+ if headerRow[colIndex].strip.to_i.to_s == headerRow[colIndex]
191
+ if colVal == Integer
192
+ hasHeader -= 1
193
+ else
194
+ hasHeader += 1
195
+ end
196
+ elsif headerRow[colIndex].strip.to_f.to_s == headerRow[colIndex]
197
+ if colVal == Float
198
+ hasHeader -= 1
199
+ else
200
+ hasHeader += 1
201
+ end
202
+ end
203
+ end # end type comparison voting branch
204
+ end # end voting loop
205
+
206
+ return hasHeader > 0
207
+ end
208
+
209
+ def self.max_delim_when_others_are_zero (line)
210
+ freqOfPossibleDelims = get_freq_of_possible_delims(line)
211
+
212
+ maxFreq = 0
213
+ maxFreqIndex = 0
214
+ zeroCount = 0
215
+ freqOfPossibleDelims.each_with_index do |delimFreq, i|
216
+ if (delimFreq > maxFreq)
217
+ maxFreq = delimFreq
218
+ maxFreqIndex = i
219
+ end
220
+ zeroCount += 1 if delimFreq == 0
221
+ end
222
+
223
+ if zeroCount >= 3
224
+ return [',', '\t', ';', '|'][maxFreqIndex]
225
+ else
226
+ return '0' #=> '0' is a sentinel value that indicates no delim found
227
+ end
228
+ end
229
+
230
+
231
+ def self.get_freq_of_possible_delims (line)
232
+ freqOfPossibleDelims = Array.new(4) #=> [0 = ','] [1 = '\t'] [2 = ';'] [3 = '|']
233
+ freqOfPossibleDelims[0] = line.count ","
234
+ freqOfPossibleDelims[1] = line.count "\t"
235
+ freqOfPossibleDelims[2] = line.count ";"
236
+ freqOfPossibleDelims[3] = line.count "|"
237
+
238
+ return freqOfPossibleDelims
239
+ end
240
+
241
+
242
+ private_class_method :max_delim_when_others_are_zero
243
+ private_class_method :get_freq_of_possible_delims
244
+
245
+ end
@@ -1,78 +1,86 @@
1
- require 'minitest/autorun'
2
- require 'tempfile'
3
- require 'csv_sniffer'
4
-
5
- class CsvSnifferTest < Minitest::Test
6
-
7
- @@file1 = Tempfile.new('file1')
8
- @@file1.puts "Name,Number"
9
- @@file1.puts "John Doe,555-123-4567"
10
- @@file1.puts "Jane C. Doe,555-000-1234"
11
- @@file1.rewind
12
-
13
- @@file2 = Tempfile.new('file2')
14
- @@file2.puts "'Name' |'Number'\t"
15
- @@file2.puts "'John Doe'|'555-123-4567'"
16
- @@file2.puts "'Jane C. Doe'|'555-000-1234'"
17
- @@file2.rewind
18
-
19
- @@file3 = Tempfile.new('file3')
20
- @@file3.puts "John Doe;555-123-4567;Good\tdude"
21
- @@file3.puts "Jane C. Doe;555-000-1234 ; Great gal"
22
- @@file3.puts "John Smith;555-999-1234;Don't know about him"
23
- @@file3.rewind
24
-
25
- @@file4 = Tempfile.new('file4')
26
- @@file4.puts "Doe, John\t555-123-4567"
27
- @@file4.puts "Jane C. Doe\t555-000-1234\t"
28
- @@file4.rewind
29
-
30
- @@file5 = Tempfile.new('file5')
31
- @@file5.puts '"Doe,,,,,, John"|"555-123-4567"'
32
- @@file5.puts '"Jane C. Doe"|"555-000-1234\t"'
33
- @@file5.rewind
34
-
35
- @@file6 = Tempfile.new('file6')
36
- @@file6.puts 'Name|Phone No.|Age'
37
- @@file6.puts 'Doe, John|555-123-4567|31'
38
- @@file6.puts 'Doe, Jane C. |555-000-1234|30'
39
- @@file6.rewind
40
-
41
- def test_file1
42
- assert_equal ",", CsvSniffer.detect_delimiter(@@file1.path)
43
- assert_equal false, CsvSniffer.is_quote_enclosed?(@@file1.path)
44
- assert_equal nil, CsvSniffer.get_quote_char(@@file1.path)
45
- assert_equal true, CsvSniffer.has_header?(@@file1.path)
46
- end
47
-
48
- def test_file2
49
- assert_equal "|", CsvSniffer.detect_delimiter(@@file2.path)
50
- assert_equal true, CsvSniffer.is_quote_enclosed?(@@file2.path)
51
- assert_equal "'", CsvSniffer.get_quote_char(@@file2.path)
52
- assert_equal true, CsvSniffer.has_header?(@@file2.path)
53
- end
54
-
55
- def test_file3
56
- assert_equal ";", CsvSniffer.detect_delimiter(@@file3.path)
57
- assert_equal false, CsvSniffer.is_quote_enclosed?(@@file3.path)
58
- assert_equal false, CsvSniffer.has_header?(@@file3.path)
59
- end
60
-
61
- def test_file4
62
- assert_equal '\t', CsvSniffer.detect_delimiter(@@file4.path)
63
- assert_equal nil, CsvSniffer.get_quote_char(@@file4.path)
64
- assert_equal false, CsvSniffer.has_header?(@@file4.path)
65
- end
66
-
67
- def test_file5
68
- assert_equal "|", CsvSniffer.detect_delimiter(@@file5.path)
69
- assert_equal true, CsvSniffer.is_quote_enclosed?(@@file5.path)
70
- assert_equal '"', CsvSniffer.get_quote_char(@@file5.path)
71
- end
72
-
73
- def test_file6
74
- assert_equal "|", CsvSniffer.detect_delimiter(@@file6.path)
75
- assert_equal true, CsvSniffer.has_header?(@@file6.path)
76
- end
77
-
78
- end
1
+ require 'minitest/autorun'
2
+ require 'tempfile'
3
+ require 'csv_sniffer'
4
+
5
+ class CsvSnifferTest < Minitest::Test
6
+
7
+ @@file1 = Tempfile.new('file1')
8
+ @@file1.puts "Name,Number"
9
+ @@file1.puts "John Doe,555-123-4567"
10
+ @@file1.puts "Jane C. Doe,555-000-1234"
11
+ @@file1.rewind
12
+
13
+ @@file2 = Tempfile.new('file2')
14
+ @@file2.puts "'Name' |'Number'\t"
15
+ @@file2.puts "'John Doe'|'555-123-4567'"
16
+ @@file2.puts "'Jane C. Doe'|'555-000-1234'"
17
+ @@file2.rewind
18
+
19
+ @@file3 = Tempfile.new('file3')
20
+ @@file3.puts "John Doe;555-123-4567;Good\tdude"
21
+ @@file3.puts "Jane C. Doe;555-000-1234 ; Great gal"
22
+ @@file3.puts "John Smith;555-999-1234;Don't know about him"
23
+ @@file3.rewind
24
+
25
+ @@file4 = Tempfile.new('file4')
26
+ @@file4.puts "Doe, John\t555-123-4567"
27
+ @@file4.puts "Jane C. Doe\t555-000-1234\t"
28
+ @@file4.rewind
29
+
30
+ @@file5 = Tempfile.new('file5')
31
+ @@file5.puts '"Doe,,,,,, John"|"555-123-4567"'
32
+ @@file5.puts '"Jane C. Doe"|"555-000-1234\t"'
33
+ @@file5.rewind
34
+
35
+ @@file6 = Tempfile.new('file6')
36
+ @@file6.puts 'Name|Phone No.|Age'
37
+ @@file6.puts 'Doe, John|555-123-4567|31'
38
+ @@file6.puts 'Doe, Jane C. |555-000-1234|30'
39
+ @@file6.rewind
40
+
41
+ @@file7 = Tempfile.new('file7')
42
+ @@file7.rewind
43
+
44
+ def test_file1
45
+ assert_equal ",", CsvSniffer.detect_delimiter(@@file1.path)
46
+ assert_equal false, CsvSniffer.is_quote_enclosed?(@@file1.path)
47
+ assert_equal nil, CsvSniffer.get_quote_char(@@file1.path)
48
+ assert_equal true, CsvSniffer.has_header?(@@file1.path)
49
+ end
50
+
51
+ def test_file2
52
+ assert_equal "|", CsvSniffer.detect_delimiter(@@file2.path)
53
+ assert_equal true, CsvSniffer.is_quote_enclosed?(@@file2.path)
54
+ assert_equal "'", CsvSniffer.get_quote_char(@@file2.path)
55
+ assert_equal true, CsvSniffer.has_header?(@@file2.path)
56
+ end
57
+
58
+ def test_file3
59
+ assert_equal ";", CsvSniffer.detect_delimiter(@@file3.path)
60
+ assert_equal false, CsvSniffer.is_quote_enclosed?(@@file3.path)
61
+ assert_equal false, CsvSniffer.has_header?(@@file3.path)
62
+ end
63
+
64
+ def test_file4
65
+ assert_equal '\t', CsvSniffer.detect_delimiter(@@file4.path)
66
+ assert_equal nil, CsvSniffer.get_quote_char(@@file4.path)
67
+ assert_equal false, CsvSniffer.has_header?(@@file4.path)
68
+ end
69
+
70
+ def test_file5
71
+ assert_equal "|", CsvSniffer.detect_delimiter(@@file5.path)
72
+ assert_equal true, CsvSniffer.is_quote_enclosed?(@@file5.path)
73
+ assert_equal '"', CsvSniffer.get_quote_char(@@file5.path)
74
+ end
75
+
76
+ def test_file6
77
+ assert_equal "|", CsvSniffer.detect_delimiter(@@file6.path)
78
+ assert_equal true, CsvSniffer.has_header?(@@file6.path)
79
+ end
80
+
81
+ def test_file7
82
+ assert_equal false, CsvSniffer.has_header?(@@file7.path)
83
+ assert_equal nil, CsvSniffer.get_quote_char(@@file7.path)
84
+ assert_equal ",", CsvSniffer.detect_delimiter(@@file7.path)
85
+ end
86
+ end
metadata CHANGED
@@ -1,27 +1,27 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv_sniffer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Ojo
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-16 00:00:00.000000000 Z
11
+ date: 2015-12-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: test-unit
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - '>='
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - '>='
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
27
  description: CSV Sniffer is a set of functions that allow a user detect the delimiter
@@ -33,6 +33,7 @@ executables: []
33
33
  extensions: []
34
34
  extra_rdoc_files: []
35
35
  files:
36
+ - ".gitignore"
36
37
  - LICENSE
37
38
  - README.md
38
39
  - Rakefile
@@ -49,17 +50,17 @@ require_paths:
49
50
  - lib
50
51
  required_ruby_version: !ruby/object:Gem::Requirement
51
52
  requirements:
52
- - - '>='
53
+ - - ">="
53
54
  - !ruby/object:Gem::Version
54
55
  version: '0'
55
56
  required_rubygems_version: !ruby/object:Gem::Requirement
56
57
  requirements:
57
- - - '>='
58
+ - - ">="
58
59
  - !ruby/object:Gem::Version
59
60
  version: '0'
60
61
  requirements: []
61
62
  rubyforge_project:
62
- rubygems_version: 2.0.14
63
+ rubygems_version: 2.4.7
63
64
  signing_key:
64
65
  specification_version: 4
65
66
  summary: CSV library for heuristic detection of CSV properties