csv_sniffer 0.0.2 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/csv_sniffer.gemspec +2 -2
- data/lib/csv_sniffer.rb +98 -2
- data/test/test_csv_sniffer.rb +10 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e14c6e37b6559305010247b4ba71c6d1b87d4f5a
|
4
|
+
data.tar.gz: 909f073414ada386352ae1a7462d4b227681551f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 045d74a5dc4c0a134e157f2b0c18bd713c5e85cfcdb696c77b74b0d33f78489db9d172936d25979bf1709a2f45dd662a32469df7da7059eefc5d511d26388dce
|
7
|
+
data.tar.gz: 01f3db3f4953cbd489b4ef90f0c5771e4253da659dc9bb398e4fc2d1a9d421f560f03bb88dd35e6bbf30b3f23e92ec34994ed0563bd6bb891b4ab2454dda4645
|
data/csv_sniffer.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'csv_sniffer'
|
3
|
-
s.version = '0.0.2'
|
4
|
-
s.date = '2015-10-
|
3
|
+
s.version = '0.1.0'
|
4
|
+
s.date = '2015-10-15'
|
5
5
|
s.summary = "CSV library for heuristic detection of CSV properties"
|
6
6
|
s.description = "CSV Sniffer is intended to provide utilities that will allow a user detect the delimiter character in use, whether the values in the CSV file are quote enclosed, whether the file contains a header, and more. The library is intended to detect information to be used as configuration inputs for CSV parsers."
|
7
7
|
s.authors = ["Tim Ojo"]
|
data/lib/csv_sniffer.rb
CHANGED
@@ -89,14 +89,110 @@ class CsvSniffer
|
|
89
89
|
|
90
90
|
# Favor "\t" and "|" over ","
|
91
91
|
if (maxFreq == freqOfPossibleDelims[1])
|
92
|
-
return
|
92
|
+
return '\t'
|
93
93
|
elsif (maxFreq == freqOfPossibleDelims[3])
|
94
94
|
return "|"
|
95
95
|
else
|
96
|
-
return [",",
|
96
|
+
return [",", '\t', ";", "|"][maxFreqIndex]
|
97
97
|
end
|
98
98
|
end
|
99
99
|
|
100
|
+
# Heuristically detects whether or not the csv file uses the first line as a header
|
101
|
+
#
|
102
|
+
# Example:
|
103
|
+
# CsvSniffer.has_header?("path/to/file")
|
104
|
+
# => false
|
105
|
+
#
|
106
|
+
# Arguments:
|
107
|
+
# filepath: (String)
|
108
|
+
|
109
|
+
def self.has_header?(filepath)
|
110
|
+
# Creates a dictionary of types of data in each column. If any
|
111
|
+
# column is of a single type (say, integers), *except* for the first
|
112
|
+
# row, then the first row is presumed to be labels. If the type
|
113
|
+
# can't be determined, it is assumed to be a string in which case
|
114
|
+
# the length of the string is the determining factor: if all of the
|
115
|
+
# rows except for the first are the same length, it's a header.
|
116
|
+
# Finally, a 'vote' is taken at the end for each column, adding or
|
117
|
+
# subtracting from the likelihood of the first row being a header.
|
118
|
+
delim = detect_delimiter(filepath)
|
119
|
+
if (delim == "\\t")
|
120
|
+
delim = "\t"
|
121
|
+
end
|
122
|
+
|
123
|
+
headerRow = nil
|
124
|
+
lineCount = 0
|
125
|
+
columnTypes = Hash.new
|
126
|
+
File.foreach(filepath) do |line|
|
127
|
+
if (!headerRow) # assume the first row is a header
|
128
|
+
headerRow = line.split(delim)
|
129
|
+
|
130
|
+
headerRow.each_index do |colIndex|
|
131
|
+
columnTypes[colIndex] = nil
|
132
|
+
end
|
133
|
+
next
|
134
|
+
end
|
135
|
+
|
136
|
+
lineCount += 1
|
137
|
+
break if lineCount == 50
|
138
|
+
|
139
|
+
row = line.split(delim)
|
140
|
+
columnTypes.each_key do |colIndex|
|
141
|
+
thisColType = nil
|
142
|
+
if (row[colIndex].strip.to_i.to_s == row[colIndex])
|
143
|
+
thisColType = Integer
|
144
|
+
elsif (row[colIndex].strip.to_f.to_s == row[colIndex])
|
145
|
+
thisColType = Float
|
146
|
+
else
|
147
|
+
# fallback to the length of the string
|
148
|
+
thisColType = row[colIndex].strip.length
|
149
|
+
end
|
150
|
+
|
151
|
+
if (thisColType != columnTypes[colIndex])
|
152
|
+
if (columnTypes[colIndex] == nil)
|
153
|
+
# add new column type
|
154
|
+
columnTypes[colIndex] = thisColType
|
155
|
+
else
|
156
|
+
# type is inconsistent, remove from consideration
|
157
|
+
columnTypes[colIndex] = nil
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
end # end iterate through each row column to determine columnType
|
162
|
+
end # end iterate through each row
|
163
|
+
|
164
|
+
# finally, compare results against first row and "vote" on whether it's a header
|
165
|
+
hasHeader = 0
|
166
|
+
columnTypes.each do |colIndex, colVal|
|
167
|
+
if colVal.class == NilClass
|
168
|
+
# ignore
|
169
|
+
elsif (colVal.class != Class) # it's a length
|
170
|
+
if (headerRow[colIndex].strip.length != colVal)
|
171
|
+
hasHeader += 1
|
172
|
+
else
|
173
|
+
hasHeader -= 1
|
174
|
+
end
|
175
|
+
else
|
176
|
+
# determine the type of the header and compare it to the type in the Hash
|
177
|
+
# if the type is the same then vote down otherwise vote up
|
178
|
+
if headerRow[colIndex].strip.to_i.to_s == headerRow[colIndex]
|
179
|
+
if colVal == Integer
|
180
|
+
hasHeader -= 1
|
181
|
+
else
|
182
|
+
hasHeader += 1
|
183
|
+
end
|
184
|
+
elsif headerRow[colIndex].strip.to_f.to_s == headerRow[colIndex]
|
185
|
+
if colVal == Float
|
186
|
+
hasHeader -= 1
|
187
|
+
else
|
188
|
+
hasHeader += 1
|
189
|
+
end
|
190
|
+
end
|
191
|
+
end # end type comparison voting branch
|
192
|
+
end # end voting loop
|
193
|
+
|
194
|
+
return hasHeader > 0
|
195
|
+
end
|
100
196
|
|
101
197
|
def self.max_delim_when_others_are_zero (line)
|
102
198
|
freqOfPossibleDelims = get_freq_of_possible_delims(line)
|
data/test/test_csv_sniffer.rb
CHANGED
@@ -19,6 +19,7 @@ class CsvSnifferTest < Minitest::Test
|
|
19
19
|
@@file3 = Tempfile.new('file3')
|
20
20
|
@@file3.puts "John Doe;555-123-4567;Good\tdude"
|
21
21
|
@@file3.puts "Jane C. Doe;555-000-1234 ; Great gal"
|
22
|
+
@@file3.puts "John Smith;555-999-1234;Don't know about him"
|
22
23
|
@@file3.rewind
|
23
24
|
|
24
25
|
@@file4 = Tempfile.new('file4')
|
@@ -32,30 +33,35 @@ class CsvSnifferTest < Minitest::Test
|
|
32
33
|
@@file5.rewind
|
33
34
|
|
34
35
|
@@file6 = Tempfile.new('file6')
|
35
|
-
@@file6.puts '
|
36
|
-
@@file6.puts 'Doe,
|
36
|
+
@@file6.puts 'Name|Phone No.|Age'
|
37
|
+
@@file6.puts 'Doe, John|555-123-4567|31'
|
38
|
+
@@file6.puts 'Doe, Jane C. |555-000-1234|30'
|
37
39
|
@@file6.rewind
|
38
40
|
|
39
41
|
def test_file1
|
40
42
|
assert_equal ",", CsvSniffer.detect_delimiter(@@file1.path)
|
41
43
|
assert_equal false, CsvSniffer.is_quote_enclosed?(@@file1.path)
|
42
44
|
assert_equal nil, CsvSniffer.get_quote_char(@@file1.path)
|
45
|
+
assert_equal true, CsvSniffer.has_header?(@@file1.path)
|
43
46
|
end
|
44
47
|
|
45
48
|
def test_file2
|
46
49
|
assert_equal "|", CsvSniffer.detect_delimiter(@@file2.path)
|
47
50
|
assert_equal true, CsvSniffer.is_quote_enclosed?(@@file2.path)
|
48
51
|
assert_equal "'", CsvSniffer.get_quote_char(@@file2.path)
|
52
|
+
assert_equal true, CsvSniffer.has_header?(@@file2.path)
|
49
53
|
end
|
50
54
|
|
51
55
|
def test_file3
|
52
56
|
assert_equal ";", CsvSniffer.detect_delimiter(@@file3.path)
|
53
57
|
assert_equal false, CsvSniffer.is_quote_enclosed?(@@file3.path)
|
58
|
+
assert_equal false, CsvSniffer.has_header?(@@file3.path)
|
54
59
|
end
|
55
60
|
|
56
61
|
def test_file4
|
57
|
-
assert_equal
|
62
|
+
assert_equal '\t', CsvSniffer.detect_delimiter(@@file4.path)
|
58
63
|
assert_equal nil, CsvSniffer.get_quote_char(@@file4.path)
|
64
|
+
assert_equal false, CsvSniffer.has_header?(@@file4.path)
|
59
65
|
end
|
60
66
|
|
61
67
|
def test_file5
|
@@ -66,6 +72,7 @@ class CsvSnifferTest < Minitest::Test
|
|
66
72
|
|
67
73
|
def test_file6
|
68
74
|
assert_equal "|", CsvSniffer.detect_delimiter(@@file6.path)
|
75
|
+
assert_equal true, CsvSniffer.has_header?(@@file6.path)
|
69
76
|
end
|
70
77
|
|
71
78
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv_sniffer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Ojo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-10-
|
11
|
+
date: 2015-10-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: test-unit
|