csv_sniffer 0.0.2 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/csv_sniffer.gemspec +2 -2
- data/lib/csv_sniffer.rb +98 -2
- data/test/test_csv_sniffer.rb +10 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e14c6e37b6559305010247b4ba71c6d1b87d4f5a
|
4
|
+
data.tar.gz: 909f073414ada386352ae1a7462d4b227681551f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 045d74a5dc4c0a134e157f2b0c18bd713c5e85cfcdb696c77b74b0d33f78489db9d172936d25979bf1709a2f45dd662a32469df7da7059eefc5d511d26388dce
|
7
|
+
data.tar.gz: 01f3db3f4953cbd489b4ef90f0c5771e4253da659dc9bb398e4fc2d1a9d421f560f03bb88dd35e6bbf30b3f23e92ec34994ed0563bd6bb891b4ab2454dda4645
|
data/csv_sniffer.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'csv_sniffer'
|
3
|
-
s.version = '0.0
|
4
|
-
s.date = '2015-10-
|
3
|
+
s.version = '0.1.0'
|
4
|
+
s.date = '2015-10-15'
|
5
5
|
s.summary = "CSV library for heuristic detection of CSV properties"
|
6
6
|
s.description = "CSV Sniffer is intended to provide utilities that will allow a user detect the delimiter character in use, whether the values in the CSV file are quote enclosed, whether the file contains a header, and more. The library is intended to detect information to be used as configuration inputs for CSV parsers."
|
7
7
|
s.authors = ["Tim Ojo"]
|
data/lib/csv_sniffer.rb
CHANGED
@@ -89,14 +89,110 @@ class CsvSniffer
|
|
89
89
|
|
90
90
|
# Favor "\t" and "|" over ","
|
91
91
|
if (maxFreq == freqOfPossibleDelims[1])
|
92
|
-
return
|
92
|
+
return '\t'
|
93
93
|
elsif (maxFreq == freqOfPossibleDelims[3])
|
94
94
|
return "|"
|
95
95
|
else
|
96
|
-
return [",",
|
96
|
+
return [",", '\t', ";", "|"][maxFreqIndex]
|
97
97
|
end
|
98
98
|
end
|
99
99
|
|
100
|
+
# Heuristically detects whether or not the csv file uses the first line as a header
|
101
|
+
#
|
102
|
+
# Example:
|
103
|
+
# CsvSniffer.has_header?("path/to/file")
|
104
|
+
# => false
|
105
|
+
#
|
106
|
+
# Arguments:
|
107
|
+
# filepath: (String)
|
108
|
+
|
109
|
+
def self.has_header?(filepath)
|
110
|
+
# Creates a dictionary of types of data in each column. If any
|
111
|
+
# column is of a single type (say, integers), *except* for the first
|
112
|
+
# row, then the first row is presumed to be labels. If the type
|
113
|
+
# can't be determined, it is assumed to be a string in which case
|
114
|
+
# the length of the string is the determining factor: if all of the
|
115
|
+
# rows except for the first are the same length, it's a header.
|
116
|
+
# Finally, a 'vote' is taken at the end for each column, adding or
|
117
|
+
# subtracting from the likelihood of the first row being a header.
|
118
|
+
delim = detect_delimiter(filepath)
|
119
|
+
if (delim == "\\t")
|
120
|
+
delim = "\t"
|
121
|
+
end
|
122
|
+
|
123
|
+
headerRow = nil
|
124
|
+
lineCount = 0
|
125
|
+
columnTypes = Hash.new
|
126
|
+
File.foreach(filepath) do |line|
|
127
|
+
if (!headerRow) # assume the first row is a header
|
128
|
+
headerRow = line.split(delim)
|
129
|
+
|
130
|
+
headerRow.each_index do |colIndex|
|
131
|
+
columnTypes[colIndex] = nil
|
132
|
+
end
|
133
|
+
next
|
134
|
+
end
|
135
|
+
|
136
|
+
lineCount += 1
|
137
|
+
break if lineCount == 50
|
138
|
+
|
139
|
+
row = line.split(delim)
|
140
|
+
columnTypes.each_key do |colIndex|
|
141
|
+
thisColType = nil
|
142
|
+
if (row[colIndex].strip.to_i.to_s == row[colIndex])
|
143
|
+
thisColType = Integer
|
144
|
+
elsif (row[colIndex].strip.to_f.to_s == row[colIndex])
|
145
|
+
thisColType = Float
|
146
|
+
else
|
147
|
+
# fallback to the length of the string
|
148
|
+
thisColType = row[colIndex].strip.length
|
149
|
+
end
|
150
|
+
|
151
|
+
if (thisColType != columnTypes[colIndex])
|
152
|
+
if (columnTypes[colIndex] == nil)
|
153
|
+
# add new column type
|
154
|
+
columnTypes[colIndex] = thisColType
|
155
|
+
else
|
156
|
+
# type is inconsistent, remove from consideration
|
157
|
+
columnTypes[colIndex] = nil
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
end # end iterate through each row column to determine columnType
|
162
|
+
end # end iterate through each row
|
163
|
+
|
164
|
+
# finally, compare results against first row and "vote" on whether its a header
|
165
|
+
hasHeader = 0
|
166
|
+
columnTypes.each do |colIndex, colVal|
|
167
|
+
if colVal.class == NilClass
|
168
|
+
# ignore
|
169
|
+
elsif (colVal.class != Class) # it's a length
|
170
|
+
if (headerRow[colIndex].strip.length != colVal)
|
171
|
+
hasHeader += 1
|
172
|
+
else
|
173
|
+
hasHeader -= 1
|
174
|
+
end
|
175
|
+
else
|
176
|
+
# determine the type of the header and compare it to the type in the Hash
|
177
|
+
# if the type is the same then vote down otherwise vote up
|
178
|
+
if headerRow[colIndex].strip.to_i.to_s == headerRow[colIndex]
|
179
|
+
if colVal == Integer
|
180
|
+
hasHeader -= 1
|
181
|
+
else
|
182
|
+
hasHeader += 1
|
183
|
+
end
|
184
|
+
elsif headerRow[colIndex].strip.to_f.to_s == headerRow[colIndex]
|
185
|
+
if colVal == Float
|
186
|
+
hasHeader -= 1
|
187
|
+
else
|
188
|
+
hasHeader += 1
|
189
|
+
end
|
190
|
+
end
|
191
|
+
end # end type comparison voting branch
|
192
|
+
end # end voting loop
|
193
|
+
|
194
|
+
return hasHeader > 0
|
195
|
+
end
|
100
196
|
|
101
197
|
def self.max_delim_when_others_are_zero (line)
|
102
198
|
freqOfPossibleDelims = get_freq_of_possible_delims(line)
|
data/test/test_csv_sniffer.rb
CHANGED
@@ -19,6 +19,7 @@ class CsvSnifferTest < Minitest::Test
|
|
19
19
|
@@file3 = Tempfile.new('file3')
|
20
20
|
@@file3.puts "John Doe;555-123-4567;Good\tdude"
|
21
21
|
@@file3.puts "Jane C. Doe;555-000-1234 ; Great gal"
|
22
|
+
@@file3.puts "John Smith;555-999-1234;Don't know about him"
|
22
23
|
@@file3.rewind
|
23
24
|
|
24
25
|
@@file4 = Tempfile.new('file4')
|
@@ -32,30 +33,35 @@ class CsvSnifferTest < Minitest::Test
|
|
32
33
|
@@file5.rewind
|
33
34
|
|
34
35
|
@@file6 = Tempfile.new('file6')
|
35
|
-
@@file6.puts '
|
36
|
-
@@file6.puts 'Doe,
|
36
|
+
@@file6.puts 'Name|Phone No.|Age'
|
37
|
+
@@file6.puts 'Doe, John|555-123-4567|31'
|
38
|
+
@@file6.puts 'Doe, Jane C. |555-000-1234|30'
|
37
39
|
@@file6.rewind
|
38
40
|
|
39
41
|
def test_file1
|
40
42
|
assert_equal ",", CsvSniffer.detect_delimiter(@@file1.path)
|
41
43
|
assert_equal false, CsvSniffer.is_quote_enclosed?(@@file1.path)
|
42
44
|
assert_equal nil, CsvSniffer.get_quote_char(@@file1.path)
|
45
|
+
assert_equal true, CsvSniffer.has_header?(@@file1.path)
|
43
46
|
end
|
44
47
|
|
45
48
|
def test_file2
|
46
49
|
assert_equal "|", CsvSniffer.detect_delimiter(@@file2.path)
|
47
50
|
assert_equal true, CsvSniffer.is_quote_enclosed?(@@file2.path)
|
48
51
|
assert_equal "'", CsvSniffer.get_quote_char(@@file2.path)
|
52
|
+
assert_equal true, CsvSniffer.has_header?(@@file2.path)
|
49
53
|
end
|
50
54
|
|
51
55
|
def test_file3
|
52
56
|
assert_equal ";", CsvSniffer.detect_delimiter(@@file3.path)
|
53
57
|
assert_equal false, CsvSniffer.is_quote_enclosed?(@@file3.path)
|
58
|
+
assert_equal false, CsvSniffer.has_header?(@@file3.path)
|
54
59
|
end
|
55
60
|
|
56
61
|
def test_file4
|
57
|
-
assert_equal
|
62
|
+
assert_equal '\t', CsvSniffer.detect_delimiter(@@file4.path)
|
58
63
|
assert_equal nil, CsvSniffer.get_quote_char(@@file4.path)
|
64
|
+
assert_equal false, CsvSniffer.has_header?(@@file4.path)
|
59
65
|
end
|
60
66
|
|
61
67
|
def test_file5
|
@@ -66,6 +72,7 @@ class CsvSnifferTest < Minitest::Test
|
|
66
72
|
|
67
73
|
def test_file6
|
68
74
|
assert_equal "|", CsvSniffer.detect_delimiter(@@file6.path)
|
75
|
+
assert_equal true, CsvSniffer.has_header?(@@file6.path)
|
69
76
|
end
|
70
77
|
|
71
78
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv_sniffer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Ojo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-10-
|
11
|
+
date: 2015-10-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: test-unit
|