csv_sniffer 0.0.2 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b9e303592691083e6ac15f46740577447dfeea43
4
- data.tar.gz: 0d7c5ab5c866d2b16d5591a20db8ff62ac0abeb3
3
+ metadata.gz: e14c6e37b6559305010247b4ba71c6d1b87d4f5a
4
+ data.tar.gz: 909f073414ada386352ae1a7462d4b227681551f
5
5
  SHA512:
6
- metadata.gz: c7bd0b58ee2ae274f149e0212fb9f8434b2a240304b0b37f1237d815ff573a2ae14459cb27f573ce39622d88680460a8c4a859dda94e7297556eca72fc510608
7
- data.tar.gz: caa05cdbb7763a1e72b85468f2720f10b2354ba7b97c7b6fb05582807b1f3f0bcdf9f078a8888bdfb811f30022904307f98989e252d059f51e8c75206dcb886e
6
+ metadata.gz: 045d74a5dc4c0a134e157f2b0c18bd713c5e85cfcdb696c77b74b0d33f78489db9d172936d25979bf1709a2f45dd662a32469df7da7059eefc5d511d26388dce
7
+ data.tar.gz: 01f3db3f4953cbd489b4ef90f0c5771e4253da659dc9bb398e4fc2d1a9d421f560f03bb88dd35e6bbf30b3f23e92ec34994ed0563bd6bb891b4ab2454dda4645
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'csv_sniffer'
3
- s.version = '0.0.2'
4
- s.date = '2015-10-10'
3
+ s.version = '0.1.0'
4
+ s.date = '2015-10-15'
5
5
  s.summary = "CSV library for heuristic detection of CSV properties"
6
6
  s.description = "CSV Sniffer is intended to provide utilities that will allow a user detect the delimiter character in use, whether the values in the CSV file are quote enclosed, whether the file contains a header, and more. The library is intended to detect information to be used as configuration inputs for CSV parsers."
7
7
  s.authors = ["Tim Ojo"]
@@ -89,14 +89,110 @@ class CsvSniffer
89
89
 
90
90
  # Favor "\t" and "|" over ","
91
91
  if (maxFreq == freqOfPossibleDelims[1])
92
- return "\t"
92
+ return '\t'
93
93
  elsif (maxFreq == freqOfPossibleDelims[3])
94
94
  return "|"
95
95
  else
96
- return [",", "\t", ";", "|"][maxFreqIndex]
96
+ return [",", '\t', ";", "|"][maxFreqIndex]
97
97
  end
98
98
  end
99
99
 
# Heuristically detects whether or not the csv file uses the first line as a header
#
# Example:
#   CsvSniffer.has_header?("path/to/file")
#   => false
#
# Arguments:
#   filepath: (String)
#
# Returns:
#   true when more columns vote "the first row looks different from the data"
#   than vote "the first row looks like the data"; false otherwise (including
#   for an empty file or when no column yields a usable vote).
#
# How it works:
#   Every column of the sampled data rows is classified as Integer, Float, or
#   (fallback) the length of its stripped text. A column whose classification
#   is the same on every sampled row takes part in the vote: if the first row's
#   cell matches that classification the column votes against a header,
#   otherwise it votes for one. Columns with inconsistent data are dropped.
def self.has_header?(filepath)
  delim = detect_delimiter(filepath)
  # detect_delimiter reports a tab as the two characters backslash + t;
  # translate that into a real tab before splitting lines.
  delim = "\t" if delim == '\t'

  # Classifies one cell: Integer / Float when the stripped text round-trips
  # through to_i / to_f, otherwise the stripped length. Stripping first means
  # padding and the trailing newline on the last column cannot hide a number.
  classify = lambda do |raw|
    cell = raw.strip
    if cell.to_i.to_s == cell
      Integer
    elsif cell.to_f.to_s == cell
      Float
    else
      cell.length
    end
  end

  header_row = nil
  data_rows_seen = 0
  column_types = {} # column index => nil (no data yet) | Integer | Float | length

  File.foreach(filepath) do |line|
    unless header_row # assume the first row is a header
      header_row = line.split(delim)
      header_row.each_index { |col| column_types[col] = nil }
      next
    end

    next if line.strip.empty? # blank lines carry no type information

    data_rows_seen += 1
    break if data_rows_seen == 50 # only a sample of the file is inspected

    row = line.split(delim)
    # Iterate over a copy of the keys so columns can be dropped while looping.
    column_types.keys.each do |col|
      cell = row[col]
      next if cell.nil? # row is shorter than the header; nothing to learn

      cell_type = classify.call(cell)
      if column_types[col].nil?
        column_types[col] = cell_type
      elsif column_types[col] != cell_type
        # Inconsistent column: remove it for good so that a later row cannot
        # bring it back into consideration.
        column_types.delete(col)
      end
    end
  end

  # Finally, compare the surviving columns against the first row and vote.
  votes = 0
  column_types.each do |col, col_type|
    next if col_type.nil? # no data row ever supplied this column

    header_cell = header_row[col]
    looks_like_data =
      if col_type.is_a?(Class)
        # Typed column: a header cell of the same type looks like data; any
        # other header cell (including plain text) counts as a label.
        classify.call(header_cell) == col_type
      else
        # Length column: compare the stripped lengths.
        header_cell.strip.length == col_type
      end
    votes += looks_like_data ? -1 : 1
  end

  votes > 0
end
100
196
 
101
197
  def self.max_delim_when_others_are_zero (line)
102
198
  freqOfPossibleDelims = get_freq_of_possible_delims(line)
@@ -19,6 +19,7 @@ class CsvSnifferTest < Minitest::Test
19
19
  @@file3 = Tempfile.new('file3')
20
20
  @@file3.puts "John Doe;555-123-4567;Good\tdude"
21
21
  @@file3.puts "Jane C. Doe;555-000-1234 ; Great gal"
22
+ @@file3.puts "John Smith;555-999-1234;Don't know about him"
22
23
  @@file3.rewind
23
24
 
24
25
  @@file4 = Tempfile.new('file4')
@@ -32,30 +33,35 @@ class CsvSnifferTest < Minitest::Test
32
33
  @@file5.rewind
33
34
 
34
35
  @@file6 = Tempfile.new('file6')
35
- @@file6.puts 'Doe, John|555-123-4567'
36
- @@file6.puts 'Doe, Jane C. |555-000-1234'
36
+ @@file6.puts 'Name|Phone No.|Age'
37
+ @@file6.puts 'Doe, John|555-123-4567|31'
38
+ @@file6.puts 'Doe, Jane C. |555-000-1234|30'
37
39
  @@file6.rewind
38
40
 
# Checks the sniffer's verdicts for the @@file1 fixture: comma delimiter,
# values not quote enclosed, no quote character, and a header row present.
def test_file1
  path = @@file1.path
  assert_equal ",", CsvSniffer.detect_delimiter(path)
  assert_equal false, CsvSniffer.is_quote_enclosed?(path)
  # assert_nil instead of assert_equal nil: Minitest deprecates the latter.
  assert_nil CsvSniffer.get_quote_char(path)
  assert_equal true, CsvSniffer.has_header?(path)
end
44
47
 
# Checks the sniffer's verdicts for the @@file2 fixture: pipe delimiter,
# values quote enclosed with a single quote, and a header row present.
def test_file2
  fixture = @@file2.path
  assert_equal("|", CsvSniffer.detect_delimiter(fixture))
  assert_equal(true, CsvSniffer.is_quote_enclosed?(fixture))
  assert_equal("'", CsvSniffer.get_quote_char(fixture))
  assert_equal(true, CsvSniffer.has_header?(fixture))
end
50
54
 
# Checks the sniffer's verdicts for the @@file3 fixture: semicolon delimiter,
# values not quote enclosed, and no header row.
def test_file3
  fixture = @@file3.path
  assert_equal(";", CsvSniffer.detect_delimiter(fixture))
  assert_equal(false, CsvSniffer.is_quote_enclosed?(fixture))
  assert_equal(false, CsvSniffer.has_header?(fixture))
end
55
60
 
# Checks the sniffer's verdicts for the @@file4 fixture: tab delimiter
# (reported as the two characters backslash + t), no quote character, and
# no header row.
def test_file4
  path = @@file4.path
  assert_equal '\t', CsvSniffer.detect_delimiter(path)
  # assert_nil instead of assert_equal nil: Minitest deprecates the latter.
  assert_nil CsvSniffer.get_quote_char(path)
  assert_equal false, CsvSniffer.has_header?(path)
end
60
66
 
61
67
  def test_file5
@@ -66,6 +72,7 @@ class CsvSnifferTest < Minitest::Test
66
72
 
# Checks the sniffer's verdicts for the @@file6 fixture: pipe delimiter and
# a header row present.
def test_file6
  fixture = @@file6.path
  assert_equal("|", CsvSniffer.detect_delimiter(fixture))
  assert_equal(true, CsvSniffer.has_header?(fixture))
end
70
77
 
71
78
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv_sniffer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Ojo
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-10 00:00:00.000000000 Z
11
+ date: 2015-10-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: test-unit