csv_sniffer 0.0.2 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b9e303592691083e6ac15f46740577447dfeea43
4
- data.tar.gz: 0d7c5ab5c866d2b16d5591a20db8ff62ac0abeb3
3
+ metadata.gz: e14c6e37b6559305010247b4ba71c6d1b87d4f5a
4
+ data.tar.gz: 909f073414ada386352ae1a7462d4b227681551f
5
5
  SHA512:
6
- metadata.gz: c7bd0b58ee2ae274f149e0212fb9f8434b2a240304b0b37f1237d815ff573a2ae14459cb27f573ce39622d88680460a8c4a859dda94e7297556eca72fc510608
7
- data.tar.gz: caa05cdbb7763a1e72b85468f2720f10b2354ba7b97c7b6fb05582807b1f3f0bcdf9f078a8888bdfb811f30022904307f98989e252d059f51e8c75206dcb886e
6
+ metadata.gz: 045d74a5dc4c0a134e157f2b0c18bd713c5e85cfcdb696c77b74b0d33f78489db9d172936d25979bf1709a2f45dd662a32469df7da7059eefc5d511d26388dce
7
+ data.tar.gz: 01f3db3f4953cbd489b4ef90f0c5771e4253da659dc9bb398e4fc2d1a9d421f560f03bb88dd35e6bbf30b3f23e92ec34994ed0563bd6bb891b4ab2454dda4645
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'csv_sniffer'
3
- s.version = '0.0.2'
4
- s.date = '2015-10-10'
3
+ s.version = '0.1.0'
4
+ s.date = '2015-10-15'
5
5
  s.summary = "CSV library for heuristic detection of CSV properties"
6
6
  s.description = "CSV Sniffer is intended to provide utilities that will allow a user detect the delimiter character in use, whether the values in the CSV file are quote enclosed, whether the file contains a header, and more. The library is intended to detect information to be used as configuration inputs for CSV parsers."
7
7
  s.authors = ["Tim Ojo"]
@@ -89,14 +89,110 @@ class CsvSniffer
89
89
 
90
90
  # Favor "\t" and "|" over ","
91
91
  if (maxFreq == freqOfPossibleDelims[1])
92
- return "\t"
92
+ return '\t'
93
93
  elsif (maxFreq == freqOfPossibleDelims[3])
94
94
  return "|"
95
95
  else
96
- return [",", "\t", ";", "|"][maxFreqIndex]
96
+ return [",", '\t', ";", "|"][maxFreqIndex]
97
97
  end
98
98
  end
99
99
 
100
# Heuristically detects whether or not the csv file uses the first line as a header.
#
# Example:
#   CsvSniffer.has_header?("path/to/file")
#   => false
#
# Arguments:
#   filepath: (String) path of the delimited text file to inspect
#
# Returns:
#   true when the per-column votes favour the first line being a header,
#   false otherwise (including when the file has no lines at all).
def self.has_header?(filepath)
  # Builds a table of the kind of data found in each column. If a column
  # holds a single type (say, integers) in every row *except* the first,
  # the first row is presumed to hold labels. When a cell is not numeric it
  # is treated as a string and its length becomes the "type": if all rows
  # but the first share one length, that also points at a header. Each
  # surviving column then casts one vote for or against a header.
  delim = detect_delimiter(filepath)
  # detect_delimiter reports a tab as the two-character string '\t'
  # (backslash + t); splitting needs the real tab character.
  delim = "\t" if delim == "\\t"

  # Integer or Float (the classes) for numeric text, otherwise the length of
  # the stripped text. The comparison is made against the stripped text so
  # that padding or a trailing newline does not hide a number.
  classify = lambda do |cell|
    text = cell.strip
    if text.to_i.to_s == text
      Integer
    elsif text.to_f.to_s == text
      Float
    else
      text.length
    end
  end

  header_row = nil
  line_count = 0
  column_types = {}
  File.foreach(filepath) do |line|
    # chomp + negative limit keeps trailing empty fields, so a final line
    # without a newline yields the same number of fields as the others.
    fields = line.chomp.split(delim, -1)

    unless header_row # assume the first row is a header
      header_row = fields
      header_row.each_index { |col| column_types[col] = nil }
      next
    end

    line_count += 1
    break if line_count == 50

    # Ragged or blank lines cannot be matched to the header columns.
    next if fields.length < header_row.length

    # Iterate over a snapshot of the keys because columns are deleted below.
    column_types.keys.each do |col|
      cell_type = classify.call(fields[col])
      if column_types[col].nil?
        # first data row seen for this column
        column_types[col] = cell_type
      elsif column_types[col] != cell_type
        # type is inconsistent: drop the column for good so that a later
        # row cannot bring it back into the vote
        column_types.delete(col)
      end
    end
  end

  # finally, compare results against the first row and "vote" on whether
  # it is a header
  votes = 0
  column_types.each do |col, col_type|
    next if col_type.nil? # no usable data row was seen for this column

    header_cell = header_row[col]
    differs =
      if col_type.is_a?(Class)
        # typed column: a header cell of any other kind (including plain
        # text) votes for a header
        classify.call(header_cell) != col_type
      else
        # length column: compare the header's length with the common length
        header_cell.strip.length != col_type
      end
    votes += differs ? 1 : -1
  end

  votes > 0
end
100
196
 
101
197
  def self.max_delim_when_others_are_zero (line)
102
198
  freqOfPossibleDelims = get_freq_of_possible_delims(line)
@@ -19,6 +19,7 @@ class CsvSnifferTest < Minitest::Test
19
19
  @@file3 = Tempfile.new('file3')
20
20
  @@file3.puts "John Doe;555-123-4567;Good\tdude"
21
21
  @@file3.puts "Jane C. Doe;555-000-1234 ; Great gal"
22
+ @@file3.puts "John Smith;555-999-1234;Don't know about him"
22
23
  @@file3.rewind
23
24
 
24
25
  @@file4 = Tempfile.new('file4')
@@ -32,30 +33,35 @@ class CsvSnifferTest < Minitest::Test
32
33
  @@file5.rewind
33
34
 
34
35
  @@file6 = Tempfile.new('file6')
35
- @@file6.puts 'Doe, John|555-123-4567'
36
- @@file6.puts 'Doe, Jane C. |555-000-1234'
36
+ @@file6.puts 'Name|Phone No.|Age'
37
+ @@file6.puts 'Doe, John|555-123-4567|31'
38
+ @@file6.puts 'Doe, Jane C. |555-000-1234|30'
37
39
  @@file6.rewind
38
40
 
39
41
  # Expects file1 to be reported as comma-delimited, not quote-enclosed, with no quote character, and with a header row.
  def test_file1
40
42
  assert_equal ",", CsvSniffer.detect_delimiter(@@file1.path)
41
43
  assert_equal false, CsvSniffer.is_quote_enclosed?(@@file1.path)
42
44
  assert_equal nil, CsvSniffer.get_quote_char(@@file1.path)
45
+ assert_equal true, CsvSniffer.has_header?(@@file1.path)
43
46
  end
44
47
 
45
48
  # Expects file2 to be reported as pipe-delimited, quote-enclosed with a single-quote character, and with a header row.
  def test_file2
46
49
  assert_equal "|", CsvSniffer.detect_delimiter(@@file2.path)
47
50
  assert_equal true, CsvSniffer.is_quote_enclosed?(@@file2.path)
48
51
  assert_equal "'", CsvSniffer.get_quote_char(@@file2.path)
52
+ assert_equal true, CsvSniffer.has_header?(@@file2.path)
49
53
  end
50
54
 
51
55
  # Expects file3 to be reported as semicolon-delimited, not quote-enclosed, and without a header row.
  def test_file3
52
56
  assert_equal ";", CsvSniffer.detect_delimiter(@@file3.path)
53
57
  assert_equal false, CsvSniffer.is_quote_enclosed?(@@file3.path)
58
+ assert_equal false, CsvSniffer.has_header?(@@file3.path)
54
59
  end
55
60
 
56
61
  # Expects file4's delimiter to be reported as the two-character string backslash + t, with no quote character and no header row.
  def test_file4
57
- assert_equal "\\t", CsvSniffer.detect_delimiter(@@file4.path)
62
+ assert_equal '\t', CsvSniffer.detect_delimiter(@@file4.path)
58
63
  assert_equal nil, CsvSniffer.get_quote_char(@@file4.path)
64
+ assert_equal false, CsvSniffer.has_header?(@@file4.path)
59
65
  end
60
66
 
61
67
  def test_file5
@@ -66,6 +72,7 @@ class CsvSnifferTest < Minitest::Test
66
72
 
67
73
  # Expects file6 to be reported as pipe-delimited and as having a header row.
  def test_file6
68
74
  assert_equal "|", CsvSniffer.detect_delimiter(@@file6.path)
75
+ assert_equal true, CsvSniffer.has_header?(@@file6.path)
69
76
  end
70
77
 
71
78
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv_sniffer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Ojo
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-10 00:00:00.000000000 Z
11
+ date: 2015-10-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: test-unit