csv_sniffer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0c4ec9e742582c39e7ee5b57d4dce731292bc0e0
4
+ data.tar.gz: 3aaa46390b01a030b317724be479da6724a2612c
5
+ SHA512:
6
+ metadata.gz: 976a4029144696e77fdb7904a9be48270b10ca4e4b03b99c26a99875c0845a4032e5cf587ce32dddf8b1332789fd5af0062c95a73b6e10c34a404be262311b79
7
+ data.tar.gz: 232fb8c1acacbf5932962e0429bf0f287ec510f96e7d7f731b613662155a636dd89b3f87a5d1659aea8e0d760a648c6fd1a33563a0486fc83e240b72db005620
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Tim Ojo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,65 @@
1
+ # CSV Sniffer
2
+
3
+ CSV Sniffer is intended to provide utilities that allow a user to heuristically detect the delimiter character in use, whether the values in the CSV file are quote enclosed, whether the file contains a header, and more. The library is intended to detect information to be used as configuration inputs for CSV parsers. For delimiter detection the following delimiters are currently supported: `[",", "\t", "|", ";"]`.
4
+
5
+ To ensure high performance and a low memory footprint, the library uses as little information as needed to make accurate decisions. Contributors are welcome to
6
+ improve the algorithms in use.
7
+
8
+
9
+ ## Installation
10
+
11
+ ```
12
+ $ gem install csv_sniffer
13
+ ```
14
+
15
+ ## Usage
16
+
17
+ Given a `some_data.csv` file:
18
+
19
+ ```csv
20
+ Name;Phone
21
+ John Doe ;555-481-2345
22
+ Jane C. Doe;555-123-4567
23
+ ```
24
+
25
+ Detection usage is as follows:
26
+
27
+ ```rb
28
+ require "csv_sniffer"
29
+
30
+ delim = CsvSniffer.detect_delimiter("/path/to/some_data.csv") #=> ";"
31
+ is_quote_enclosed = CsvSniffer.is_quote_enclosed?("/path/to/some_data.csv") #=> false
32
+ ```
33
+
34
+ See [`test/test_csv_sniffer.rb`](test/test_csv_sniffer.rb) for more examples.
35
+
36
+
37
+ ## Tests
38
+
39
+ ```
40
+ $ rake test
41
+ ```
42
+
43
+ ## License
44
+
45
+ The MIT License (MIT)
46
+
47
+ Copyright © 2015 Tim Ojo
48
+
49
+ Permission is hereby granted, free of charge, to any person obtaining a copy
50
+ of this software and associated documentation files (the "Software"), to deal
51
+ in the Software without restriction, including without limitation the rights
52
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
53
+ copies of the Software, and to permit persons to whom the Software is
54
+ furnished to do so, subject to the following conditions:
55
+
56
+ The above copyright notice and this permission notice shall be included in
57
+ all copies or substantial portions of the Software.
58
+
59
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
60
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
61
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
62
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
63
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
64
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
65
+ THE SOFTWARE.
@@ -0,0 +1,8 @@
1
+ require 'rake/testtask'
2
+
3
# Define the :test task, adding the test/ directory to the load path used by the tests.
Rake::TestTask.new { |test_task| test_task.libs.push('test') }

# Running plain `rake` runs the test task.
desc "Run tests"
task default: :test
@@ -0,0 +1,15 @@
1
# Gem specification for csv_sniffer.
Gem::Specification.new do |spec|
  spec.name        = 'csv_sniffer'
  spec.version     = '0.0.1'
  spec.date        = '2015-10-09'
  spec.summary     = "CSV library for heuristic detection of CSV properties"
  spec.description = "CSV Sniffer is intended to provide utilities that will allow a user detect the delimiter character in use, whether the values in the CSV file are quote enclosed, whether the file contains a header, and more. The library is intended to detect information to be used as configuration inputs for CSV parsers."
  spec.authors     = ["Tim Ojo"]
  spec.email       = 'ojo.tim@gmail.com'
  spec.homepage    = 'https://github.com/tim-ojo/csv_sniffer'
  spec.license     = 'MIT'

  # Package every file tracked by git. `git ls-files` prints one path per line, so split
  # on "\n" explicitly rather than on the mutable global record separator ($/).
  spec.files       = `git ls-files`.split("\n")
  spec.test_files  = spec.files.grep(/^test/)

  # The test suite is written with Minitest and is run through the Rakefile's test task,
  # so those are the tools a contributor needs installed.
  spec.add_development_dependency "minitest"
  spec.add_development_dependency "rake"
end
@@ -0,0 +1,106 @@
1
# Heuristically deciphers formatting details of a CSV file -- whether the values are quote
# enclosed, which quote character is used, and which delimiter separates the columns --
# by reading as few lines of the file as possible.
class CsvSniffer
  # Delimiters that can be detected. The position of an entry here is also the position
  # of its count in the array built by get_freq_of_possible_delims.
  DELIMITERS = [",", "\t", ";", "|"].map(&:freeze).freeze
  TAB_INDEX = DELIMITERS.index("\t")
  PIPE_INDEX = DELIMITERS.index("|")

  # Maximum number of lines examined while looking for a line that contains only one
  # kind of delimiter.
  MAX_LINES_TO_SCAN = 50

  # Returns true if the first line of the file, ignoring surrounding whitespace, starts
  # and ends with " or starts and ends with '; false otherwise.
  # Raises EOFError if the file is empty.
  def self.is_quote_enclosed?(filepath)
    quote_enclosed_line?(first_line(filepath).strip)
  end

  # Returns the quote character (" or ') that encloses the first line of the file, or
  # nil if the first line is not quote enclosed.
  # Raises EOFError if the file is empty.
  def self.get_quote_char(filepath)
    line = first_line(filepath).strip
    quote_enclosed_line?(line) ? line[0] : nil
  end

  # Returns the delimiter ("," "\t" ";" or "|") the file most likely uses.
  #
  # If the first line is quote enclosed, the delimiter is the character following the
  # first quoted cell. Otherwise (or if no such character is found) up to
  # MAX_LINES_TO_SCAN lines are examined and the first line on which at most one kind of
  # delimiter occurs decides the result; a line with no delimiter at all yields ",".
  # If no examined line satisfies that condition, the delimiter occurring most often on
  # the first line wins, with "\t" and then "|" favored over the others on a tie.
  # Raises EOFError if the file is empty.
  def self.detect_delimiter(filepath)
    raw_first_line = first_line(filepath)
    trimmed = raw_first_line.strip

    if quote_enclosed_line?(trimmed)
      match = /["'].+?["']([,|;\t])/.match(trimmed)
      return match[1] if match
    end

    File.foreach(filepath).with_index do |line, index|
      break if index >= MAX_LINES_TO_SCAN

      delimiter = max_delim_when_others_are_zero(line)
      return delimiter if delimiter
    end

    # The untrimmed line is used on purpose: stripping would drop leading or trailing
    # tabs, which count as delimiters.
    most_frequent_delimiter(raw_first_line)
  end

  # Reads and returns the first line of the file, including its line terminator.
  # Raises EOFError if the file is empty.
  def self.first_line(filepath)
    File.open(filepath, &:readline)
  end

  # Returns true if the given (already trimmed) line starts and ends with " or starts
  # and ends with '.
  def self.quote_enclosed_line?(line)
    (line.start_with?('"') && line.end_with?('"')) ||
      (line.start_with?("'") && line.end_with?("'"))
  end

  # Returns the most frequent delimiter on the line when at least three of the four
  # candidate delimiters do not occur on it (a line without any delimiter yields ",").
  # Returns nil when two or more kinds of delimiter occur, i.e. the line is ambiguous.
  def self.max_delim_when_others_are_zero(line)
    freqs = get_freq_of_possible_delims(line)
    return nil if freqs.count(0) < 3

    # dup so callers receive their own mutable string rather than the shared constant.
    DELIMITERS[freqs.index(freqs.max)].dup
  end

  # Returns the delimiter occurring most often on the line. When the tab or the pipe
  # count equals the highest count, "\t" or "|" (in that order) is returned in
  # preference to the other delimiters.
  def self.most_frequent_delimiter(line)
    freqs = get_freq_of_possible_delims(line)
    highest = freqs.max

    return "\t" if freqs[TAB_INDEX] == highest
    return "|" if freqs[PIPE_INDEX] == highest

    DELIMITERS[freqs.index(highest)].dup
  end

  # Returns how many times each candidate delimiter occurs on the line, in DELIMITERS
  # order: [0 = ","] [1 = "\t"] [2 = ";"] [3 = "|"]
  def self.get_freq_of_possible_delims(line)
    DELIMITERS.map { |delimiter| line.count(delimiter) }
  end

  private_class_method :first_line
  private_class_method :quote_enclosed_line?
  private_class_method :max_delim_when_others_are_zero
  private_class_method :most_frequent_delimiter
  private_class_method :get_freq_of_possible_delims
end
@@ -0,0 +1,71 @@
1
+ require 'minitest/autorun'
2
+ require 'tempfile'
3
+ require 'csv_sniffer'
4
+
5
# Exercises CsvSniffer's delimiter and quote detection against small CSV fixtures that
# are written to temporary files.
class CsvSnifferTest < Minitest::Test
  # Writes the given lines to a new Tempfile and returns the Tempfile object. The object
  # is kept in a constant by the callers below so the file stays referenced for the
  # whole test run.
  def self.csv_fixture(basename, *lines)
    file = Tempfile.new(basename)
    lines.each { |line| file.puts line }
    file.flush # make buffered writes visible to readers that open the file by path
    file
  end
  private_class_method :csv_fixture

  # Plain comma separated values.
  FILE1 = csv_fixture('file1',
                      "Name,Number",
                      "John Doe,555-123-4567",
                      "Jane C. Doe,555-000-1234")

  # Single-quote enclosed, pipe delimited; the header also carries a stray trailing tab.
  FILE2 = csv_fixture('file2',
                      "'Name' |'Number'\t",
                      "'John Doe'|'555-123-4567'",
                      "'Jane C. Doe'|'555-000-1234'")

  # Semicolon delimited with a tab inside a value on the first line.
  FILE3 = csv_fixture('file3',
                      "John Doe;555-123-4567;Good\tdude",
                      "Jane C. Doe;555-000-1234 ; Great gal")

  # Tab delimited with a comma inside a value on the first line.
  FILE4 = csv_fixture('file4',
                      "Doe, John\t555-123-4567",
                      "Jane C. Doe\t555-000-1234\t")

  # Double-quote enclosed, pipe delimited, with commas inside the first cell.
  FILE5 = csv_fixture('file5',
                      '"Doe,,,,,, John"|"555-123-4567"',
                      '"Jane C. Doe"|"555-000-1234\t"')

  # Pipe delimited with a comma on every line, so no single line is unambiguous.
  FILE6 = csv_fixture('file6',
                      'Doe, John|555-123-4567',
                      'Doe, Jane C. |555-000-1234')

  def test_file1
    assert_equal ",", CsvSniffer.detect_delimiter(FILE1.path)
    assert_equal false, CsvSniffer.is_quote_enclosed?(FILE1.path)
    assert_nil CsvSniffer.get_quote_char(FILE1.path)
  end

  def test_file2
    assert_equal "|", CsvSniffer.detect_delimiter(FILE2.path)
    assert_equal true, CsvSniffer.is_quote_enclosed?(FILE2.path)
    assert_equal "'", CsvSniffer.get_quote_char(FILE2.path)
  end

  def test_file3
    assert_equal ";", CsvSniffer.detect_delimiter(FILE3.path)
    assert_equal false, CsvSniffer.is_quote_enclosed?(FILE3.path)
  end

  def test_file4
    # A tab-delimited file must be reported as a real tab character (the value a CSV
    # parser expects as its column separator), not as a backslash followed by "t".
    assert_equal "\t", CsvSniffer.detect_delimiter(FILE4.path)
    assert_nil CsvSniffer.get_quote_char(FILE4.path)
  end

  def test_file5
    assert_equal "|", CsvSniffer.detect_delimiter(FILE5.path)
    assert_equal true, CsvSniffer.is_quote_enclosed?(FILE5.path)
    assert_equal '"', CsvSniffer.get_quote_char(FILE5.path)
  end

  def test_file6
    assert_equal "|", CsvSniffer.detect_delimiter(FILE6.path)
  end
end
metadata ADDED
@@ -0,0 +1,67 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: csv_sniffer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Tim Ojo
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-10-09 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: test-unit
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ description: CSV Sniffer is intended to provide utilities that will allow a user detect
28
+ the delimiter character in use, whether the values in the CSV file are quote enclosed,
29
+ whether the file contains a header, and more. The library is intended to detect
30
+ information to be used as configuration inputs for CSV parsers.
31
+ email: ojo.tim@gmail.com
32
+ executables: []
33
+ extensions: []
34
+ extra_rdoc_files: []
35
+ files:
36
+ - LICENSE
37
+ - README.md
38
+ - Rakefile
39
+ - csv_sniffer.gemspec
40
+ - lib/csv_sniffer.rb
41
+ - test/test_csv_sniffer.rb
42
+ homepage: https://github.com/tim-ojo/csv_sniffer
43
+ licenses:
44
+ - MIT
45
+ metadata: {}
46
+ post_install_message:
47
+ rdoc_options: []
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ required_rubygems_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: '0'
60
+ requirements: []
61
+ rubyforge_project:
62
+ rubygems_version: 2.4.7
63
+ signing_key:
64
+ specification_version: 4
65
+ summary: CSV library for heuristic detection of CSV properties
66
+ test_files:
67
+ - test/test_csv_sniffer.rb