csv-format-guesser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 245c4703f769967b17663c5ed17d3925b8a17039
4
+ data.tar.gz: a298b9967bde06fa11528b5c933a537ef45fc36f
5
+ SHA512:
6
+ metadata.gz: aa55694a8ff24aa4fe79664c4fc98f44dbe78a58de1e299716211acc4b0d28ff72c62af860e43ccff0bfd2fbbcf24a87932654f7115a936410072c06a74e388d
7
+ data.tar.gz: 37ecd48383122a1ae088530a60ab8390758ed0e3030c0d70971628c59023da640b39f70d22023472c699048fcaf2f7f077148676bd9b10e0cc526f989cccf246
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ coverage
6
+ InstalledFiles
7
+ lib/bundler/man
8
+ pkg
9
+ rdoc
10
+ spec/reports
11
+ test/tmp
12
+ test/version_tmp
13
+ tmp
14
+
15
+ # YARD artifacts
16
+ .yardoc
17
+ _yardoc
18
+ doc/
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in csv-format-guesser.gemspec
4
+ gemspec
5
+ gem 'edouard-rchardet', git: 'https://github.com/kirillrdy/rchardet', require: 'rchardet'
@@ -0,0 +1,34 @@
1
+ GIT
2
+ remote: https://github.com/kirillrdy/rchardet
3
+ revision: f0e9f35c199b15b2c9539b3c0e883bb8fb993e84
4
+ specs:
5
+ edouard-rchardet (1.3.3)
6
+
7
+ PATH
8
+ remote: .
9
+ specs:
10
+ csv-format-guesser (0.0.1)
11
+
12
+ GEM
13
+ remote: https://rubygems.org/
14
+ specs:
15
+ diff-lcs (1.2.5)
16
+ rake (10.1.0)
17
+ rspec (2.14.1)
18
+ rspec-core (~> 2.14.0)
19
+ rspec-expectations (~> 2.14.0)
20
+ rspec-mocks (~> 2.14.0)
21
+ rspec-core (2.14.7)
22
+ rspec-expectations (2.14.4)
23
+ diff-lcs (>= 1.1.3, < 2.0)
24
+ rspec-mocks (2.14.4)
25
+
26
+ PLATFORMS
27
+ ruby
28
+
29
+ DEPENDENCIES
30
+ bundler (~> 1.3)
31
+ csv-format-guesser!
32
+ edouard-rchardet!
33
+ rake
34
+ rspec (~> 2.14)
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2013 olgen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
6
+ this software and associated documentation files (the "Software"), to deal in
7
+ the Software without restriction, including without limitation the rights to
8
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9
+ the Software, and to permit persons to whom the Software is furnished to do so,
10
+ subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Eugen Martin
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # CsvFormatGuesser
2
+
3
+ Guess format and encoding of .csv/.tsv files to generate options compatible with ruby CSV class.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'csv-format-guesser'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install csv-format-guesser
18
+
19
+ ## Usage
20
+
21
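+ Create a guesser for a file with `guesser = CsvFormatGuesser.new('data.csv')` and pass the guessed options to Ruby's CSV class, e.g. `CSV.foreach('data.csv', **guesser.csv_opts) { |row| ... }`; the individual guesses are also available as `guesser.encoding`, `guesser.col_sep` and `guesser.quote_char`.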
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,23 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "csv-format-guesser"
7
+ spec.version = '0.0.1'
8
+ spec.authors = ["Eugen Martin"]
9
+ spec.email = ["eugenius martinus ad gmail"]
10
+ spec.description = %q{Guess format and encoding of .csv/.tsv files to generate options compatible with ruby CSV class. Works with ruby2.0 }
11
+ spec.summary = %q{CSV Format guesser for ruby. Uses rchardet.}
12
+ spec.homepage = ""
13
+ spec.license = "MIT"
14
+
15
+ spec.files = `git ls-files`.split($/)
16
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
+ spec.require_paths = ["lib"]
19
+
20
+ spec.add_development_dependency "rake"
21
+ spec.add_development_dependency "bundler", "~> 1.3"
22
+ spec.add_development_dependency "rspec", "~> 2.14"
23
+ end
@@ -0,0 +1,123 @@
1
+ require 'rchardet'
2
# Guesses the character encoding, column separator and quote character of a
# delimited text file (.csv/.tsv) so the result can be handed to Ruby's CSV.
#
# All guessing happens in the constructor. The results are available through
# the readers and, bundled as a hash, through #csv_opts.
class CsvFormatGuesser
  attr_reader :encoding, :col_sep, :quote_char

  VERSION = '0.0.1'
  # Maximum number of leading lines inspected when looking for the header row.
  PREVIEW_LINES = 100
  # Number of leading bytes handed to the charset detector.
  PREVIEW_BYTES = 10 * 1024
  # A separator candidate is any character that is neither a word character
  # nor a space.
  POTENTIAL_COL_SEP_REGEX = /[^\w ]/i
  # Quote characters that are tried, in order of preference.
  COMMON_QUOTE_CHARS = ['"', '\'', '|'].freeze

  # @param path [String] path of the file to inspect
  # @raise [SystemCallError] if the file cannot be read while the encoding
  #   is being guessed (e.g. it does not exist)
  def initialize(path)
    @path = path
    guess_encoding
    guess_col_sep
    guess_quote_char
  end

  # @return [Hash] the guessed values under the keys :encoding, :col_sep and
  #   :quote_char
  def csv_opts
    {
      encoding: @encoding,
      col_sep: @col_sep,
      quote_char: @quote_char
    }
  end

  protected

  # Asks CharDet for the encoding of the first PREVIEW_BYTES bytes and falls
  # back to 'UTF-8' when nothing is detected. An empty file is not passed to
  # the detector at all.
  #
  # When the detector answers 'ISO-8859-7' but the file holds bytes that
  # cannot be converted from that encoding, 'ISO-8859-1' is used instead.
  # Any other encoding error leaves the detected name untouched.
  def guess_encoding
    sample = File.read(@path, PREVIEW_BYTES)
    detected = CharDet.detect(sample) if sample && !sample.empty?
    @encoding = detected['encoding'] if detected
    @encoding ||= 'UTF-8'
    try_encoding_with_fallback!
  rescue Encoding::UndefinedConversionError
    @encoding = 'ISO-8859-1' if @encoding == 'ISO-8859-7'
  rescue EncodingError
    # Best effort: keep the detected name. The separator and quote guessers
    # have their own fallbacks if the file cannot be decoded later on.
  end

  # Reads the whole file while converting it from the guessed encoding to
  # UTF-8. The conversion is what makes unconvertible bytes raise; opening
  # the file with an external encoding only would never fail.
  def try_encoding_with_fallback!
    File.open(@path, "r:#{@encoding}:utf-8") do |file|
      file.each_line { |_line| }
    end
  end

  # We assume that the separator is a non-alphanumeric character and that it
  # is the most frequent such character in the header row. Falls back to ','
  # when no header row is found or the file cannot be decoded.
  def guess_col_sep
    header = find_header
    if header
      # The line terminator is stripped so that "\r" and "\n" never compete
      # with the real separator.
      counts = header.chomp.scan(POTENTIAL_COL_SEP_REGEX)
                     .each_with_object(Hash.new(0)) { |char, tally| tally[char] += 1 }
      # max_by keeps the first maximum, so ties are resolved in favour of
      # the character that appears first in the header.
      @most_appearing = counts.max_by { |_char, count| count }
      @col_sep = @most_appearing.first if @most_appearing
    end
    @col_sep ||= ','
  rescue StandardError
    @col_sep ||= ','
  end

  # @return [String, nil] the first preview line that contains at least one
  #   separator candidate (ignoring its line terminator), or nil
  def find_header
    preview_lines.find { |line| line.chomp =~ POTENTIAL_COL_SEP_REGEX }
  end

  # Scans the file line by line until a quote character is recognised. When
  # none is found, a common quote character that does not occur in the file
  # is chosen. Falls back to "'" when the file cannot be decoded.
  def guess_quote_char
    @used_quote_chars = []
    readlines do |line|
      @quote_char = search_quote_char(line)
      break if @quote_char
    end
    @quote_char ||= select_unused_quote_char
  rescue StandardError
    @quote_char ||= "'"
  end

  # Looks for a common quote character that wraps whole fields of +line+,
  # i.e. that opens right after a separator as often as it closes right
  # before one. Every candidate seen in the line is remembered as "used".
  #
  # @param line [String] one line of the file, with or without terminator
  # @return [String, nil] the recognised quote character, or nil
  def search_quote_char(line)
    @used_quote_chars ||= []
    content = line.chomp
    # Wrapping the line in separators lets the first and last field be
    # treated like any other field.
    enclosed = "#{@col_sep}#{content}#{@col_sep}"

    COMMON_QUOTE_CHARS.find do |char|
      # The separator itself can never be the quote character.
      next false if char == @col_sep
      next false unless content.include?(char)

      @used_quote_chars << char unless @used_quote_chars.include?(char)
      openings = enclosed.scan(@col_sep + char).length
      closings = enclosed.scan(char + @col_sep).length
      openings > 0 && openings == closings
    end
  end

  # @return [String] the first common quote character that is neither the
  #   separator nor present in the scanned lines; "\x00" if all are taken
  def select_unused_quote_char
    used = @used_quote_chars || []
    unused = COMMON_QUOTE_CHARS.find { |char| char != @col_sep && !used.include?(char) }
    # fallback to an exotic one:
    unused || "\x00"
  end

  # @return [Array<String>] the first PREVIEW_LINES lines of the file (cached)
  def preview_lines
    @preview_lines ||= readlines(PREVIEW_LINES)
  end

  # Reads the file converted from the guessed encoding to UTF-8.
  #
  # @param max [Integer, nil] maximum number of lines to read; nil reads all
  # @yield [line] each line, when a block is given
  # @return [Array<String>] the lines read; empty when a block is given
  def readlines(max = nil, &block)
    lines = []
    File.open(@path, "r:#{@encoding}:utf-8") do |file|
      file.each_line.with_index do |line, index|
        break if max && index >= max

        if block
          yield(line)
        else
          lines << line
        end
      end
    end
    lines
  end
end
@@ -0,0 +1,50 @@
1
+ require 'csv'
2
+ require 'csv_format_guesser'
3
+ describe CsvFormatGuesser do
4
+
5
+ let(:guesser) { CsvFormatGuesser.new(file) }
6
+ subject {guesser}
7
+
8
+ context 'tab separated' do
9
+ let(:file) { 'spec/fixtures/files/tab_separated.csv' }
10
+ its(:col_sep) { should == "\t" }
11
+ its(:encoding) { should == 'ISO-8859-2' } # latin-2
12
+ its(:quote_char) { should == "'" }
13
+ end
14
+
15
+
16
+ context 'hash separated' do
17
+ let(:file) { 'spec/fixtures/files/hash_separated_utf8.csv' }
18
+ its(:col_sep) { should == '#' }
19
+ its(:encoding) { should == 'utf-8' }
20
+ its(:quote_char) { should == "'" }
21
+ end
22
+
23
+ context 'broken excape' do
24
+ let(:file) { 'spec/fixtures/files/broken_escape.csv' }
25
+ its(:col_sep) { should == ';' }
26
+ its(:encoding) { should == 'utf-8' }
27
+ its(:quote_char) { should == "'" }
28
+ end
29
+
30
+ context 'navision' do
31
+ let(:file) { 'spec/fixtures/files/navision_export_sample.csv' }
32
+ its(:col_sep) { should == ',' }
33
+ its(:quote_char) { should == '"' }
34
+ its(:encoding) { should == 'ISO-8859-7' }
35
+ end
36
+
37
+ context 'concurrent # and , as separators' do
38
+ let(:file) { 'spec/fixtures/files/concurrent_separators.txt' }
39
+ its(:col_sep) { should == '#' }
40
+ end
41
+
42
+ context 'iso 8859-2' do
43
+ let(:file) { 'spec/fixtures/files/iso_8859-2.csv' }
44
+ its(:encoding) { should == 'ISO-8859-2' }
45
+ its(:col_sep) { should == ';' }
46
+ its(:quote_char) { should == '"' }
47
+ end
48
+
49
+ end
50
+
@@ -0,0 +1 @@
1
+ 950AHL148L;Rettungsweg links, langnachleuchtend "zur Fluchtwegkennzeichnung;HIGHLIGHT Aluminium, langnachleuchtend;Fluchtwegschild: Rettungsweg links HIGHLIGHT Aluminium 14,8 x 29,7 cm Leuchtdichte: HIGHLIGHT 48 mcd/m² gemäß BGV A8, E13;Langnachleuchtende Sicherheitsleitsysteme\Rettungswegbeschilderung nachleuchtende DIN67510\Langnachleuchtende Rettungswegschilder BGV A8\950HLL Schilder langnachleuchtend;950hll.jpg;40200102;58;H07
@@ -0,0 +1 @@
1
+ BR586124#3903#>>1#291,50#0,00#291,50#1#291,50#0,00#291,50#1#291,50#0,00#291,50#1#291,50#0,00#291,50#1#291,50#0,00#291,50#1#291,50#0,00#291,50#0
@@ -0,0 +1 @@
1
+ 'Bäschilderung'#5840006#170#Rettungsweg links (BGV A8 E 13)#Kennzeichnung & Schilder>>Rettungszeichen>>Rettungszeichen nach BGV A8 - VGB 125#Material#Kunststoff (langnachleuchtend, selbstklebend)#Größe#148 x 297 mm#Erkennungsweite#14,8 m#######Normen#BGV A8;DIN 67510;;VBG 125#Materialeigenschaften
@@ -0,0 +1,3 @@
1
+ alt (PA / A-B);20101406;7,83321E+11;92553;50751235;l�r;;;;;36,95;36;36;;18,6;;;;1;1;;;;;;;
2
+ alt (PA / A-B);20101406;7,83321E+11;92553;50751235;l�r;;;;;36,95;36;36;;18,6;;;;1;1;;;;;;;
3
+ alt (PA / A-B);20101406;7,83321E+11;92553;50751235;l�r;;;;;36,95;36;36;;18,6;;;;1;1;;;;;;;
@@ -0,0 +1 @@
1
+ 12345,"Losgr��enrundungsfaktor",Awesome,Selbstbr�unungsspray,100 ml,women
@@ -0,0 +1,2 @@
1
+ artnr herstnr artname hersteller
2
+ 990477 EZ.PCM03.007 'Z Zubeh�r Deckenhalterung universal' Acer Computer GmbH
metadata ADDED
@@ -0,0 +1,110 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: csv-format-guesser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Eugen Martin
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-11-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '1.3'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '1.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '2.14'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '2.14'
55
+ description: 'Guess format and encoding of .csv/.tsv files to generate options compatible
56
+ with ruby CSV class. Works with ruby2.0 '
57
+ email:
58
+ - eugenius martinus ad gmail
59
+ executables: []
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - .gitignore
64
+ - Gemfile
65
+ - Gemfile.lock
66
+ - LICENSE
67
+ - LICENSE.txt
68
+ - README.md
69
+ - Rakefile
70
+ - csv-format-guesser.gemspec
71
+ - lib/csv_format_guesser.rb
72
+ - spec/csv_format_guesser_spec.rb
73
+ - spec/fixtures/files/broken_escape.csv
74
+ - spec/fixtures/files/concurrent_separators.txt
75
+ - spec/fixtures/files/hash_separated_utf8.csv
76
+ - spec/fixtures/files/iso_8859-2.csv
77
+ - spec/fixtures/files/navision_export_sample.csv
78
+ - spec/fixtures/files/tab_separated.csv
79
+ homepage: ''
80
+ licenses:
81
+ - MIT
82
+ metadata: {}
83
+ post_install_message:
84
+ rdoc_options: []
85
+ require_paths:
86
+ - lib
87
+ required_ruby_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - '>='
90
+ - !ruby/object:Gem::Version
91
+ version: '0'
92
+ required_rubygems_version: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ requirements: []
98
+ rubyforge_project:
99
+ rubygems_version: 2.0.6
100
+ signing_key:
101
+ specification_version: 4
102
+ summary: CSV Format guesser for ruby. Uses rchardet.
103
+ test_files:
104
+ - spec/csv_format_guesser_spec.rb
105
+ - spec/fixtures/files/broken_escape.csv
106
+ - spec/fixtures/files/concurrent_separators.txt
107
+ - spec/fixtures/files/hash_separated_utf8.csv
108
+ - spec/fixtures/files/iso_8859-2.csv
109
+ - spec/fixtures/files/navision_export_sample.csv
110
+ - spec/fixtures/files/tab_separated.csv