acsv-p 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: c166ac5fe7e88938a2ed9e95823d8c826adeb53beea927075f6105feea6c2e7a
4
+ data.tar.gz: 0e96a37108e13846f3ea5f41b9f142df1a9e24b01c4cc768573b555698165fb8
5
+ SHA512:
6
+ metadata.gz: 38b5148f5a6c6f7b1dbbe057d3964b29e112ea25893bb94197dcb89e193feee92100095cc4d48b76fdc676ff91002ce9c8f904811a3e05650440401821f263af
7
+ data.tar.gz: 8ca9488af41e5997cf3441605dcf0d79583fc9ba1ad444e1dd7dbcc9703d993f777008212baff13c614246a4bd2adeb7caf41678f2742abc72ff946be38198ca
data/lib/acsv-p/csv.rb ADDED
@@ -0,0 +1,65 @@
1
+ require 'csv'
2
+
3
module ACSV
  # This class provides a complete interface to CSV files and data while trying
  # to detect the separator and character set. It is Ruby's standard CSV class
  # with auto-detection facilities.
  #
  # Please note that non-rewindable IO objects, like STDIN, are not supported.
  #
  # @see http://www.ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html
  class CSV < ::CSV
    # Wraps either a String or IO object passed in +data+ for reading and/or writing.
    # Unless +:col_sep+ is given, the column separator is auto-detected with
    # {ACSV::Detect.separator}. When detection finds nothing, or the data cannot
    # be read (empty or write-only stream), the default separator of CSV is used.
    #
    # @param data [String, IO] CSV data or stream to wrap
    # @param options [Hash] options for +::CSV.new+; the Hash is not modified
    # @see http://www.ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html#method-c-new
    def initialize(data, options = {})
      # work on a copy so the caller's Hash stays untouched
      opts = options.dup
      unless opts[:col_sep]
        detected =
          begin
            ACSV::Detect.separator(data)
          rescue IOError
            # EOFError (empty stream) and "not opened for reading" both end up
            # here; detection is best-effort, so fall back to CSV's default
            nil
          end
        if detected
          opts[:col_sep] = detected
        else
          # never hand an explicit nil separator to CSV
          opts.delete(:col_sep)
        end
      end
      # CSV takes keyword arguments; a positional Hash is rejected by Ruby 3
      super(data, **opts)
    end

    # This method opens an IO object, and wraps that with CSV. For reading, separator
    # and character encoding (when an encoding-detection gem is loaded) are auto-detected.
    #
    # If the +encoding+ or +external_encoding+ option is set (and not +nil+), or if the
    # external encoding is specified as part of the mode parameter or option, no
    # auto-detection takes place (since the given encoding is used). Encoding detection
    # is also skipped when the mode is not a read mode (so the file is never truncated
    # by the probe) or when the mode is not given as a String.
    #
    # When auto-detection fails, the default encoding as used by CSV and IO is taken.
    #
    # @option args [Number] :confidence minimum confidence level (0-1)
    # @option args [String] :method try only specific method, one of {ACSV::Detect.encoding_methods}
    # @see ACSV::Detect.encoding
    # @see http://www.ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html#method-c-open
    def self.open(*args)
      # find the +options+ Hash; copy it because keys are added and removed below
      options = args.last.is_a?(Hash) ? args.pop.dup : {}
      full_mode = args[1] || 'rb'

      if full_mode.is_a?(String)
        mode, ext_enc, int_enc = full_mode.split(':')
        encoding_given = !ext_enc.to_s.empty? ||
                         !options[:encoding].nil? ||
                         !options[:external_encoding].nil?

        if mode.to_s.start_with?('r') && !encoding_given
          # probe with a short-lived handle that is closed again right away
          detected = File.open(args[0], mode) { |io| ACSV::Detect.encoding(io, options) }
          if detected
            # workaround for http://stackoverflow.com/a/20723346
            detected = "BOM|#{detected}" if detected.include?('UTF')
            # create new mode specification if there was one, else store in option
            # only one may be supplied to IO#new so we need to check this
            # also, BOM may only be specified as part of a mode parameter
            if full_mode.include?(':') || detected.include?('BOM|')
              new_mode = "#{mode}:#{detected}"
              new_mode += ":#{int_enc}" if int_enc
              args[1] = new_mode
            else
              options[:external_encoding] = detected
            end
          end
        end
      end

      # remove options CSV doesn't understand
      options.delete(:confidence)
      options.delete(:method)
      # to superclass, as keyword arguments (the block is forwarded implicitly)
      super(*args, **options)
    end
  end
end
@@ -0,0 +1,67 @@
1
+ require_relative 'encoding_holmes'
2
+ require_relative 'encoding_rchardet'
3
+ require_relative 'encoding_uchardet'
4
+
5
module ACSV
  module Detect
    class << self

      # Default confidence level for encoding detection to succeed
      # NOTE(review): not referenced in the code shown here; each detector
      # carries its own DEFAULT_CONFIDENCE.
      CONFIDENCE = 0.6
      # Number of bytes to test encoding on
      PREVIEW_BYTES = 8 * 4096

      # Tries to detect the file encoding.
      #
      # The available detectors are tried in order and the first encoding
      # reported by one of them is returned. An IO object is read for at most
      # PREVIEW_BYTES bytes and afterwards put back at its original position.
      #
      # @param file_or_data [IO, String] CSV file (any rewindable IO) or data to probe
      # @option options [Number] :confidence minimum confidence level (0-1)
      # @option options [String] :method try only specific method, one of {encoding_methods}
      # @return [String, nil] most probable encoding, or +nil+ when there is no
      #   data, no usable detector, or no detector reports an encoding
      def encoding(file_or_data, options={})
        data = preview_data(file_or_data)
        # an empty file reads as nil; there is nothing to detect on
        return nil if data.nil? || data.empty?

        detector_do(options) do |detector|
          enc = detector.encoding(data, options)
          return enc if enc
        end
        nil
      end

      # @return [Array<String>] List of available methods for encoding
      def encoding_methods
        ENCODING_DETECTORS_AVAIL.map(&:require_name)
      end

      # @return [Array<String>] List of possible methods for encoding (even if its gem is missing)
      def encoding_methods_all
        ENCODING_DETECTORS_ALL.map(&:require_name)
      end

      protected

      ENCODING_DETECTORS_ALL = [ EncodingHolmes, EncodingRChardet, EncodingUChardet ]
      ENCODING_DETECTORS_AVAIL = ENCODING_DETECTORS_ALL.select(&:present?)

      # Run supplied block on detectors
      # @option options [String] :method Only try this method, instead of trying all.
      #   When the named method is unknown or its gem is missing, the block is not run.
      def detector_do(options)
        if options[:method]
          wanted = options[:method].to_s
          detector = ENCODING_DETECTORS_AVAIL.find { |d| d.require_name == wanted }
          yield detector if detector
        else
          ENCODING_DETECTORS_AVAIL.each do |candidate|
            yield candidate if candidate.present?
          end
        end
      end

      # Returns the bytes to run detection on.
      # @param source [IO, String] anything responding to +read+ is treated as IO
      # @return [String, nil] up to PREVIEW_BYTES bytes for IO (+nil+ at end of file),
      #   or +source+ itself otherwise
      def preview_data(source)
        return source unless source.respond_to?(:read)

        position = source.pos
        begin
          source.read(PREVIEW_BYTES)
        ensure
          # always restore the position, also when reading fails
          source.seek(position)
        end
      end

    end
  end
end
@@ -0,0 +1,29 @@
1
+ begin
2
+ require 'charlock_holmes'
3
+ rescue LoadError
4
+ end
5
+
6
module ACSV
  module Detect
    # Encoding detection using the optional +charlock_holmes+ gem.
    module EncodingHolmes

      # Minimum confidence (0-1) used when the caller does not supply one
      DEFAULT_CONFIDENCE = 0.01

      # @return [String] name under which this method is selected and required
      def self.require_name
        'charlock_holmes'
      end

      # @return [String, nil] truthy when the CharlockHolmes detector is loaded
      def self.present?
        defined? ::CharlockHolmes::EncodingDetector
      end

      # @param data [String, nil] bytes to probe
      # @param options [Hash] +:confidence+ overrides DEFAULT_CONFIDENCE (0-1)
      # @return [String, nil] detected encoding, or +nil+ when the gem is missing,
      #   there is no data, nothing was detected or confidence is too low
      def self.encoding(data, options)
        return nil unless present?
        return nil if data.nil? || data.empty?

        encdet = ::CharlockHolmes::EncodingDetector.detect(data)
        # the detector may come back without a match
        return nil unless encdet && encdet[:confidence]

        # option is 0-1, the detector's confidence is compared on a 0-100 scale
        threshold = (options[:confidence] || DEFAULT_CONFIDENCE) * 100
        encdet[:encoding] if encdet[:confidence] > threshold
      end

    end
  end
end
@@ -0,0 +1,29 @@
1
+ begin
2
+ require 'rchardet'
3
+ rescue LoadError
4
+ end
5
+
6
module ACSV
  module Detect
    # Encoding detection using the optional +rchardet+ gem.
    module EncodingRChardet

      # Minimum confidence (0-1) used when the caller does not supply one
      DEFAULT_CONFIDENCE = 0.2

      # @return [String] name under which this method is selected and required
      def self.require_name
        'rchardet'
      end

      # @return [String, nil] truthy when CharDet is loaded
      def self.present?
        defined? ::CharDet
      end

      # @param data [String, nil] bytes to probe
      # @param options [Hash] +:confidence+ overrides DEFAULT_CONFIDENCE (0-1)
      # @return [String, nil] detected encoding, or +nil+ when the gem is missing,
      #   there is no data, nothing was detected or confidence is too low
      def self.encoding(data, options)
        return nil unless present?
        return nil if data.nil? || data.empty?

        encdet = ::CharDet.detect(data)
        # be tolerant of a missing result or confidence
        return nil unless encdet && encdet["confidence"]

        # the result's confidence is compared directly against the 0-1 option
        threshold = options[:confidence] || DEFAULT_CONFIDENCE
        encdet["encoding"] if encdet["confidence"] > threshold
      end

    end
  end
end
@@ -0,0 +1,29 @@
1
+ begin
2
+ require 'uchardet'
3
+ rescue LoadError
4
+ end
5
+
6
module ACSV
  module Detect
    # Encoding detection using the optional +uchardet+ gem.
    module EncodingUChardet

      # Minimum confidence (0-1) used when the caller does not supply one
      DEFAULT_CONFIDENCE = 0.01

      # @return [String] name under which this method is selected and required
      def self.require_name
        'uchardet'
      end

      # @return [String, nil] truthy when the ICU charset detector is loaded
      def self.present?
        defined? ::ICU::UCharsetDetector
      end

      # @param data [String, nil] bytes to probe
      # @param options [Hash] +:confidence+ overrides DEFAULT_CONFIDENCE (0-1)
      # @return [String, nil] detected encoding, or +nil+ when the gem is missing,
      #   there is no data, nothing was detected or confidence is too low
      def self.encoding(data, options)
        return nil unless present?
        return nil if data.nil? || data.empty?

        encdet = ::ICU::UCharsetDetector.detect(data)
        # be tolerant of a missing result or confidence
        return nil unless encdet && encdet[:confidence]

        # option is 0-1, the detector's confidence is compared on a 0-100 scale
        threshold = (options[:confidence] || DEFAULT_CONFIDENCE) * 100
        encdet[:encoding] if encdet[:confidence] > threshold
      end

    end
  end
end
@@ -0,0 +1,24 @@
1
module ACSV
  module Detect

    # Possible CSV separators to check
    SEPARATORS = [",", ";", "\t", "|", "#"]

    # Picks the candidate from SEPARATORS that occurs most often in the first line.
    # An IO object is put back at the position it had before probing.
    #
    # @param file_or_data [IO, String] CSV file (any rewindable IO) or data to probe
    # @return [String] most probable column separator character from first line,
    #   or +nil+ when none found (including empty input)
    # @todo return whichever character returns the same number of columns over multiple lines
    def self.separator(file_or_data)
      firstline = first_line_of(file_or_data)
      # empty String or IO at end of file: nothing to count
      return nil if firstline.nil? || firstline.empty?

      # compare in the encoding of the data so that counting works for non-ASCII encodings
      candidates = SEPARATORS.map { |s| s.encode(firstline.encoding) }
      occurrences, best = candidates.map { |c| [firstline.count(c), c] }.sort_by(&:first).last
      occurrences.zero? ? nil : best.encode('ascii')
    end

    # Reads the first line without consuming it.
    # @param source [IO, String] anything responding to +gets+ is treated as IO
    # @return [String, nil] first line, or +nil+ when there is none
    def self.first_line_of(source)
      if source.respond_to?(:gets)
        position = source.pos
        begin
          # +gets+ returns nil at end of file instead of raising EOFError
          source.gets
        ensure
          source.seek(position)
        end
      else
        source.split("\n", 2).first
      end
    end
    private_class_method :first_line_of

  end
end
@@ -0,0 +1,3 @@
1
module ACSV
  # Version of the gem; frozen so it cannot be modified at runtime.
  VERSION = '0.1.0'.freeze
end
data/lib/acsv-p.rb ADDED
@@ -0,0 +1,4 @@
1
+ require 'acsv-p/detect/separator'
2
+ require 'acsv-p/detect/encoding'
3
+ require 'acsv-p/csv'
4
+ require 'acsv-p/version'
metadata ADDED
@@ -0,0 +1,93 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: acsv-p
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - wvengen
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-11-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: charlock_holmes
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.7.3
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.7.3
55
+ description: A wrapper for Ruby's standard CSV class that auto-detects column separator
56
+ and file encoding.
57
+ email: dev-rails@willem.engen.nl
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - lib/acsv-p.rb
63
+ - lib/acsv-p/csv.rb
64
+ - lib/acsv-p/detect/encoding.rb
65
+ - lib/acsv-p/detect/encoding_holmes.rb
66
+ - lib/acsv-p/detect/encoding_rchardet.rb
67
+ - lib/acsv-p/detect/encoding_uchardet.rb
68
+ - lib/acsv-p/detect/separator.rb
69
+ - lib/acsv-p/version.rb
70
+ homepage: https://github.com/wvengen/ruby-acsv-p
71
+ licenses:
72
+ - GPL-3.0+
73
+ metadata: {}
74
+ post_install_message:
75
+ rdoc_options: []
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ requirements: []
89
+ rubygems_version: 3.0.9
90
+ signing_key:
91
+ specification_version: 4
92
+ summary: Read CSV files without configuration
93
+ test_files: []