acsv-p 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: c166ac5fe7e88938a2ed9e95823d8c826adeb53beea927075f6105feea6c2e7a
4
+ data.tar.gz: 0e96a37108e13846f3ea5f41b9f142df1a9e24b01c4cc768573b555698165fb8
5
+ SHA512:
6
+ metadata.gz: 38b5148f5a6c6f7b1dbbe057d3964b29e112ea25893bb94197dcb89e193feee92100095cc4d48b76fdc676ff91002ce9c8f904811a3e05650440401821f263af
7
+ data.tar.gz: 8ca9488af41e5997cf3441605dcf0d79583fc9ba1ad444e1dd7dbcc9703d993f777008212baff13c614246a4bd2adeb7caf41678f2742abc72ff946be38198ca
data/lib/acsv-p/csv.rb ADDED
@@ -0,0 +1,65 @@
1
+ require 'csv'
2
+
3
+ module ACSV
4
+ # This class provides a complete interface to CSV files and data while trying
5
+ # to detect the separator and character set. It is Ruby's standard CSV class
6
+ # with auto-detection facilities.
7
+ #
8
+ # Please note that non-rewindable IO objects, like STDIN, are not supported.
9
+ #
10
+ # @see http://www.ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html
11
+ class CSV < ::CSV
12
+ # This constructor will wrap either a String or IO object passed in data for reading and/or writing.
13
+ # In case of reading, the character separator is auto-detected (unless given as an option).
14
+ #
15
+ # @see http://www.ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html#method-c-new
16
+ def initialize(data, options = Hash.new)
17
+ options[:col_sep] ||= ACSV::Detect.separator(data)
18
+ super(data, options)
19
+ end
20
+
21
+ # This method opens an IO object, and wraps that with CSV. For reading, separator
22
+ # and character encoding (when an encoding-detection gem is loaded) are auto-detected.
23
+ #
24
+ # If the +encoding+ or +external_encoding+ option is set (and not +nil+), or if the
25
+ # external encoding is specified as part of the mode parameter or option, no
26
+ # auto-detection takes place (since the given encoding is used).
27
+ #
28
+ # When auto-detection fails, the default encoding as used by CSV and IO is taken.
29
+ #
30
+ # @option args [Number] :confidence minimum confidence level (0-1)
31
+ # @option args [String] :method try only specific method, one of {ACSV::Detect.encoding_methods}
32
+ # @see ACSV::Detect.encoding
33
+ # @see http://www.ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html#method-c-open
34
+ def self.open(*args)
35
+ # find the +options+ Hash
36
+ options = if args.last.is_a? Hash then args.pop else Hash.new end
37
+ # auto-detect encoding unless external encoding is specified
38
+ full_mode = args[1] || 'rb'
39
+ mode, ext_enc, int_enc = full_mode.split(':')
40
+ if (ext_enc.nil? || ext_enc=='') && options[:encoding].nil? && options[:external_encoding].nil?
41
+ # try to detect encoding
42
+ if ext_enc = ACSV::Detect.encoding(File.open(args[0], mode, options), options)
43
+ # workaround for http://stackoverflow.com/a/20723346
44
+ ext_enc = "BOM|#{ext_enc}" if ext_enc =~ /UTF/
45
+ # create new mode specification if there was one, else store in option
46
+ # only one may be supplied to IO#new so we need to check this
47
+ # also, BOM may only be specified as part of a mode parameter
48
+ if full_mode.include?(':') || ext_enc.include?('BOM|')
49
+ mode = "#{mode}:#{ext_enc}"
50
+ mode += ":#{int_enc}" if int_enc
51
+ args[1] = mode
52
+ else
53
+ options[:external_encoding] = ext_enc
54
+ end
55
+ end
56
+ end
57
+ # remove options CSV doesn't understand
58
+ options.delete :confidence
59
+ options.delete :method
60
+ # to superclass
61
+ args << options
62
+ super(*args)
63
+ end
64
+ end
65
+ end
data/lib/acsv-p/detect/encoding.rb ADDED
@@ -0,0 +1,67 @@
1
+ require_relative 'encoding_holmes'
2
+ require_relative 'encoding_rchardet'
3
+ require_relative 'encoding_uchardet'
4
+
5
+ module ACSV
6
+ module Detect
7
+ class << self
8
+
9
+ # Default confidence level for encoding detection to succeed
10
+ CONFIDENCE = 0.6
11
+ # Number of bytes to test encoding on
12
+ PREVIEW_BYTES = 8 * 4096
13
+
14
+ # Tries to detect the file encoding.
15
+ #
16
+ # @param file_or_data [File, String] CSV file or data to probe
17
+ # @option options [Number] :confidence minimum confidence level (0-1)
18
+ # @option options [String] :method try only specific method, one of {encoding_methods}
19
+ # @return [String] most probable encoding
20
+ def encoding(file_or_data, options={})
21
+ if file_or_data.is_a? File
22
+ position = file_or_data.tell
23
+ data = file_or_data.read(PREVIEW_BYTES)
24
+ file_or_data.seek(position)
25
+ else
26
+ data = file_or_data
27
+ end
28
+
29
+ detector_do(options) do |detector|
30
+ if enc = detector.encoding(data, options)
31
+ return enc
32
+ end
33
+ end
34
+ nil
35
+ end
36
+
37
+ # @return [Array<String>] List of available methods for encoding
38
+ def encoding_methods
39
+ ENCODING_DETECTORS_AVAIL.map(&:require_name)
40
+ end
41
+
42
+ # @return [Array<String>] List of possible methods for encoding (even if its gem is missing)
43
+ def encoding_methods_all
44
+ ENCODING_DETECTORS_ALL.map(&:require_name)
45
+ end
46
+
47
+ protected
48
+
49
+ ENCODING_DETECTORS_ALL = [ EncodingHolmes, EncodingRChardet, EncodingUChardet ]
50
+ ENCODING_DETECTORS_AVAIL = ENCODING_DETECTORS_ALL.select(&:present?)
51
+
52
+ # Run supplied block on detectors
53
+ # @option options [Boolean] :method Only try this method, instead of trying all
54
+ def detector_do(options)
55
+ if options[:method]
56
+ detector = ENCODING_DETECTORS_AVAIL.select{|d| d.require_name == options[:method]}.first
57
+ yield detector
58
+ else
59
+ ENCODING_DETECTORS_AVAIL.each do |detector|
60
+ yield detector if detector.present?
61
+ end
62
+ end
63
+ end
64
+
65
+ end
66
+ end
67
+ end
data/lib/acsv-p/detect/encoding_holmes.rb ADDED
@@ -0,0 +1,29 @@
1
+ begin
2
+ require 'charlock_holmes'
3
+ rescue LoadError
4
+ end
5
+
6
+ module ACSV
7
+ module Detect
8
+ module EncodingHolmes
9
+
10
+ DEFAULT_CONFIDENCE = 0.01
11
+
12
+ def self.require_name
13
+ 'charlock_holmes'
14
+ end
15
+
16
+ def self.present?
17
+ defined? ::CharlockHolmes::EncodingDetector
18
+ end
19
+
20
+ def self.encoding(data, options)
21
+ if present?
22
+ encdet = ::CharlockHolmes::EncodingDetector.detect(data)
23
+ encdet[:encoding] if encdet[:confidence] > (options[:confidence] || DEFAULT_CONFIDENCE)*100
24
+ end
25
+ end
26
+
27
+ end
28
+ end
29
+ end
data/lib/acsv-p/detect/encoding_rchardet.rb ADDED
@@ -0,0 +1,29 @@
1
+ begin
2
+ require 'rchardet'
3
+ rescue LoadError
4
+ end
5
+
6
+ module ACSV
7
+ module Detect
8
+ module EncodingRChardet
9
+
10
+ DEFAULT_CONFIDENCE = 0.2
11
+
12
+ def self.require_name
13
+ 'rchardet'
14
+ end
15
+
16
+ def self.present?
17
+ defined? ::CharDet
18
+ end
19
+
20
+ def self.encoding(data, options)
21
+ if present?
22
+ encdet = ::CharDet.detect(data)
23
+ encdet["encoding"] if encdet["confidence"] > (options[:confidence] || DEFAULT_CONFIDENCE)
24
+ end
25
+ end
26
+
27
+ end
28
+ end
29
+ end
data/lib/acsv-p/detect/encoding_uchardet.rb ADDED
@@ -0,0 +1,29 @@
1
+ begin
2
+ require 'uchardet'
3
+ rescue LoadError
4
+ end
5
+
6
+ module ACSV
7
+ module Detect
8
+ module EncodingUChardet
9
+
10
+ DEFAULT_CONFIDENCE = 0.01
11
+
12
+ def self.require_name
13
+ 'uchardet'
14
+ end
15
+
16
+ def self.present?
17
+ defined? ::ICU::UCharsetDetector
18
+ end
19
+
20
+ def self.encoding(data, options)
21
+ if present?
22
+ encdet = ::ICU::UCharsetDetector.detect(data)
23
+ encdet[:encoding] if encdet[:confidence] > (options[:confidence] || DEFAULT_CONFIDENCE)*100
24
+ end
25
+ end
26
+
27
+ end
28
+ end
29
+ end
data/lib/acsv-p/detect/separator.rb ADDED
@@ -0,0 +1,24 @@
1
+ module ACSV
2
+ module Detect
3
+
4
+ # Possible CSV separators to check
5
+ SEPARATORS = [",", ";", "\t", "|", "#"]
6
+
7
+ # @param file_or_data [File, String] CSV file or data to probe
8
+ # @return [String] most probable column separator character from first line, or +nil+ when none found
9
+ # @todo return whichever character returns the same number of columns over multiple lines
10
+ def self.separator(file_or_data)
11
+ if file_or_data.is_a? File
12
+ position = file_or_data.tell
13
+ firstline = file_or_data.readline
14
+ file_or_data.seek(position)
15
+ else
16
+ firstline = file_or_data.split("\n", 2)[0]
17
+ end
18
+ separators = SEPARATORS.map{|s| s.encode(firstline.encoding)}
19
+ sep = separators.map {|x| [firstline.count(x),x]}.sort_by {|x| x[0]}.last
20
+ sep[0] == 0 ? nil : sep[1].encode('ascii')
21
+ end
22
+
23
+ end
24
+ end
data/lib/acsv-p/version.rb ADDED
@@ -0,0 +1,3 @@
1
+ module ACSV
2
+ VERSION = '0.1.0' # version string of the acsv-p gem; the gemspec in this package declares the same number
3
+ end
data/lib/acsv-p.rb ADDED
@@ -0,0 +1,4 @@
1
+ require 'acsv-p/detect/separator'
2
+ require 'acsv-p/detect/encoding'
3
+ require 'acsv-p/csv'
4
+ require 'acsv-p/version'
metadata ADDED
@@ -0,0 +1,93 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: acsv-p
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - wvengen
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-11-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: charlock_holmes
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.7.3
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.7.3
55
+ description: A wrapper for Ruby's standard CSV class that auto-detects column separator
56
+ and file encoding.
57
+ email: dev-rails@willem.engen.nl
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - lib/acsv-p.rb
63
+ - lib/acsv-p/csv.rb
64
+ - lib/acsv-p/detect/encoding.rb
65
+ - lib/acsv-p/detect/encoding_holmes.rb
66
+ - lib/acsv-p/detect/encoding_rchardet.rb
67
+ - lib/acsv-p/detect/encoding_uchardet.rb
68
+ - lib/acsv-p/detect/separator.rb
69
+ - lib/acsv-p/version.rb
70
+ homepage: https://github.com/wvengen/ruby-acsv-p
71
+ licenses:
72
+ - GPL-3.0+
73
+ metadata: {}
74
+ post_install_message:
75
+ rdoc_options: []
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ requirements: []
89
+ rubygems_version: 3.0.9
90
+ signing_key:
91
+ specification_version: 4
92
+ summary: Read CSV files without configuration
93
+ test_files: []