acsv-p 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/acsv-p/csv.rb +65 -0
- data/lib/acsv-p/detect/encoding.rb +67 -0
- data/lib/acsv-p/detect/encoding_holmes.rb +29 -0
- data/lib/acsv-p/detect/encoding_rchardet.rb +29 -0
- data/lib/acsv-p/detect/encoding_uchardet.rb +29 -0
- data/lib/acsv-p/detect/separator.rb +24 -0
- data/lib/acsv-p/version.rb +3 -0
- data/lib/acsv-p.rb +4 -0
- metadata +93 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c166ac5fe7e88938a2ed9e95823d8c826adeb53beea927075f6105feea6c2e7a
|
4
|
+
data.tar.gz: 0e96a37108e13846f3ea5f41b9f142df1a9e24b01c4cc768573b555698165fb8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 38b5148f5a6c6f7b1dbbe057d3964b29e112ea25893bb94197dcb89e193feee92100095cc4d48b76fdc676ff91002ce9c8f904811a3e05650440401821f263af
|
7
|
+
data.tar.gz: 8ca9488af41e5997cf3441605dcf0d79583fc9ba1ad444e1dd7dbcc9703d993f777008212baff13c614246a4bd2adeb7caf41678f2742abc72ff946be38198ca
|
data/lib/acsv-p/csv.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module ACSV
  # This class provides a complete interface to CSV files and data while trying
  # to detect the separator and character set. It is Ruby's standard CSV class
  # with auto-detection facilities.
  #
  # Please note that non-rewindable IO objects, like STDIN, are not supported.
  #
  # @see http://www.ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html
  class CSV < ::CSV
    # Wraps either a String or IO object passed in +data+ for reading and/or writing.
    # When no +:col_sep+ option is given, the column separator is auto-detected
    # with {ACSV::Detect.separator}; when detection finds nothing (or the data
    # cannot be probed), the standard CSV default is left in effect.
    #
    # The +options+ Hash passed by the caller is never modified.
    #
    # @param data [String, IO] CSV data or stream
    # @param options [Hash] options as understood by the standard CSV class
    # @see http://www.ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html#method-c-new
    def initialize(data, options = {})
      # work on a copy so the caller's Hash is left untouched
      options = options.dup
      unless options[:col_sep]
        detected = acsv_detect_col_sep(data)
        if detected
          options[:col_sep] = detected
        else
          # never hand +col_sep: nil+ to CSV; let it fall back to its own default
          options.delete(:col_sep)
        end
      end
      # double-splat so this works whether ::CSV#initialize takes an options
      # Hash or keyword arguments
      super(data, **options)
    end

    # Opens a file and wraps it with CSV. When the file is opened for reading,
    # separator and character encoding (when an encoding-detection gem is loaded)
    # are auto-detected.
    #
    # If the +encoding+ or +external_encoding+ option is set (and not +nil+), or if the
    # external encoding is specified as part of the mode parameter, no encoding
    # auto-detection takes place (since the given encoding is used). Encoding
    # detection is also skipped when the mode does not start with +r+, because
    # there is nothing to probe when writing.
    #
    # When auto-detection fails, the default encoding as used by CSV and IO is taken.
    # The options Hash passed by the caller is never modified.
    #
    # @option args [Number] :confidence minimum confidence level (0-1)
    # @option args [String] :method try only specific method, one of {ACSV::Detect.encoding_methods}
    # @see ACSV::Detect.encoding
    # @see http://www.ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html#method-c-open
    def self.open(*args)
      # find the +options+ Hash (copied, so deleting keys below does not leak out)
      options = args.last.is_a?(Hash) ? args.pop.dup : {}

      # auto-detect encoding unless external encoding is specified
      full_mode = args[1] || 'rb'
      mode, ext_enc, int_enc = full_mode.split(':')
      encoding_given = !(ext_enc.nil? || ext_enc == '') ||
                       !options[:encoding].nil? ||
                       !options[:external_encoding].nil?

      if !encoding_given && (mode || 'r').start_with?('r')
        # probe with a separate read-only binary handle that is closed right away
        detected = File.open(args[0], 'rb') { |probe| ACSV::Detect.encoding(probe, options) }
        if detected
          # workaround for http://stackoverflow.com/a/20723346
          detected = "BOM|#{detected}" if detected =~ /UTF/
          # create new mode specification if there was one, else store in option
          # only one may be supplied to IO#new so we need to check this
          # also, BOM may only be specified as part of a mode parameter
          if full_mode.include?(':') || detected.include?('BOM|')
            new_mode = "#{mode}:#{detected}"
            new_mode += ":#{int_enc}" if int_enc
            args[1] = new_mode
          else
            options[:external_encoding] = detected
          end
        end
      end

      # remove options CSV doesn't understand
      options.delete(:confidence)
      options.delete(:method)

      # to superclass (an attached block is forwarded implicitly by +super+)
      super(*args, **options)
    end

    private

    # Asks {ACSV::Detect.separator} for the column separator of +data+.
    #
    # @return [String, nil] detected separator, or +nil+ when +data+ is empty or
    #   reading from it raises an IOError (which includes EOFError)
    def acsv_detect_col_sep(data)
      return nil if data.respond_to?(:empty?) && data.empty?
      ACSV::Detect.separator(data)
    rescue IOError
      nil
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require_relative 'encoding_holmes'
|
2
|
+
require_relative 'encoding_rchardet'
|
3
|
+
require_relative 'encoding_uchardet'
|
4
|
+
|
5
|
+
module ACSV
  module Detect
    class << self

      # Default confidence level for encoding detection to succeed
      # (not referenced by the methods in this file)
      CONFIDENCE = 0.6
      # Number of bytes to test encoding on
      PREVIEW_BYTES = 8 * 4096

      # Tries to detect the file encoding.
      #
      # When +file_or_data+ responds to +read+ it is treated as a stream: up to
      # {PREVIEW_BYTES} bytes are read and the stream is put back at the position
      # it was at before. Anything else is handed to the detectors as-is.
      # Detectors are tried in order and the first encoding reported wins.
      #
      # @param file_or_data [File, String] CSV file or data to probe
      # @option options [Number] :confidence minimum confidence level (0-1)
      # @option options [String] :method try only specific method, one of {encoding_methods}
      # @return [String, nil] most probable encoding, or +nil+ when there is no
      #   data to probe or no detector reported an encoding
      def encoding(file_or_data, options={})
        data = preview_data(file_or_data)
        # an empty stream yields nil from read; there is nothing to detect then
        return nil if data.nil? || data.empty?

        detector_do(options) do |detector|
          enc = detector.encoding(data, options)
          return enc if enc
        end
        nil
      end

      # @return [Array<String>] List of available methods for encoding
      def encoding_methods
        ENCODING_DETECTORS_AVAIL.map(&:require_name)
      end

      # @return [Array<String>] List of possible methods for encoding (even if its gem is missing)
      def encoding_methods_all
        ENCODING_DETECTORS_ALL.map(&:require_name)
      end

      protected

      ENCODING_DETECTORS_ALL = [ EncodingHolmes, EncodingRChardet, EncodingUChardet ]
      ENCODING_DETECTORS_AVAIL = ENCODING_DETECTORS_ALL.select(&:present?)

      # Run supplied block on detectors
      #
      # When the requested method is not among the available detectors the block
      # is not called at all.
      #
      # @option options [String] :method Only try this method, instead of trying all
      def detector_do(options)
        if options[:method]
          wanted = options[:method].to_s
          detector = ENCODING_DETECTORS_AVAIL.find { |d| d.require_name == wanted }
          yield detector if detector
        else
          ENCODING_DETECTORS_AVAIL.each { |detector| yield detector }
        end
      end

      # Returns the data the detectors should look at.
      #
      # @return [String, nil] first {PREVIEW_BYTES} bytes of a stream (+nil+ when
      #   the stream is at its end), or +file_or_data+ itself when it is not a stream
      def preview_data(file_or_data)
        return file_or_data unless file_or_data.respond_to?(:read)

        position = file_or_data.tell
        begin
          file_or_data.read(PREVIEW_BYTES)
        ensure
          # restore the position even when reading fails
          file_or_data.seek(position)
        end
      end

    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
begin
|
2
|
+
require 'charlock_holmes'
|
3
|
+
rescue LoadError
|
4
|
+
end
|
5
|
+
|
6
|
+
module ACSV
  module Detect
    # Encoding detector that delegates to +CharlockHolmes::EncodingDetector+
    # when that constant is defined.
    module EncodingHolmes

      # Minimum confidence (0-1) applied when the +:confidence+ option is absent
      DEFAULT_CONFIDENCE = 0.01

      # @return [String] name used to select this detector
      def self.require_name
        'charlock_holmes'
      end

      # @return [Boolean] whether +CharlockHolmes::EncodingDetector+ is defined
      def self.present?
        !!defined?(::CharlockHolmes::EncodingDetector)
      end

      # Detects the encoding of +data+.
      #
      # @param data [String, nil] bytes to probe
      # @param options [Hash]
      # @option options [Number] :confidence minimum confidence level (0-1)
      # @return [String, nil] detected encoding, or +nil+ when the detector is not
      #   present, there is no data, nothing was detected, or confidence is too low
      def self.encoding(data, options)
        return nil unless present?
        return nil if data.nil? || data.empty?

        encdet = ::CharlockHolmes::EncodingDetector.detect(data)
        # the detector may report nothing at all, or a result without confidence
        return nil unless encdet && encdet[:confidence]

        # the 0-1 threshold is scaled by 100 before comparing with the reported confidence
        threshold = (options[:confidence] || DEFAULT_CONFIDENCE) * 100
        encdet[:encoding] if encdet[:confidence] > threshold
      end

    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
begin
|
2
|
+
require 'rchardet'
|
3
|
+
rescue LoadError
|
4
|
+
end
|
5
|
+
|
6
|
+
module ACSV
  module Detect
    # Encoding detector that delegates to +CharDet+ when that constant is defined.
    module EncodingRChardet

      # Minimum confidence (0-1) applied when the +:confidence+ option is absent
      DEFAULT_CONFIDENCE = 0.2

      # @return [String] name used to select this detector
      def self.require_name
        'rchardet'
      end

      # @return [Boolean] whether +CharDet+ is defined
      def self.present?
        !!defined?(::CharDet)
      end

      # Detects the encoding of +data+.
      #
      # @param data [String, nil] bytes to probe
      # @param options [Hash]
      # @option options [Number] :confidence minimum confidence level (0-1)
      # @return [String, nil] detected encoding, or +nil+ when the detector is not
      #   present, there is no data, nothing was detected, or confidence is too low
      def self.encoding(data, options)
        return nil unless present?
        return nil if data.nil? || data.empty?

        encdet = ::CharDet.detect(data)
        # guard against a missing result or a result without confidence
        return nil unless encdet && encdet["confidence"]

        # the reported confidence is compared with the 0-1 threshold directly
        threshold = options[:confidence] || DEFAULT_CONFIDENCE
        encdet["encoding"] if encdet["confidence"] > threshold
      end

    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
begin
|
2
|
+
require 'uchardet'
|
3
|
+
rescue LoadError
|
4
|
+
end
|
5
|
+
|
6
|
+
module ACSV
  module Detect
    # Encoding detector that delegates to +ICU::UCharsetDetector+ when that
    # constant is defined.
    module EncodingUChardet

      # Minimum confidence (0-1) applied when the +:confidence+ option is absent
      DEFAULT_CONFIDENCE = 0.01

      # @return [String] name used to select this detector
      def self.require_name
        'uchardet'
      end

      # @return [Boolean] whether +ICU::UCharsetDetector+ is defined
      def self.present?
        !!defined?(::ICU::UCharsetDetector)
      end

      # Detects the encoding of +data+.
      #
      # @param data [String, nil] bytes to probe
      # @param options [Hash]
      # @option options [Number] :confidence minimum confidence level (0-1)
      # @return [String, nil] detected encoding, or +nil+ when the detector is not
      #   present, there is no data, nothing was detected, or confidence is too low
      def self.encoding(data, options)
        return nil unless present?
        return nil if data.nil? || data.empty?

        encdet = ::ICU::UCharsetDetector.detect(data)
        # guard against a missing result or a result without confidence
        return nil unless encdet && encdet[:confidence]

        # the 0-1 threshold is scaled by 100 before comparing with the reported confidence
        threshold = (options[:confidence] || DEFAULT_CONFIDENCE) * 100
        encdet[:encoding] if encdet[:confidence] > threshold
      end

    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module ACSV
  module Detect

    # Possible CSV separators to check, in order of preference when counts tie
    SEPARATORS = [",", ";", "\t", "|", "#"]

    # Picks the separator candidate that occurs most often in the first line.
    #
    # When +file_or_data+ responds to +readline+ it is treated as a stream: one
    # line is read and the stream is put back at the position it was at before.
    # Anything else is treated as a String, whose first line is everything up
    # to the first newline.
    #
    # @param file_or_data [File, String] CSV file or data to probe
    # @return [String] most probable column separator character from first line, or +nil+ when none found
    #   (also +nil+ for empty data or a stream that is at its end)
    # @todo return whichever character returns the same number of columns over multiple lines
    def self.separator(file_or_data)
      firstline = first_line(file_or_data)
      return nil if firstline.nil? || firstline.empty?

      # candidates are re-encoded so counting also works on non-UTF-8 lines
      counted = SEPARATORS.map { |s| [firstline.count(s.encode(firstline.encoding)), s] }
      # max_by keeps the first candidate on a tie, so SEPARATORS order decides
      best = counted.max_by { |count, _| count }
      return nil if best.nil? || best[0] == 0

      best[1].encode('ascii')
    end

    # Returns the first line of a stream or String.
    #
    # @return [String, nil] the first line, or +nil+ when there is none
    def self.first_line(file_or_data)
      if file_or_data.respond_to?(:readline)
        position = file_or_data.tell
        begin
          file_or_data.readline
        rescue EOFError
          # nothing left to read
          nil
        ensure
          file_or_data.seek(position)
        end
      else
        # the newline must be in the string's own encoding to be usable with split
        newline = "\n".encode(file_or_data.encoding)
        file_or_data.split(newline, 2)[0]
      end
    end
    private_class_method :first_line

  end
end
|
data/lib/acsv-p.rb
ADDED
metadata
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: acsv-p
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- wvengen
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-11-24 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rake
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: charlock_holmes
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.7.3
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.7.3
|
55
|
+
description: A wrapper for Ruby's standard CSV class that auto-detects column separator
|
56
|
+
and file encoding.
|
57
|
+
email: dev-rails@willem.engen.nl
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- lib/acsv-p.rb
|
63
|
+
- lib/acsv-p/csv.rb
|
64
|
+
- lib/acsv-p/detect/encoding.rb
|
65
|
+
- lib/acsv-p/detect/encoding_holmes.rb
|
66
|
+
- lib/acsv-p/detect/encoding_rchardet.rb
|
67
|
+
- lib/acsv-p/detect/encoding_uchardet.rb
|
68
|
+
- lib/acsv-p/detect/separator.rb
|
69
|
+
- lib/acsv-p/version.rb
|
70
|
+
homepage: https://github.com/wvengen/ruby-acsv-p
|
71
|
+
licenses:
|
72
|
+
- GPL-3.0+
|
73
|
+
metadata: {}
|
74
|
+
post_install_message:
|
75
|
+
rdoc_options: []
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
requirements: []
|
89
|
+
rubygems_version: 3.0.9
|
90
|
+
signing_key:
|
91
|
+
specification_version: 4
|
92
|
+
summary: Read CSV files without configuration
|
93
|
+
test_files: []
|