acsv-p 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/acsv-p/csv.rb +65 -0
- data/lib/acsv-p/detect/encoding.rb +67 -0
- data/lib/acsv-p/detect/encoding_holmes.rb +29 -0
- data/lib/acsv-p/detect/encoding_rchardet.rb +29 -0
- data/lib/acsv-p/detect/encoding_uchardet.rb +29 -0
- data/lib/acsv-p/detect/separator.rb +24 -0
- data/lib/acsv-p/version.rb +3 -0
- data/lib/acsv-p.rb +4 -0
- metadata +93 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c166ac5fe7e88938a2ed9e95823d8c826adeb53beea927075f6105feea6c2e7a
|
4
|
+
data.tar.gz: 0e96a37108e13846f3ea5f41b9f142df1a9e24b01c4cc768573b555698165fb8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 38b5148f5a6c6f7b1dbbe057d3964b29e112ea25893bb94197dcb89e193feee92100095cc4d48b76fdc676ff91002ce9c8f904811a3e05650440401821f263af
|
7
|
+
data.tar.gz: 8ca9488af41e5997cf3441605dcf0d79583fc9ba1ad444e1dd7dbcc9703d993f777008212baff13c614246a4bd2adeb7caf41678f2742abc72ff946be38198ca
|
data/lib/acsv-p/csv.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module ACSV
  # This class provides a complete interface to CSV files and data while trying
  # to detect the separator and character set. It is Ruby's standard CSV class
  # with auto-detection facilities.
  #
  # Please note that non-rewindable IO objects, like STDIN, are not supported.
  #
  # @see http://www.ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html
  class CSV < ::CSV
    # This constructor will wrap either a String or IO object passed in data for reading and/or writing.
    # In case of reading, the character separator is auto-detected (unless given as an option).
    #
    # @param data [String, IO] CSV data or stream to wrap
    # @param options [Hash] options for the standard CSV constructor; the Hash
    #   passed in by the caller is left unmodified
    # @see http://www.ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html#method-c-new
    def initialize(data, options = Hash.new)
      # work on a copy so the caller's Hash is not changed behind its back
      options = options.dup
      options[:col_sep] ||= ACSV::Detect.separator(data)
      # no separator found: drop the key so the standard CSV default applies
      # instead of handing +nil+ to the superclass
      options.delete(:col_sep) if options[:col_sep].nil?
      # forward as keywords; current CSV versions declare keyword options
      super(data, **options)
    end

    # This method opens an IO object, and wraps that with CSV. For reading, separator
    # and character encoding (when an encoding-detection gem is loaded) are auto-detected.
    #
    # If the +encoding+ or +external_encoding+ option is set (and not +nil+), or if the
    # external encoding is specified as part of the mode parameter or option, no
    # auto-detection takes place (since the given encoding is used).
    #
    # When auto-detection fails, the default encoding as used by CSV and IO is taken.
    #
    # @option args [Number] :confidence minimum confidence level (0-1)
    # @option args [String] :method try only specific method, one of {ACSV::Detect.encoding_methods}
    # @see ACSV::Detect.encoding
    # @see http://www.ruby-doc.org/stdlib/libdoc/csv/rdoc/CSV.html#method-c-open
    def self.open(*args)
      # find the +options+ Hash (copied, so the caller's Hash stays untouched)
      options = args.last.is_a?(Hash) ? args.pop.dup : {}
      # auto-detect encoding unless external encoding is specified
      full_mode = args[1] || 'rb'
      mode, ext_enc, int_enc = full_mode.split(':')
      if (ext_enc.nil? || ext_enc == '') && options[:encoding].nil? && options[:external_encoding].nil?
        # try to detect encoding; the block form closes the probing handle again
        ext_enc = File.open(args[0], mode) { |file| ACSV::Detect.encoding(file, options) }
        if ext_enc
          # workaround for http://stackoverflow.com/a/20723346
          ext_enc = "BOM|#{ext_enc}" if ext_enc =~ /UTF/
          # create new mode specification if there was one, else store in option
          #   only one may be supplied to IO#new so we need to check this
          #   also, BOM may only be specified as part of a mode parameter
          if full_mode.include?(':') || ext_enc.include?('BOM|')
            mode = "#{mode}:#{ext_enc}"
            mode += ":#{int_enc}" if int_enc
            args[1] = mode
          else
            options[:external_encoding] = ext_enc
          end
        end
      end
      # remove options CSV doesn't understand
      options.delete :confidence
      options.delete :method
      # to superclass (a block given to this method is passed along implicitly)
      super(*args, **options)
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require_relative 'encoding_holmes'
|
2
|
+
require_relative 'encoding_rchardet'
|
3
|
+
require_relative 'encoding_uchardet'
|
4
|
+
|
5
|
+
module ACSV
  module Detect
    class << self

      # Default confidence level for encoding detection to succeed
      CONFIDENCE = 0.6
      # Number of bytes to test encoding on
      PREVIEW_BYTES = 8 * 4096

      # Tries to detect the file encoding.
      #
      # For a File, up to PREVIEW_BYTES bytes are read from the current position
      # and the position is restored afterwards. Each available detector is asked
      # in turn; the first encoding reported wins.
      #
      # @param file_or_data [File, String] CSV file or data to probe
      # @option options [Number] :confidence minimum confidence level (0-1)
      # @option options [String] :method try only specific method, one of {encoding_methods}
      # @return [String, nil] most probable encoding, or +nil+ when there is
      #   nothing to probe or no detector reported an encoding
      def encoding(file_or_data, options={})
        if file_or_data.is_a? File
          position = file_or_data.tell
          data = file_or_data.read(PREVIEW_BYTES)
          file_or_data.seek(position)
        else
          data = file_or_data
        end
        # IO#read with a length returns nil at end of file; without any bytes
        # there is nothing a detector could work with
        return nil if data.nil? || (data.respond_to?(:empty?) && data.empty?)

        detector_do(options) do |detector|
          if enc = detector.encoding(data, options)
            return enc
          end
        end
        nil
      end

      # @return [Array<String>] List of available methods for encoding
      def encoding_methods
        ENCODING_DETECTORS_AVAIL.map(&:require_name)
      end

      # @return [Array<String>] List of possible methods for encoding (even if its gem is missing)
      def encoding_methods_all
        ENCODING_DETECTORS_ALL.map(&:require_name)
      end

      protected

      ENCODING_DETECTORS_ALL = [ EncodingHolmes, EncodingRChardet, EncodingUChardet ]
      ENCODING_DETECTORS_AVAIL = ENCODING_DETECTORS_ALL.select(&:present?)

      # Run supplied block on detectors
      #
      # When the requested method is unknown or its gem is not loaded, the block
      # is not run at all.
      #
      # @option options [String] :method Only try this method, instead of trying all
      def detector_do(options)
        if options[:method]
          wanted = options[:method].to_s
          detector = ENCODING_DETECTORS_AVAIL.find { |d| d.require_name == wanted }
          # never yield nil: the block calls methods on the detector
          yield detector if detector
        else
          ENCODING_DETECTORS_AVAIL.each do |detector|
            yield detector if detector.present?
          end
        end
      end

    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
begin
|
2
|
+
require 'charlock_holmes'
|
3
|
+
rescue LoadError
|
4
|
+
end
|
5
|
+
|
6
|
+
module ACSV
  module Detect
    # Encoding detector that delegates to ::CharlockHolmes::EncodingDetector
    # when that constant is defined.
    module EncodingHolmes

      # Minimum confidence (0-1) used when the +:confidence+ option is absent
      DEFAULT_CONFIDENCE = 0.01

      # @return [String] name under which this detector can be selected
      def self.require_name
        'charlock_holmes'
      end

      # @return [String, nil] truthy when ::CharlockHolmes::EncodingDetector is defined
      def self.present?
        defined? ::CharlockHolmes::EncodingDetector
      end

      # Detects the encoding of +data+.
      #
      # @param data [String, nil] bytes to probe
      # @param options [Hash] +:confidence+ is the minimum confidence level (0-1)
      # @return [String, nil] detected encoding, or +nil+ when the detector is
      #   missing, there is no data, or the result is not confident enough
      def self.encoding(data, options)
        return nil unless present?
        return nil if data.nil? || data.empty?

        encdet = ::CharlockHolmes::EncodingDetector.detect(data)
        # the detector may come back without a usable result
        return nil unless encdet && encdet[:confidence]

        # the 0-1 threshold is scaled by 100 before comparing; presumably the
        # detector reports confidence on a 0-100 scale -- TODO confirm
        threshold = (options[:confidence] || DEFAULT_CONFIDENCE) * 100
        encdet[:encoding] if encdet[:confidence] > threshold
      end

    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
begin
|
2
|
+
require 'rchardet'
|
3
|
+
rescue LoadError
|
4
|
+
end
|
5
|
+
|
6
|
+
module ACSV
  module Detect
    # Encoding detector that delegates to ::CharDet when that constant is defined.
    module EncodingRChardet

      # Minimum confidence (0-1) used when the +:confidence+ option is absent
      DEFAULT_CONFIDENCE = 0.2

      # @return [String] name under which this detector can be selected
      def self.require_name
        'rchardet'
      end

      # @return [String, nil] truthy when ::CharDet is defined
      def self.present?
        defined? ::CharDet
      end

      # Detects the encoding of +data+.
      #
      # @param data [String, nil] bytes to probe
      # @param options [Hash] +:confidence+ is the minimum confidence level (0-1)
      # @return [String, nil] detected encoding, or +nil+ when the detector is
      #   missing, there is no data, or the result is not confident enough
      def self.encoding(data, options)
        return nil unless present?
        return nil if data.nil? || data.empty?

        encdet = ::CharDet.detect(data)
        # the detector may come back without a usable result
        return nil unless encdet && encdet["confidence"]

        # compared as-is against the 0-1 threshold (no scaling here)
        threshold = options[:confidence] || DEFAULT_CONFIDENCE
        encdet["encoding"] if encdet["confidence"] > threshold
      end

    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
begin
|
2
|
+
require 'uchardet'
|
3
|
+
rescue LoadError
|
4
|
+
end
|
5
|
+
|
6
|
+
module ACSV
  module Detect
    # Encoding detector that delegates to ::ICU::UCharsetDetector when that
    # constant is defined.
    module EncodingUChardet

      # Minimum confidence (0-1) used when the +:confidence+ option is absent
      DEFAULT_CONFIDENCE = 0.01

      # @return [String] name under which this detector can be selected
      def self.require_name
        'uchardet'
      end

      # @return [String, nil] truthy when ::ICU::UCharsetDetector is defined
      def self.present?
        defined? ::ICU::UCharsetDetector
      end

      # Detects the encoding of +data+.
      #
      # @param data [String, nil] bytes to probe
      # @param options [Hash] +:confidence+ is the minimum confidence level (0-1)
      # @return [String, nil] detected encoding, or +nil+ when the detector is
      #   missing, there is no data, or the result is not confident enough
      def self.encoding(data, options)
        return nil unless present?
        return nil if data.nil? || data.empty?

        encdet = ::ICU::UCharsetDetector.detect(data)
        # the detector may come back without a usable result
        return nil unless encdet && encdet[:confidence]

        # the 0-1 threshold is scaled by 100 before comparing; presumably the
        # detector reports confidence on a 0-100 scale -- TODO confirm
        threshold = (options[:confidence] || DEFAULT_CONFIDENCE) * 100
        encdet[:encoding] if encdet[:confidence] > threshold
      end

    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module ACSV
  module Detect

    # Possible CSV separators to check
    SEPARATORS = [",", ";", "\t", "|", "#"]

    # Picks the separator candidate that occurs most often in the first line.
    # When several candidates occur equally often, the one listed first in
    # SEPARATORS wins. For a File, the read position is restored afterwards.
    #
    # @param file_or_data [File, String] CSV file or data to probe
    # @return [String] most probable column separator character from first line, or +nil+ when none found
    #   (this includes empty data and a File positioned at its end)
    # @todo return whichever character returns the same number of columns over multiple lines
    def self.separator(file_or_data)
      if file_or_data.is_a? File
        position = file_or_data.tell
        # gets returns nil at end of file, where readline would raise EOFError
        firstline = file_or_data.gets
        file_or_data.seek(position)
      else
        # splitting an empty String yields no elements, so this may be nil
        firstline = file_or_data.split("\n", 2)[0]
      end
      return nil if firstline.nil?

      best = nil
      best_count = 0
      SEPARATORS.each do |candidate|
        # count in the line's own encoding so the comparison is compatible
        occurrences = firstline.count(candidate.encode(firstline.encoding))
        # strictly greater: on a tie the earlier candidate is kept
        if occurrences > best_count
          best = candidate
          best_count = occurrences
        end
      end
      best && best.encode('ascii')
    end

  end
end
|
data/lib/acsv-p.rb
ADDED
metadata
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: acsv-p
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- wvengen
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-11-24 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rake
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: charlock_holmes
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.7.3
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.7.3
|
55
|
+
description: A wrapper for Ruby's standard CSV class that auto-detects column separator
|
56
|
+
and file encoding.
|
57
|
+
email: dev-rails@willem.engen.nl
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- lib/acsv-p.rb
|
63
|
+
- lib/acsv-p/csv.rb
|
64
|
+
- lib/acsv-p/detect/encoding.rb
|
65
|
+
- lib/acsv-p/detect/encoding_holmes.rb
|
66
|
+
- lib/acsv-p/detect/encoding_rchardet.rb
|
67
|
+
- lib/acsv-p/detect/encoding_uchardet.rb
|
68
|
+
- lib/acsv-p/detect/separator.rb
|
69
|
+
- lib/acsv-p/version.rb
|
70
|
+
homepage: https://github.com/wvengen/ruby-acsv-p
|
71
|
+
licenses:
|
72
|
+
- GPL-3.0+
|
73
|
+
metadata: {}
|
74
|
+
post_install_message:
|
75
|
+
rdoc_options: []
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
requirements: []
|
89
|
+
rubygems_version: 3.0.9
|
90
|
+
signing_key:
|
91
|
+
specification_version: 4
|
92
|
+
summary: Read CSV files without configuration
|
93
|
+
test_files: []
|