charlotte 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/charlotte +2 -0
- data/lib/charlotte.rb +98 -0
- metadata +48 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a44e734b24416c3ff9b2b0cc178d15e8d03417e3
|
4
|
+
data.tar.gz: 7dc568b7fbb302a23639b5c3f46a76967227a054
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 5e2b740400db55f30e1623e14da783d2341a5b8593c07be0de86f296a54b0098d2e18ec3c202dcedb3a4ea39dd36a47060d3c51d69bef28c6f837ced01c7a3a0
|
7
|
+
data.tar.gz: cce9165998efa8f618648204e0880830f5fb8cdcee12f418cfb372fc04434277912c8d2efe2d0261f5ebadabcc897e53f1a526158c846fe4f314c8d806dfd5fd
|
data/bin/charlotte
ADDED
data/lib/charlotte.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
module Charlotte # (c) Geoff Nixon, 2014. MIT licence.
  #### Charlotte -- Fast and dirty encoding-or-binary detector/auto-converter.
  #### Pronounced "charlet"; rhymes with "chardet". Also, my kid sister's name!
  #
  # Intended to be mixed into String: every method below calls String instance
  # methods (force_encoding, sub!, encode!, ...) on self and changes the
  # receiver in place, so a frozen receiver will raise.

  # Adapted from: https://github.com/file/file/blob/master/src/encoding.c:

  # T: Character appears in plain ASCII text.
  # I: Character appears in ISO-8859 text.
  # X: Character appears in extended ASCII text.
  # F: Character never appears in single-byte text.

  #####################################################
  ## \x00 # F F F F F F F T T T T F T T F F # \x0F ##
  ## \x10 # F F F F F F F F F F F T F F F F # \x1F ##
  ## \x20 # T T T T T T T T T T T T T T T T # \x2F ##
  ## \x30 # T T T T T T T T T T T T T T T T # \x3F ##
  ## \x40 # T T T T T T T T T T T T T T T T # \x4F ##
  ## \x50 # T T T T T T T T T T T T T T T T # \x5F ##
  ## \x60 # T T T T T T T T T T T T T T T T # \x6F ##
  ## \x70 # T T T T T T T T T T T T T T T F # \x7F ##
  ## \x80 # X X X X X T X X X X X X X X X X # \x8F ##
  ## \x90 # X X X X X X X X X X X X X X X X # \x9F ##
  ## \xA0 # I I I I I I I I I I I I I I I I # \xAF ##
  ## \xB0 # I I I I I I I I I I I I I I I I # \xBF ##
  ## \xC0 # I I I I I I I I I I I I I I I I # \xCF ##
  ## \xD0 # I I I I I I I I I I I I I I I I # \xDF ##
  ## \xE0 # I I I I I I I I I I I I I I I I # \xEF ##
  ## \xF0 # I I I I I I I I I I I I I I I I # \xFF ##
  #####################################################

  # Byte-order marks. Anchored with \A (start of string) rather than ^ (start
  # of any line): a BOM only counts when it is the very first bytes of the
  # data, and ^ would also match the same bytes right after a newline.
  UTF8HASBOM = /\A\xEF\xBB\xBF/n     # [239, 187, 191]
  UTF32LEBOM = /\A\xFF\xFE\x00\x00/n # [255, 254, 0, 0]
  UTF32BEBOM = /\A\x00\x00\xFE\xFF/n # [0, 0, 254, 255]

  UTF16LEBOM = /\A\xFF\xFE/n         # [255, 254]
  UTF16BEBOM = /\A\xFE\xFF/n         # [254, 255]

  # Bytes marked F in the table above (never seen in single-byte text).
  NOTIN1BYTE = /[\x00-\x06\x0B\x0E-\x1A\x1C-\x1F\x7F]/n
  # The F bytes plus the X bytes (everything that rules out ISO-8859 text).
  NOTISO8859 = /[\x00-\x06\x0B\x0E-\x1A\x1C-\x1F\x7F\x80-\x84\x86-\x9F]/n

  # It is *much* faster simply to read into the "string" with regex than to
  # convert into a byte array/set: http://stackoverflow.com/a/27283992/2351351.

  # Guesses the receiver's encoding and tags the receiver with it in place
  # (via force_encoding; the bytes are not transcoded). A leading UTF-8 BOM is
  # removed from the receiver.
  #
  # Returns self, so the result can be chained or inspected with #encoding.
  def punch_encoding
    # The basic premise is just to quickly duck-punch encodings, branching
    # as early as possible on the most likely scenarios. Handles UTF-8/16/32,
    # ISO-8859-1, and other extended-ASCII encodings; long-tail, legacy
    # multibyte encodings are returned as ASCII-8BIT along with binary files.

    force_encoding('BINARY') # Needed to prevent non-matching regex charset.
    sample = self[0..19]     # Keep sample string under 23 bytes.
    sub!(UTF8HASBOM, '') if sample[UTF8HASBOM] # Strip any UTF-8 BOM.

    # Each condition below re-tags self as a side effect of being tested; the
    # first one that holds leaves self tagged with the winning encoding.
    # See: http://www.daniellesucher.com/2013/07/23/ruby-case-versus-if/
    if sample.ascii_only? && force_encoding('UTF-8').valid_encoding?
      # Fast path: ASCII-looking head and the whole string is valid UTF-8.
    elsif sample[UTF32LEBOM] && force_encoding('UTF-32LE').valid_encoding?
      # UTF-32 is tested before UTF-16 because its LE BOM starts with FF FE too.
    elsif sample[UTF32BEBOM] && force_encoding('UTF-32BE').valid_encoding?
    elsif sample[UTF16LEBOM] && force_encoding('UTF-16LE').valid_encoding?
    elsif sample[UTF16BEBOM] && force_encoding('UTF-16BE').valid_encoding?
    elsif force_encoding('UTF-8').valid_encoding?
      # No usable BOM, but the bytes form valid UTF-8.
    elsif force_encoding('BINARY')[NOTISO8859].nil?
      force_encoding('ISO-8859-1')
    elsif force_encoding('BINARY')[NOTIN1BYTE].nil?
      force_encoding('Windows-1252')
    else
      force_encoding('BINARY')
    end

    self
  end

  # Runs punch_encoding on the receiver and returns the resulting Encoding
  # object (Encoding::BINARY when nothing else fits).
  def detect_encoding
    # TODO: Fake Charlock's {:encoding, :language, :ruby_language, :confidence}?
    punch_encoding
    encoding
  end

  alias_method :detected_encoding, :detect_encoding

  # Detects the receiver's encoding and, unless it looks binary, transcodes
  # the receiver in place to UTF-8 (invalid/undefined characters become a
  # space) and then applies universal newline conversion.
  #
  # Returns self after conversion, or nil when the data was judged binary
  # (in which case the bytes are left alone, tagged as BINARY).
  #
  # NOTE(review): only a UTF-8 BOM is stripped during detection; a UTF-16/32
  # BOM appears to survive transcoding as a leading U+FEFF -- confirm whether
  # callers want it removed.
  def autoencode
    # TODO: Use Ruby 2.1 String#scrub if we're already UTF-8.
    # TODO: Use Ruby 2.2 Unicode normalization.

    unless detected_encoding == Encoding::BINARY
      encode!('UTF-8', invalid: :replace, undef: :replace,
                       replace: ' ').encode!(universal_newline: true)
    end
  end

  alias_method :autoencode!, :autoencode
  alias_method :detect_encoding!, :autoencode
end
|
95
|
+
|
96
|
+
# Mix Charlotte's detection/conversion methods into every String.
# `send` is used because Module#include is private on older Rubies.
String.send(:include, Charlotte)
|
metadata
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: charlotte
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Geoff Nixon
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-12-12 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A simple (but fast!) character set encoding/binary detector and auto-converter
|
14
|
+
for common encodings (UTF-8/16/32, ISO-8859-1, MacRoman, etc.). Extends String with
|
15
|
+
String.detect_encoding, String.autoencode.
|
16
|
+
email: geoff@geoff.codes
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- bin/charlotte
|
22
|
+
- lib/charlotte.rb
|
23
|
+
homepage: https://github.com/geoff-codes/charlotte
|
24
|
+
licenses:
|
25
|
+
- MIT
|
26
|
+
metadata: {}
|
27
|
+
post_install_message:
|
28
|
+
rdoc_options: []
|
29
|
+
require_paths:
|
30
|
+
- lib
|
31
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: 1.9.3
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
requirements: []
|
42
|
+
rubyforge_project:
|
43
|
+
rubygems_version: 2.4.5
|
44
|
+
signing_key:
|
45
|
+
specification_version: 4
|
46
|
+
summary: Simple, pure Ruby character set encoding detector.
|
47
|
+
test_files: []
|
48
|
+
has_rdoc:
|