charlotte 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/charlotte +2 -0
- data/lib/charlotte.rb +98 -0
- metadata +48 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a44e734b24416c3ff9b2b0cc178d15e8d03417e3
|
4
|
+
data.tar.gz: 7dc568b7fbb302a23639b5c3f46a76967227a054
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 5e2b740400db55f30e1623e14da783d2341a5b8593c07be0de86f296a54b0098d2e18ec3c202dcedb3a4ea39dd36a47060d3c51d69bef28c6f837ced01c7a3a0
|
7
|
+
data.tar.gz: cce9165998efa8f618648204e0880830f5fb8cdcee12f418cfb372fc04434277912c8d2efe2d0261f5ebadabcc897e53f1a526158c846fe4f314c8d806dfd5fd
|
data/bin/charlotte
ADDED
data/lib/charlotte.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
module Charlotte # (c) Geoff Nixon, 2014. MIT licence.
  #### Charlotte -- Fast and dirty encoding-or-binary detector/auto-converter.
  #### Pronounced "charlet"; rhymes with "chardet". Also, my kid sister's name!

  # Adapted from: https://github.com/file/file/blob/master/src/encoding.c:

  #   T: Character appears in plain ASCII text.
  #   I: Character appears in ISO-8859 text.
  #   X: Character appears in extended ASCII text.
  #   F: Character never appears in single-byte text.

  #####################################################
  ## \x00 # F F F F F F F T T T T F T T F F # \x0F ##
  ## \x10 # F F F F F F F F F F F T F F F F # \x1F ##
  ## \x20 # T T T T T T T T T T T T T T T T # \x2F ##
  ## \x30 # T T T T T T T T T T T T T T T T # \x3F ##
  ## \x40 # T T T T T T T T T T T T T T T T # \x4F ##
  ## \x50 # T T T T T T T T T T T T T T T T # \x5F ##
  ## \x60 # T T T T T T T T T T T T T T T T # \x6F ##
  ## \x70 # T T T T T T T T T T T T T T T F # \x7F ##
  ## \x80 # X X X X X T X X X X X X X X X X # \x8F ##
  ## \x90 # X X X X X X X X X X X X X X X X # \x9F ##
  ## \xA0 # I I I I I I I I I I I I I I I I # \xAF ##
  ## \xB0 # I I I I I I I I I I I I I I I I # \xBF ##
  ## \xC0 # I I I I I I I I I I I I I I I I # \xCF ##
  ## \xD0 # I I I I I I I I I I I I I I I I # \xDF ##
  ## \xE0 # I I I I I I I I I I I I I I I I # \xEF ##
  ## \xF0 # I I I I I I I I I I I I I I I I # \xFF ##
  #####################################################

  # Byte-order marks. These are anchored with \A (start of string) rather than
  # ^ (start of *any line*): a BOM only counts as a BOM in the very first
  # bytes. With ^, the same byte sequence following a newline would be
  # mistaken for a BOM (or, for UTF-8, silently deleted from the text).
  UTF8HASBOM = /\A\xEF\xBB\xBF/n     # [239, 187, 191]
  UTF32LEBOM = /\A\xFF\xFE\x00\x00/n # [255, 254, 0, 0]
  UTF32BEBOM = /\A\x00\x00\xFE\xFF/n # [0, 0, 254, 255]

  UTF16LEBOM = /\A\xFF\xFE/n         # [255, 254]
  UTF16BEBOM = /\A\xFE\xFF/n         # [254, 255]

  # Bytes marked F in the table above (never seen in single-byte text).
  NOTIN1BYTE = /[\x00-\x06\x0B\x0E-\x1A\x1C-\x1F\x7F]/n
  # The F bytes plus the X bytes (everything in \x80-\x9F except \x85).
  NOTISO8859 = /[\x00-\x06\x0B\x0E-\x1A\x1C-\x1F\x7F\x80-\x84\x86-\x9F]/n

  # It is *much* faster simply to read into the "string" with regex than to
  # convert into a byte array/set: http://stackoverflow.com/a/27283992/2351351.

  # Guesses the receiver's encoding and tags it in place via force_encoding.
  #
  # The basic premise is just to quickly duck-punch encodings, branching
  # as early as possible on the most likely scenarios. Handles UTF-8/16/32,
  # ISO-8859-1, and other extended-ASCII encodings; long-tail, legacy
  # multibyte encodings are returned as ASCII-8BIT along with binary files.
  #
  # Mutates the receiver: its encoding tag is changed, and a leading UTF-8
  # BOM is removed. The bytes are otherwise left untouched. The return value
  # is incidental (nil for the UTF branches); call #encoding afterwards.
  def punch_encoding
    force_encoding('BINARY') # Needed to prevent non-matching regex charset.
    sample = self[0..19]     # Keep sample string under 23 bytes.
    self.sub!(UTF8HASBOM, '') if sample[UTF8HASBOM] # Strip any UTF-8 BOM.

    # See: http://www.daniellesucher.com/2013/07/23/ruby-case-versus-if/
    # Each condition re-tags the receiver as a side effect; the first one
    # that yields a valid encoding wins, so the branch bodies stay empty.
    if sample.ascii_only? && force_encoding('UTF-8').valid_encoding?

    # UTF-32 is tested before UTF-16 because the UTF-32LE BOM begins with
    # the UTF-16LE BOM.
    elsif sample[UTF32LEBOM] && force_encoding('UTF-32LE').valid_encoding?
    elsif sample[UTF32BEBOM] && force_encoding('UTF-32BE').valid_encoding?
    elsif sample[UTF16LEBOM] && force_encoding('UTF-16LE').valid_encoding?
    elsif sample[UTF16BEBOM] && force_encoding('UTF-16BE').valid_encoding?

    elsif force_encoding('UTF-8').valid_encoding?

    # No control bytes and nothing in \x80-\x9F (bar \x85): call it Latin-1.
    elsif force_encoding('BINARY')[NOTISO8859].nil?
      force_encoding('ISO-8859-1')

    # No control bytes, but \x80-\x9F is in use: some extended ASCII.
    elsif force_encoding('BINARY')[NOTIN1BYTE].nil?
      force_encoding('Windows-1252')

    else force_encoding('BINARY')
    end
  end

  # Detects the receiver's encoding (tagging the receiver in place, see
  # #punch_encoding) and returns the resulting Encoding object.
  def detect_encoding
    # TODO: Fake Charlock's {:encoding, :language, :ruby_language, :confidence}?
    punch_encoding
    encoding
  end

  alias_method :detected_encoding, :detect_encoding

  # Detects the receiver's encoding and, unless it looks binary, transcodes
  # it in place to UTF-8 (invalid/undefined sequences become a space) and
  # then applies universal-newline conversion.
  #
  # Returns the receiver when it was converted, or nil when the content was
  # classified as binary (in which case only the encoding tag was changed).
  def autoencode
    # TODO: Use Ruby 2.1 String#scrub if we're already UTF-8.
    # TODO: Use Ruby 2.2 Unicode normalization.

    unless detected_encoding == Encoding::BINARY
      self.encode!('UTF-8', invalid: :replace, undef: :replace,
                            replace: ' ').encode!(universal_newline: true)
    end
  end

  alias_method :autoencode!, :autoencode
  alias_method :detect_encoding!, :autoencode
end
|
95
|
+
|
96
|
+
# Mix the detector into every String so that any string responds to
# #detect_encoding, #autoencode and their aliases.
String.send(:include, Charlotte)
|
metadata
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: charlotte
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Geoff Nixon
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-12-12 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A simple (but fast!) character set encoding/binary detector and auto-converter
|
14
|
+
for common encodings (UTF-8/16/32, ISO-8859-1, MacRoman, etc.). Extends String with
|
15
|
+
String.detect_encoding, String.autoencode.
|
16
|
+
email: geoff@geoff.codes
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- bin/charlotte
|
22
|
+
- lib/charlotte.rb
|
23
|
+
homepage: https://github.com/geoff-codes/charlotte
|
24
|
+
licenses:
|
25
|
+
- MIT
|
26
|
+
metadata: {}
|
27
|
+
post_install_message:
|
28
|
+
rdoc_options: []
|
29
|
+
require_paths:
|
30
|
+
- lib
|
31
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: 1.9.3
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
requirements: []
|
42
|
+
rubyforge_project:
|
43
|
+
rubygems_version: 2.4.5
|
44
|
+
signing_key:
|
45
|
+
specification_version: 4
|
46
|
+
summary: Simple, pure Ruby character set encoding detector.
|
47
|
+
test_files: []
|
48
|
+
has_rdoc:
|