charlotte 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. checksums.yaml +7 -0
  2. data/bin/charlotte +2 -0
  3. data/lib/charlotte.rb +98 -0
  4. metadata +48 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a44e734b24416c3ff9b2b0cc178d15e8d03417e3
4
+ data.tar.gz: 7dc568b7fbb302a23639b5c3f46a76967227a054
5
+ SHA512:
6
+ metadata.gz: 5e2b740400db55f30e1623e14da783d2341a5b8593c07be0de86f296a54b0098d2e18ec3c202dcedb3a4ea39dd36a47060d3c51d69bef28c6f837ced01c7a3a0
7
+ data.tar.gz: cce9165998efa8f618648204e0880830f5fb8cdcee12f418cfb372fc04434277912c8d2efe2d0261f5ebadabcc897e53f1a526158c846fe4f314c8d806dfd5fd
@@ -0,0 +1,2 @@
1
#!/usr/bin/env bash
# Print the Charlotte-detected encoding of every regular file found under the
# paths given on the command line, and time the whole run.
#
# Tracing is enabled with `set -x` rather than `#!/usr/bin/env bash -x`:
# Linux hands everything after the interpreter path to env as ONE argument,
# so env would look for a program literally named "bash -x" and fail.
set -x

# `find -exec ... {} +` passes each path to ruby as its own argument, so file
# names containing spaces, newlines or glob characters survive intact (the
# previous unquoted $(find ...) substitution word-split and glob-expanded them).
time find "$@" -type f -exec ruby -r "charlotte" -e 'ARGV.each { |arg| puts arg; puts File.binread(arg).detect_encoding; puts "" }' -- {} +
@@ -0,0 +1,98 @@
1
module Charlotte # (c) Geoff Nixon, 2014. MIT licence.
  #### Charlotte -- Fast and dirty encoding-or-binary detector/auto-converter.
  #### Pronounced "charlet"; rhymes with "chardet". Also, my kid sister's name!
  #
  # The module is mixed into String at the bottom of this file; every method
  # below operates on, and mutates, the receiving string.

  # Adapted from: https://github.com/file/file/blob/master/src/encoding.c:

  # T: Character appears in plain ASCII text.
  # I: Character appears in ISO-8859 text.
  # X: Character appears in extended ASCII text.
  # F: Character never appears in single-byte text.

  #####################################################
  ## \x00 # F F F F F F F T T T T F T T F F # \x0F ##
  ## \x10 # F F F F F F F F F F F T F F F F # \x1F ##
  ## \x20 # T T T T T T T T T T T T T T T T # \x2F ##
  ## \x30 # T T T T T T T T T T T T T T T T # \x3F ##
  ## \x40 # T T T T T T T T T T T T T T T T # \x4F ##
  ## \x50 # T T T T T T T T T T T T T T T T # \x5F ##
  ## \x60 # T T T T T T T T T T T T T T T T # \x6F ##
  ## \x70 # T T T T T T T T T T T T T T T F # \x7F ##
  ## \x80 # X X X X X T X X X X X X X X X X # \x8F ##
  ## \x90 # X X X X X X X X X X X X X X X X # \x9F ##
  ## \xA0 # I I I I I I I I I I I I I I I I # \xAF ##
  ## \xB0 # I I I I I I I I I I I I I I I I # \xBF ##
  ## \xC0 # I I I I I I I I I I I I I I I I # \xCF ##
  ## \xD0 # I I I I I I I I I I I I I I I I # \xDF ##
  ## \xE0 # I I I I I I I I I I I I I I I I # \xEF ##
  ## \xF0 # I I I I I I I I I I I I I I I I # \xFF ##
  #####################################################

  # Byte-order marks. These are anchored with \A (start of string), not ^
  # (start of any line): with ^, BOM-like bytes that merely follow a newline
  # were treated as a real BOM, which stripped bytes out of the middle of the
  # data or mislabelled it as UTF-16/32.
  UTF8HASBOM = /\A\xEF\xBB\xBF/n     # [239, 187, 191]
  UTF32LEBOM = /\A\xFF\xFE\x00\x00/n # [255, 254, 0, 0]
  UTF32BEBOM = /\A\x00\x00\xFE\xFF/n # [0, 0, 254, 255]

  UTF16LEBOM = /\A\xFF\xFE/n         # [255, 254]
  UTF16BEBOM = /\A\xFE\xFF/n         # [254, 255]

  # Every byte marked F in the table above.
  NOTIN1BYTE = /[\x00-\x06\x0B\x0E-\x1A\x1C-\x1F\x7F]/n
  # Every byte marked F or X in the table above (\x85 is T and so is allowed).
  NOTISO8859 = /[\x00-\x06\x0B\x0E-\x1A\x1C-\x1F\x7F\x80-\x84\x86-\x9F]/n

  # It is *much* faster simply to read into the "string" with regex than to
  # convert into a byte array/set: http://stackoverflow.com/a/27283992/2351351.

  # Tags the receiver, in place, with the first encoding that fits.
  #
  # The basic premise is just to quickly duck-punch encodings, branching
  # as early as possible on the most likely scenarios. Handles UTF-8/16/32,
  # ISO-8859-1, and other extended-ASCII encodings; long-tail, legacy
  # multibyte encodings are returned as ASCII-8BIT along with binary files.
  #
  # Side effects: changes the receiver's encoding and removes a leading UTF-8
  # BOM if one is present. The return value is not meaningful (it is nil in
  # several branches); read String#encoding afterwards, or call
  # #detect_encoding instead.
  #
  # NOTE(review): a string whose bytes are all below 0x80 is tagged UTF-8 even
  # when it contains F-class control bytes such as NUL, because the control
  # byte classes are only consulted after UTF-8 validation has failed.
  def punch_encoding
    force_encoding('BINARY') # Needed to prevent non-matching regex charset.
    sample = self[0..19]     # Keep sample string under 23 bytes.
    sub!(UTF8HASBOM, '') if sample[UTF8HASBOM] # Strip any UTF-8 BOM.

    # See: http://www.daniellesucher.com/2013/07/23/ruby-case-versus-if/
    # Each condition re-tags the receiver as a side effect; the first one that
    # holds leaves the right encoding in place, so most branches are empty.
    if sample.ascii_only? && force_encoding('UTF-8').valid_encoding?
      # 7-bit head and valid UTF-8 throughout: stays UTF-8.
    elsif sample[UTF32LEBOM] && force_encoding('UTF-32LE').valid_encoding?
      # UTF-32 is tested before UTF-16 because FF FE is a prefix of FF FE 00 00.
    elsif sample[UTF32BEBOM] && force_encoding('UTF-32BE').valid_encoding?
    elsif sample[UTF16LEBOM] && force_encoding('UTF-16LE').valid_encoding?
    elsif sample[UTF16BEBOM] && force_encoding('UTF-16BE').valid_encoding?

    elsif force_encoding('UTF-8').valid_encoding?
      # No BOM, but the whole string is valid UTF-8.
    elsif force_encoding('BINARY')[NOTISO8859].nil?
      # Only T and I bytes present.
      force_encoding('ISO-8859-1')

    elsif force_encoding('BINARY')[NOTIN1BYTE].nil?
      # X bytes present but no F bytes: some extended-ASCII code page.
      force_encoding('Windows-1252')

    else
      force_encoding('BINARY')
    end
  end

  # Detects the receiver's encoding and returns it as an Encoding object.
  # Mutates the receiver exactly as #punch_encoding does.
  def detect_encoding
    # TODO: Fake Charlock's {:encoding, :language, :ruby_language, :confidence}?
    punch_encoding
    encoding
  end

  alias_method :detected_encoding, :detect_encoding

  # Detects the receiver's encoding and, unless it is binary, transcodes it in
  # place to UTF-8 with universal newlines. Invalid and undefined characters
  # are replaced by a single space.
  #
  # Returns the receiver after transcoding, or nil (leaving the bytes as they
  # are, tagged BINARY) when the content was detected as binary.
  def autoencode
    # TODO: Use Ruby 2.1 String#scrub if we're already UTF-8.
    # TODO: Use Ruby 2.2 Unicode normalization.

    return if detected_encoding == Encoding::BINARY

    encode!('UTF-8', invalid: :replace, undef: :replace, replace: ' ')
    encode!(universal_newline: true)
  end

  alias_method :autoencode!, :autoencode
  alias_method :detect_encoding!, :autoencode
end

# Expose the detector on every String instance.
class String
  include Charlotte
end
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: charlotte
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Geoff Nixon
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-12-12 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A simple (but fast!) character set encoding/binary detector and auto-converter
14
+ for common encodings (UTF-8/16/32, ISO-8859-1, MacRoman, etc.). Extends String with
15
+ String.detect_encoding, String.autoencode.
16
+ email: geoff@geoff.codes
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - bin/charlotte
22
+ - lib/charlotte.rb
23
+ homepage: https://github.com/geoff-codes/charlotte
24
+ licenses:
25
+ - MIT
26
+ metadata: {}
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: 1.9.3
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubyforge_project:
43
+ rubygems_version: 2.4.5
44
+ signing_key:
45
+ specification_version: 4
46
+ summary: Simple, pure Ruby character set encoding detector.
47
+ test_files: []
48
+ has_rdoc: