charlotte 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +7 -0
  2. data/bin/charlotte +2 -0
  3. data/lib/charlotte.rb +98 -0
  4. metadata +48 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a44e734b24416c3ff9b2b0cc178d15e8d03417e3
4
+ data.tar.gz: 7dc568b7fbb302a23639b5c3f46a76967227a054
5
+ SHA512:
6
+ metadata.gz: 5e2b740400db55f30e1623e14da783d2341a5b8593c07be0de86f296a54b0098d2e18ec3c202dcedb3a4ea39dd36a47060d3c51d69bef28c6f837ced01c7a3a0
7
+ data.tar.gz: cce9165998efa8f618648204e0880830f5fb8cdcee12f418cfb372fc04434277912c8d2efe2d0261f5ebadabcc897e53f1a526158c846fe4f314c8d806dfd5fd
@@ -0,0 +1,2 @@
1
#!/usr/bin/env bash
# Print the Charlotte-detected encoding of every regular file found under the
# paths given as arguments, and time the whole run.
#
# Tracing is enabled with `set -x` rather than a `bash -x` shebang argument:
# Linux hands everything after the interpreter path to it as ONE argument, so
# `env` would look for a program literally named "bash -x" and fail.
set -x

# Let find pass the paths straight to ruby (-exec ... {} +) instead of going
# through an unquoted $(find ...), which word-splits and glob-expands file
# names containing spaces or wildcard characters.
time find "$@" -type f -exec ruby -r charlotte \
  -e 'ARGV.each { |arg| puts arg; puts File.binread(arg).detect_encoding; puts "" }' -- {} +
@@ -0,0 +1,98 @@
1
module Charlotte # (c) Geoff Nixon, 2014. MIT licence.
  #### Charlotte -- Fast and dirty encoding-or-binary detector/auto-converter.
  #### Pronounced "charlet"; rhymes with "chardet". Also, my kid sister's name!
  #
  # Mixin intended for String: every method operates on (and mutates) the
  # receiver through String methods such as force_encoding, sub! and encode!.

  # Adapted from: https://github.com/file/file/blob/master/src/encoding.c:

  # T: Character appears in plain ASCII text.
  # I: Character appears in ISO-8859 text.
  # X: Character appears in extended ASCII text.
  # F: Character never appears in single-byte text.

  #####################################################
  ## \x00 # F F F F F F F T T T T F T T F F # \x0F ##
  ## \x10 # F F F F F F F F F F F T F F F F # \x1F ##
  ## \x20 # T T T T T T T T T T T T T T T T # \x2F ##
  ## \x30 # T T T T T T T T T T T T T T T T # \x3F ##
  ## \x40 # T T T T T T T T T T T T T T T T # \x4F ##
  ## \x50 # T T T T T T T T T T T T T T T T # \x5F ##
  ## \x60 # T T T T T T T T T T T T T T T T # \x6F ##
  ## \x70 # T T T T T T T T T T T T T T T F # \x7F ##
  ## \x80 # X X X X X T X X X X X X X X X X # \x8F ##
  ## \x90 # X X X X X X X X X X X X X X X X # \x9F ##
  ## \xA0 # I I I I I I I I I I I I I I I I # \xAF ##
  ## \xB0 # I I I I I I I I I I I I I I I I # \xBF ##
  ## \xC0 # I I I I I I I I I I I I I I I I # \xCF ##
  ## \xD0 # I I I I I I I I I I I I I I I I # \xDF ##
  ## \xE0 # I I I I I I I I I I I I I I I I # \xEF ##
  ## \xF0 # I I I I I I I I I I I I I I I I # \xFF ##
  #####################################################

  # Byte-order marks. These are anchored with \A (start of string), not ^
  # (start of any line): a BOM only counts at the very first byte, and with ^
  # the same bytes following a newline inside the sample were mistaken for one.
  UTF8HASBOM = /\A\xEF\xBB\xBF/n     # [239, 187, 191]
  UTF32LEBOM = /\A\xFF\xFE\x00\x00/n # [255, 254, 0, 0]
  UTF32BEBOM = /\A\x00\x00\xFE\xFF/n # [0, 0, 254, 255]

  UTF16LEBOM = /\A\xFF\xFE/n         # [255, 254]
  UTF16BEBOM = /\A\xFE\xFF/n         # [254, 255]

  # Bytes marked F in the table above: never seen in single-byte text.
  NOTIN1BYTE = /[\x00-\x06\x0B\x0E-\x1A\x1C-\x1F\x7F]/n
  # The F bytes plus the X bytes (\x80-\x84, \x86-\x9F): not ISO-8859 text.
  NOTISO8859 = /[\x00-\x06\x0B\x0E-\x1A\x1C-\x1F\x7F\x80-\x84\x86-\x9F]/n

  # It is *much* faster simply to read into the "string" with regex than to
  # convert into a byte array/set: http://stackoverflow.com/a/27283992/2351351.

  # Guesses the receiver's encoding and tags the receiver with it in place.
  #
  # The basic premise is just to quickly duck-punch encodings, branching
  # as early as possible on the most likely scenarios. Handles UTF-8/16/32,
  # ISO-8859-1, and other extended-ASCII encodings; long-tail, legacy
  # multibyte encodings are returned as ASCII-8BIT along with binary files.
  #
  # Side effects: changes the receiver's encoding tag and strips a leading
  # UTF-8 BOM from its bytes. UTF-16/32 BOMs are left in place.
  #
  # Returns the receiver.
  def punch_encoding
    force_encoding('BINARY') # Needed to prevent non-matching regex charset.
    sample = self[0..19]     # Keep sample string under 23 bytes.
    sub!(UTF8HASBOM, '') if sample =~ UTF8HASBOM # Strip any UTF-8 BOM.

    # Each condition retags the receiver as a side effect and then validates
    # it, so the first branch that passes leaves the final encoding in place.
    # UTF-32LE must be tried before UTF-16LE: its BOM starts with the same
    # two bytes.
    # See: http://www.daniellesucher.com/2013/07/23/ruby-case-versus-if/
    if sample.ascii_only? && force_encoding('UTF-8').valid_encoding?
      # ASCII-looking head and the whole string is valid UTF-8.
    elsif sample =~ UTF32LEBOM && force_encoding('UTF-32LE').valid_encoding?
    elsif sample =~ UTF32BEBOM && force_encoding('UTF-32BE').valid_encoding?
    elsif sample =~ UTF16LEBOM && force_encoding('UTF-16LE').valid_encoding?
    elsif sample =~ UTF16BEBOM && force_encoding('UTF-16BE').valid_encoding?
    elsif force_encoding('UTF-8').valid_encoding?
      # Non-ASCII head, but still valid UTF-8 throughout.
    elsif force_encoding('BINARY') !~ NOTISO8859
      force_encoding('ISO-8859-1')
    elsif force_encoding('BINARY') !~ NOTIN1BYTE
      force_encoding('Windows-1252')
    else
      force_encoding('BINARY')
    end

    self
  end

  # Detects the receiver's encoding (mutating it, see #punch_encoding) and
  # returns the resulting Encoding object.
  def detect_encoding
    # TODO: Fake Charlock's {:encoding, :language, :ruby_language, :confidence}?
    punch_encoding
    encoding
  end

  alias_method :detected_encoding, :detect_encoding

  # Converts the receiver in place to UTF-8 with "\n" line endings, unless it
  # is detected as binary. Invalid or undefined bytes become a single space.
  #
  # Returns the receiver, or nil when the content was detected as binary (in
  # which case only its encoding tag has been changed).
  def autoencode
    # TODO: Use Ruby 2.1 String#scrub if we're already UTF-8.
    # TODO: Use Ruby 2.2 Unicode normalization.

    return if detected_encoding == Encoding::BINARY

    encode!('UTF-8', invalid: :replace, undef: :replace, replace: ' ')
    encode!(universal_newline: true)
  end

  alias_method :autoencode!, :autoencode
  alias_method :detect_encoding!, :autoencode
end
95
+
96
# Mix Charlotte's detection and conversion methods into every String.
# `send` is used because Module#include is private on older Rubies.
String.send(:include, Charlotte)
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: charlotte
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Geoff Nixon
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-12-12 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A simple (but fast!) character set encoding/binary detector and auto-converter
14
+ for common encodings (UTF-8/16/32, ISO-8859-1, MacRoman, etc.). Extends String with
15
+ String.detect_encoding, String.autoencode.
16
+ email: geoff@geoff.codes
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - bin/charlotte
22
+ - lib/charlotte.rb
23
+ homepage: https://github.com/geoff-codes/charlotte
24
+ licenses:
25
+ - MIT
26
+ metadata: {}
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: 1.9.3
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubyforge_project:
43
+ rubygems_version: 2.4.5
44
+ signing_key:
45
+ specification_version: 4
46
+ summary: Simple, pure Ruby character set encoding detector.
47
+ test_files: []
48
+ has_rdoc: