cmess 0.0.4.136
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +676 -0
- data/ChangeLog +6 -0
- data/README +53 -0
- data/Rakefile +30 -0
- data/bin/cinderella +186 -0
- data/bin/decode_entities +101 -0
- data/bin/guess_encoding +183 -0
- data/data/csets/iso_8859-1.yaml +195 -0
- data/data/csets/iso_8859-15.yaml +204 -0
- data/data/csets/latin1.yaml +195 -0
- data/data/csets/unicode/basic_latin.yaml +97 -0
- data/data/csets/unicode/cyrillic-supplement.yaml +17 -0
- data/data/csets/unicode/cyrillic.yaml +256 -0
- data/data/csets/unicode/greek.yaml +129 -0
- data/data/csets/unicode/ipa_extensions.yaml +97 -0
- data/data/csets/unicode/latin-extended-c.yaml +18 -0
- data/data/csets/unicode/latin-extended-d.yaml +3 -0
- data/data/csets/unicode/latin_1_supplement.yaml +128 -0
- data/data/csets/unicode/latin_extended_a.yaml +129 -0
- data/data/csets/unicode/latin_extended_additional.yaml +247 -0
- data/data/csets/unicode/latin_extended_b.yaml +209 -0
- data/data/csets/unicode/letterlike_symbols.yaml +80 -0
- data/data/csets/unicode/spacing_modifier_letters.yaml +81 -0
- data/data/csets/utf-8.yaml +1504 -0
- data/data/csets/utf8.yaml +1504 -0
- data/example/crop +127 -0
- data/example/crop_repaired +127 -0
- data/example/empty6-slash.txt +1495 -0
- data/example/empty6-slash_repaired.txt +1495 -0
- data/example/pot +1368 -0
- data/lib/cmess.rb +44 -0
- data/lib/cmess/cinderella.rb +63 -0
- data/lib/cmess/cli.rb +79 -0
- data/lib/cmess/decode_entities.rb +68 -0
- data/lib/cmess/guess_encoding.rb +372 -0
- data/lib/cmess/version.rb +51 -0
- metadata +119 -0
data/lib/cmess.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# cmess -- Assist with handling messed up encodings #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
# Bundles several tools that aim at dealing with various problems occurring in
|
30
|
+
# the context of character sets and encodings. Currently, there are:
|
31
|
+
#
|
32
|
+
# guess_encoding:: Simple helper to identify the encoding of a given string.
|
33
|
+
# Includes the ability to automatically detect the encoding
|
34
|
+
# of an input. (see GuessEncoding)
|
35
|
+
# cinderella:: When characters are "double encoded", you can't easily
|
36
|
+
# convert them back -- this is where cinderella comes in,
|
37
|
+
# sorting the good ones into the pot and the (potentially)
|
38
|
+
# bad ones into the crop... (see Cinderella)
|
39
|
+
# decode_entities:: Decode HTML entities in a string. (see DecodeEntities)
|
40
|
+
|
41
|
+
# Top-level namespace shared by all cmess tools; the individual tool files
# attach their own modules to it.
module CMess; end
|
43
|
+
|
44
|
+
require 'cmess/version'
|
@@ -0,0 +1,63 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'iconv'
|
30
|
+
|
31
|
+
# Find (and possibly repair) doubly encoded characters. Here's how it's done:
|
32
|
+
#
|
33
|
+
# Treats characters encoded in target encoding as if they were encoded in
|
34
|
+
# source encoding, converts them to target encoding and "grep"s for lines
|
35
|
+
# containing those doubly encoded characters; if asked to repair doubly
|
36
|
+
# encoded characters, substitutes them with their original character.
|
37
|
+
|
38
|
+
module CMess::Cinderella

  extend self

  # our version ;-)
  VERSION = '0.0.3'

  # Sorts the lines of +input+ into two outputs.
  #
  # Each character in +chars+ is run through an Iconv converter from
  # +source_encoding+ to +target_encoding+; the converted forms are what a
  # "doubly encoded" character looks like. Lines containing any of those forms
  # are written to +crop+, all other lines to +pot+. Either output may be nil
  # (or false), in which case the corresponding lines are dropped.
  #
  # input::           object yielding lines via +each_line+ or +each+
  # pot::             receives lines without doubly encoded characters (+puts+)
  # crop::            receives lines with doubly encoded characters (+puts+)
  # source_encoding:: encoding name handed to Iconv as the "from" encoding
  # target_encoding:: encoding name handed to Iconv as the "to" encoding
  # chars::           enumerable of characters to look for
  # repair::          if true, doubly encoded forms are replaced by their
  #                   original character before the line is written
  def pick(input, pot, crop, source_encoding, target_encoding, chars, repair = false)
    iconv = Iconv.new(target_encoding, source_encoding)

    # doubly encoded form => original character
    encoded = chars.inject({}) { |hash, char|
      hash.update(iconv.iconv(char) => char)
    }

    regexp = Regexp.union(*encoded.keys)

    # Strings lost #each in Ruby 1.9 while Arrays never had #each_line, so
    # pick whichever line iterator the input actually offers.
    iterator = input.respond_to?(:each_line) ? :each_line : :each

    input.send(iterator) { |line|
      out = line =~ regexp ? crop : pot
      next unless out

      # Work on a copy: the lines belong to the caller and may be frozen.
      line = line.gsub(regexp) { |m| encoded[m] } if repair

      out.puts(line)
    }
  end

end
|
data/lib/cmess/cli.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
# Helpers shared by the command line front-ends.
module CMess::CLI

  # Aborts the program unless +file+ is readable.
  def ensure_readable(file)
    abort "Can't find input file: #{file}" unless File.readable?(file)
  end

  # Aborts the program unless +dir+ is an existing directory.
  def ensure_directory(dir)
    abort "Directory not found: #{dir}" unless File.directory?(dir)
  end

  # Returns a pair: the current lines of +file+ and +file+ re-opened for
  # writing. The lines are read *before* the file is opened with mode 'w',
  # because opening it truncates the file.
  def open_file_in_place(file)
    ensure_readable(file)
    [File.readlines(file), File.open(file, 'w')]
  end

  # Opens +file+ with the given +mode+. The special name '-' maps to a
  # standard stream instead: 'r' => STDIN, 'w' => STDOUT, 'a' => STDERR.
  #
  # Raises ArgumentError for '-' combined with any other mode. For real files,
  # aborts unless the file is readable (except when +mode+ is 'w').
  def open_file_or_std(file, mode = 'r')
    if file == '-'
      # 'then' instead of the 1.8-only 'when x: y' form, which no longer parses
      case mode
        when 'r' then STDIN
        when 'w' then STDOUT
        when 'a' then STDERR
        else raise ArgumentError, "don't know how to handle mode '#{mode}'"
      end
    else
      ensure_readable(file) unless mode == 'w'
      File.open(file, mode)
    end
  end

  # Returns the system's encoding: the SYSTEM_ENCODING environment variable
  # if set, else the codeset part of LANG (the text between '.' and an
  # optional '@modifier'), else the object returned by
  # #system_encoding_not_found.
  def determine_system_encoding
    ENV['SYSTEM_ENCODING'] ||
    # LANG may be unset (nil) and may carry a modifier, e.g. "de_DE.ISO-8859-15@euro"
    ENV['LANG'].to_s[/\.([^@]+)/, 1] ||
    system_encoding_not_found
  end

  # Returns a lambda that aborts with an explanatory message when called and
  # that renders as 'NOT FOUND' via +to_s+.
  def system_encoding_not_found
    not_found = lambda {
      abort <<-EOT
Your system's encoding couldn't be determined automatically -- please specify it
explicitly via the SYSTEM_ENCODING environment variable or via the '-t' option.
      EOT
    }

    def not_found.to_s
      'NOT FOUND'
    end

    not_found
  end

end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'iconv'
|
30
|
+
|
31
|
+
require 'rubygems'
|
32
|
+
require 'htmlentities/string'
|
33
|
+
|
34
|
+
# Decodes HTML entities line by line, converting through UTF-8 on the way.
module CMess::DecodeEntities

  extend self

  # our version ;-)
  VERSION = '0.0.2'

  # HTMLEntities requires UTF-8
  INTERMEDIATE_ENCODING = 'utf-8'

  # Stand-in converter used when no conversion is needed: its +iconv+ hands
  # the string back untouched.
  ICONV_DUMMY = Object.new

  def ICONV_DUMMY.iconv(string)
    string
  end

  # Reads lines from +input+, decodes their HTML entities and writes them to
  # +output+ via +puts+.
  #
  # Each line is converted from +source_encoding+ to the intermediate
  # encoding, has +decode_entities+ applied, and is converted to
  # +target_encoding+ (which defaults to +source_encoding+). A conversion
  # step is skipped when the encoding equals INTERMEDIATE_ENCODING.
  def decode(input, output, source_encoding, target_encoding = nil)
    target_encoding ||= source_encoding

    iconv_in =
      if source_encoding == INTERMEDIATE_ENCODING
        ICONV_DUMMY
      else
        Iconv.new(INTERMEDIATE_ENCODING, source_encoding)
      end

    iconv_out =
      if target_encoding == INTERMEDIATE_ENCODING
        ICONV_DUMMY
      else
        Iconv.new(target_encoding, INTERMEDIATE_ENCODING)
      end

    input.each { |line|
      intermediate = iconv_in.iconv(line)
      decoded      = intermediate.decode_entities

      output.puts iconv_out.iconv(decoded)
    }
  end

end
|
@@ -0,0 +1,372 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# Contributors: #
|
14
|
+
# John Vorhauer <john@vorhauer.de> (idea and original implementation #
|
15
|
+
# for automatic encoding detection) #
|
16
|
+
# #
|
17
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
18
|
+
# terms of the GNU General Public License as published by the Free Software #
|
19
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
20
|
+
# version. #
|
21
|
+
# #
|
22
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
23
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
24
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
25
|
+
# details. #
|
26
|
+
# #
|
27
|
+
# You should have received a copy of the GNU General Public License along #
|
28
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
29
|
+
# #
|
30
|
+
###############################################################################
|
31
|
+
#++
|
32
|
+
|
33
|
+
require 'forwardable'
require 'iconv'
|
34
|
+
|
35
|
+
# Outputs given string (or line), being encoded in target encoding, encoded in
|
36
|
+
# various test encodings, thus allowing to identify the (seemingly) correct
|
37
|
+
# encoding by visually comparing the input string with its desired appearance.
|
38
|
+
#
|
39
|
+
# In addition to that manual procedure, may be used to detect the encoding
|
40
|
+
# automatically. Actually works pretty well -- for the supported encodings
|
41
|
+
# (see Automatic for details).
|
42
|
+
|
43
|
+
module CMess::GuessEncoding
|
44
|
+
|
45
|
+
# our version ;-)
|
46
|
+
VERSION = '0.0.5'
|
47
|
+
|
48
|
+
# Namespace for our encodings.
|
49
|
+
# Namespace for our encodings: one frozen string constant per encoding name.
module Encoding

  names = %w[
    UNKNOWN ASCII MACINTOSH
    ISO-8859-1 ISO-8859-2 ISO-8859-15
    CP1250 CP1251 CP1252 CP850 CP852 CP856
    UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
    UTF-7 UTF-EBCDIC SCSU BOCU-1
    ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
  ]

  names.each do |name|
    # Constant name: dashes become underscores, any other non-word
    # character is dropped (e.g. "ANSI_X3.4" => ANSI_X34).
    identifier = name.tr('-', '_').gsub(/\W/, '')

    const_set(identifier, name.freeze)
  end

end
|
64
|
+
|
65
|
+
# Manual guessing: prints the input converted from a list of candidate
# encodings so the right one can be picked by eye.
module Manual

  extend self

  include Encoding

  # default encodings to try
  ENCODINGS = [
    ISO_8859_1, ISO_8859_2, ISO_8859_15,
    CP1250, CP1251, CP1252,
    CP850, CP852, CP856,
    UTF_8
  ]

  # likely candidates to suggest to the user
  CANDIDATES = [
    ANSI_X34,
    EBCDIC_AT_DE, EBCDIC_US,
    EUC_JP,
    KOI_8,
    MACINTOSH,
    MS_ANSI,
    SHIFT_JIS,
    UTF_7,
    UTF_16, UTF_16BE, UTF_16LE,
    UTF_32, UTF_32BE, UTF_32LE
  ]

  # Prints one line per encoding: the encoding's name followed by +input+
  # converted from that encoding to +target_encoding+.
  #
  # +encodings+ defaults to ENCODINGS; +additional_encodings+ are appended.
  # Duplicates are dropped (keeping the last occurrence) and the target
  # encoding is always listed first. Conversion problems are shown in place
  # of the converted text, except that an invalid target encoding aborts.
  def display(input, target_encoding, encodings = nil, additional_encodings = [])
    names = (encodings || ENCODINGS) + additional_encodings

    # uniq from the right, so that additional encodings stay at the end
    names = names.reverse.uniq.reverse

    # move target encoding to front
    names.delete(target_encoding)
    names.unshift(target_encoding)

    width    = names.map { |name| name.length }.max
    template = "%-#{width}s : %s"

    names.each { |name|
      result = begin
        Iconv.conv(target_encoding, name, input)
      rescue Iconv::IllegalSequence => err
        "ILLEGAL INPUT SEQUENCE: #{err}"
      rescue Iconv::InvalidEncoding
        abort "Invalid encoding: #{name}" if name == target_encoding

        "INVALID ENCODING!"
      end

      puts template % [name, result]
    }
  end

end
|
132
|
+
|
133
|
+
# Tries to detect the encoding of a given input by applying several
|
134
|
+
# heuristics to determine the <b>most likely</b> candidate. If no heuristic
|
135
|
+
# catches on, resorts to Encoding::UNKNOWN.
|
136
|
+
#
|
137
|
+
# If a BOM is found, it may determine the encoding directly.
|
138
|
+
# Guesses the encoding of an input stream.
#
# Heuristics are registered at class level, at the bottom of this class, via
# the private class methods +encoding+, +encodings+ and +bom_encoding+. Each
# heuristic is a block that is evaluated in the context of an instance and
# returns an encoding name, or something falsy when it doesn't apply.
#
# +input+ must respond to +read+, +eof?+ and +rewind+.
class Automatic

  extend Forwardable

  # Instances consult the class-level registries through @klass.
  def_delegators :@klass, :encoding_guessers, :supported_encoding?,
                          :bom_guessers,      :supported_bom?

  include Encoding

  @supported_encodings = []
  @encoding_guessers   = []
  @supported_boms      = []
  @bom_guessers        = []

  class << self

    attr_reader :supported_encodings, :encoding_guessers,
                :supported_boms,      :bom_guessers

    # Convenience wrapper: guesses the encoding of +input+, reading it in
    # chunks of +chunk_size+ bytes (nil: all at once). See #guess.
    def guess(input, chunk_size = nil, ignore_bom = false)
      new(input, chunk_size).guess(ignore_bom)
    end

    # Whether +encoding+ has been registered via +encoding+/+encodings+.
    # Public, because instances delegate to it.
    def supported_encoding?(encoding)
      supported_encodings.include?(encoding)
    end

    # Whether +encoding+ has been registered via +bom_encoding+.
    # Public, because instances delegate to it.
    def supported_bom?(encoding)
      supported_boms.include?(encoding)
    end

    private

    # Registers a heuristic for a single +encoding+: the encoding is
    # suggested whenever +condition_block+ evaluates to true.
    def encoding(encoding, &condition_block)
      # A plain block (not a lambda): instance_eval passes the receiver as
      # an argument, which a zero-arity lambda would reject.
      encodings(encoding) {
        encoding if instance_eval(&condition_block)
      }
    end

    # Registers a heuristic that may suggest any one of +encodings+;
    # +encoding_block+ has to return the encoding it settled on.
    def encodings(*encodings, &encoding_block)
      @supported_encodings.concat(encodings)

      # One registration is enough -- the block decides by itself which of
      # its encodings to return.
      @encoding_guessers << encoding_block
    end

    # Registers a BOM check: +encoding+ is assumed whenever
    # +condition_block+ evaluates to true.
    def bom_encoding(encoding, &condition_block)
      @supported_boms << encoding
      @bom_guessers   << Proc.new {
        encoding if instance_eval(&condition_block)
      }
    end

  end

  attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte

  def initialize(input, chunk_size = nil)
    @input      = input
    @chunk_size = chunk_size

    @klass = self.class
  end

  # Returns the guessed encoding. A recognized BOM decides directly, unless
  # +ignore_bom+ is set. Otherwise the input is read chunk by chunk and the
  # registered heuristics are tried, in order, after each chunk. Returns
  # UNKNOWN if the input is exhausted without any heuristic catching on.
  def guess(ignore_bom = false)
    return bom if bom && !ignore_bom

    while read
      encoding_guessers.each { |block|
        encoding = instance_eval(&block)
        return encoding if encoding && supported_encoding?(encoding)
      }
    end

    # nothing suitable found :-(
    UNKNOWN
  end

  # The encoding indicated by the input's BOM, or nil if none was recognized.
  def bom
    @bom ||= check_bom
  end

  private

  def eof?
    input.eof?
  end

  # Tries the registered BOM checks in order; rewinds the input after each
  # failed attempt. After a successful check the input is NOT rewound.
  def check_bom
    return if eof?

    bom_guessers.each { |block|
      encoding = instance_eval(&block)
      return encoding if encoding && supported_bom?(encoding)

      # read bytes don't build a BOM, so rewind...
      input.rewind
    }

    # nothing suitable found :-(
    nil
  end

  # Reads a single byte and returns its integer value; nil at end of input
  # (so inputs shorter than a BOM simply fail to match).
  def next_byte
    byte = input.read(1)
    byte && byte.unpack('C').first
  end

  # Whether the next bytes of the input equal +bytes+. Stops reading at the
  # first mismatch.
  def starts_with?(*bytes)
    bytes.all? { |byte| next_byte == byte }
  end

  # Whether the next byte of the input is one of +bytes+.
  def next_one_of?(*bytes)
    bytes.include?(next_byte)
  end

  # Reads the next chunk and updates the byte statistics (+byte_count+,
  # +byte_total+, +first_byte+). Returns true if any bytes were read, nil
  # at end of input.
  #
  # The default is spelled self.chunk_size: a bare "chunk_size = chunk_size"
  # refers to the (still nil) parameter itself on current Rubies.
  def read(chunk_size = self.chunk_size)
    # => initialize counters
    @byte_count ||= Hash.new(0)
    @byte_total ||= 0

    return if eof?

    bytes_before = @byte_total

    (input.read(chunk_size) || '').each_byte { |byte|
      @byte_count[byte] += 1
      @byte_total       += 1

      @first_byte ||= byte
    }

    @byte_total > bytes_before
  end

  # Sum of the occurrence counts of the given byte values. Accepts single
  # values, ranges and arrays alike (also mixed).
  def byte_count_sum(*bytes)
    bytes.inject(0) { |sum, item|
      Array(item).inject(sum) { |subtotal, byte| subtotal + byte_count[byte] }
    }
  end

  # +count+ as a fraction of all bytes read so far.
  def relative_byte_count(count)
    count.to_f / byte_total
  end

  ### Definition of guessing heuristics. Order matters!

  # ASCII, if all bytes are within the lower 128 bytes
  # (Unfortunately, we have to read the *whole* file to make that decision)
  encoding ASCII do
    eof? && byte_count_sum(0x0..0x7f) == byte_total
  end

  # UTF-16, if lots of NULL bytes present
  #
  # NOTE(review): UTF_32 is not among the encodings registered here, so
  # #guess discards that answer and moves on to the next heuristic --
  # confirm whether that is intended.
  encodings UTF_16BE, UTF_16LE, UTF_16 do
    if relative_byte_count(byte_count[0]) > 0.25
      case first_byte
        when 0   then UTF_32
        when 254 then UTF_16BE
        when 255 then UTF_16LE
        else          UTF_16
      end
    end
  end

  # UTF-8, if number of escape-bytes and following bytes
  # is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
  encoding UTF_8 do
    # => 110xxxxx 10xxxxxx
    two_byte   = byte_count_sum(0xc0..0xdf)
    # => 1110xxxx 10xxxxxx 10xxxxxx
    three_byte = byte_count_sum(0xe0..0xef) * 2
    # => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    four_byte  = byte_count_sum(0xf0..0xf7) * 3

    esc_bytes = two_byte + three_byte + four_byte

    # => 10xxxxxx
    fol_bytes = byte_count_sum(0x80..0xbf)

    esc_bytes > 0 && esc_bytes == fol_bytes
  end

  # Analyse statistical appearance of German umlauts (=> ÄäÖöÜüß)
  encodings MACINTOSH, ISO_8859_1 do
    # An array of pairs, so the order of checks is fixed.
    candidate = [
      [MACINTOSH,  [0x80, 0x8a, 0x85, 0x9a, 0x86, 0x9f, 0xa7]],
      [ISO_8859_1, [0xc4, 0xe4, 0xd6, 0xf6, 0xdc, 0xfc, 0xdf]]
    ].find { |_, umlauts|
      relative_byte_count(byte_count_sum(umlauts)) > 0.001
    }

    candidate && candidate.first
  end

  ### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)

  bom_encoding UTF_8 do
    starts_with?(0xef, 0xbb, 0xbf)
  end

  # The UTF-32 checks have to precede the UTF-16 ones: the UTF-16LE BOM
  # (ff fe) is a prefix of the UTF-32LE BOM (ff fe 00 00).

  bom_encoding UTF_32BE do
    starts_with?(0x00, 0x00, 0xfe, 0xff)
  end

  bom_encoding UTF_32LE do
    starts_with?(0xff, 0xfe, 0x00, 0x00)
  end

  bom_encoding UTF_16BE do
    starts_with?(0xfe, 0xff)
  end

  bom_encoding UTF_16LE do
    starts_with?(0xff, 0xfe)
  end

  bom_encoding SCSU do
    starts_with?(0x0e, 0xfe, 0xff)
  end

  bom_encoding UTF_7 do
    starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
  end

  bom_encoding UTF_EBCDIC do
    starts_with?(0xdd, 0x73, 0x66, 0x73)
  end

  bom_encoding BOCU_1 do
    starts_with?(0xfb, 0xee, 0x28)
  end

end
|
371
|
+
|
372
|
+
end
|