cmess 0.0.4.136
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +676 -0
- data/ChangeLog +6 -0
- data/README +53 -0
- data/Rakefile +30 -0
- data/bin/cinderella +186 -0
- data/bin/decode_entities +101 -0
- data/bin/guess_encoding +183 -0
- data/data/csets/iso_8859-1.yaml +195 -0
- data/data/csets/iso_8859-15.yaml +204 -0
- data/data/csets/latin1.yaml +195 -0
- data/data/csets/unicode/basic_latin.yaml +97 -0
- data/data/csets/unicode/cyrillic-supplement.yaml +17 -0
- data/data/csets/unicode/cyrillic.yaml +256 -0
- data/data/csets/unicode/greek.yaml +129 -0
- data/data/csets/unicode/ipa_extensions.yaml +97 -0
- data/data/csets/unicode/latin-extended-c.yaml +18 -0
- data/data/csets/unicode/latin-extended-d.yaml +3 -0
- data/data/csets/unicode/latin_1_supplement.yaml +128 -0
- data/data/csets/unicode/latin_extended_a.yaml +129 -0
- data/data/csets/unicode/latin_extended_additional.yaml +247 -0
- data/data/csets/unicode/latin_extended_b.yaml +209 -0
- data/data/csets/unicode/letterlike_symbols.yaml +80 -0
- data/data/csets/unicode/spacing_modifier_letters.yaml +81 -0
- data/data/csets/utf-8.yaml +1504 -0
- data/data/csets/utf8.yaml +1504 -0
- data/example/crop +127 -0
- data/example/crop_repaired +127 -0
- data/example/empty6-slash.txt +1495 -0
- data/example/empty6-slash_repaired.txt +1495 -0
- data/example/pot +1368 -0
- data/lib/cmess.rb +44 -0
- data/lib/cmess/cinderella.rb +63 -0
- data/lib/cmess/cli.rb +79 -0
- data/lib/cmess/decode_entities.rb +68 -0
- data/lib/cmess/guess_encoding.rb +372 -0
- data/lib/cmess/version.rb +51 -0
- metadata +119 -0
data/lib/cmess.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# cmess -- Assist with handling messed up encodings #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
# Bundles several tools that aim at dealing with various problems occurring in
|
30
|
+
# the context of character sets and encodings. Currently, there are:
|
31
|
+
#
|
32
|
+
# guess_encoding:: Simple helper to identify the encoding of a given string.
|
33
|
+
# Includes the ability to automatically detect the encoding
|
34
|
+
# of an input. (see GuessEncoding)
|
35
|
+
# cinderella:: When characters are "double encoded", you can't easily
|
36
|
+
# convert them back -- this is where cinderella comes in,
|
37
|
+
# sorting the good ones into the pot and the (potentially)
|
38
|
+
# bad ones into the crop... (see Cinderella)
|
39
|
+
# decode_entities:: Decode HTML entities in a string. (see DecodeEntities)
|
40
|
+
|
41
|
+
# Root namespace of the cmess tool-box; the individual tools are defined as
# modules nested inside it.
module CMess; end
|
43
|
+
|
44
|
+
require 'cmess/version'
|
@@ -0,0 +1,63 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'iconv'
|
30
|
+
|
31
|
+
# Find (and possibly repair) doubly encoded characters. Here's how it's done:
|
32
|
+
#
|
33
|
+
# Treats characters encoded in target encoding as if they were encoded in
|
34
|
+
# source encoding, converts them to target encoding and "grep"s for lines
|
35
|
+
# containing those doubly encoded characters; if asked to repair doubly
|
36
|
+
# encoded characters, substitutes them with their original character.
|
37
|
+
|
38
|
+
module CMess::Cinderella
|
39
|
+
|
40
|
+
extend self
|
41
|
+
|
42
|
+
# our version ;-)
|
43
|
+
VERSION = '0.0.3'
|
44
|
+
|
45
|
+
# Distributes the lines of +input+ between two sinks: lines that contain a
# doubly encoded form of one of +chars+ are written to +crop+, all other
# lines to +pot+. If the sink chosen for a line is +nil+, the line is
# dropped. With +repair+ set, every doubly encoded sequence is replaced
# (in place, on the line object) by its original character before the line
# is written.
#
# +source_encoding+ and +target_encoding+ are handed to Iconv to compute
# the doubly encoded form of each character.
def pick(input, pot, crop, source_encoding, target_encoding, chars, repair = false)
  converter = Iconv.new(target_encoding, source_encoding)

  # doubly encoded sequence => original character
  encoded = {}
  chars.each { |char| encoded[converter.iconv(char)] = char }

  pattern = Regexp.union(*encoded.keys)

  input.each do |line|
    out = line =~ pattern ? crop : pot
    next unless out

    line.gsub!(pattern) { |match| encoded[match] } if repair
    out.puts(line)
  end
end
|
62
|
+
|
63
|
+
end
|
data/lib/cmess/cli.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
module CMess::CLI
|
30
|
+
|
31
|
+
# Terminates the program (via Kernel#abort) with an error message unless
# +file+ is readable; returns +nil+ otherwise.
def ensure_readable(file)
  return if File.readable?(file)

  abort "Can't find input file: #{file}"
end
|
34
|
+
|
35
|
+
# Terminates the program (via Kernel#abort) with an error message unless
# +dir+ is an existing directory; returns +nil+ otherwise.
def ensure_directory(dir)
  return if File.directory?(dir)

  abort "Directory not found: #{dir}"
end
|
38
|
+
|
39
|
+
# Prepares +file+ for being rewritten in place. Returns a two-element array:
# the lines currently in the file, and a handle on the same file opened in
# 'w' mode. The lines are read first, since opening for writing truncates
# the file. The handle is returned open.
def open_file_in_place(file)
  ensure_readable(file)

  lines  = File.readlines(file)
  writer = File.open(file, 'w')

  [lines, writer]
end
|
43
|
+
|
44
|
+
# Opens +file+ in the given +mode+, treating the file name '-' as a request
# for one of the standard streams: STDIN for 'r', STDOUT for 'w' and STDERR
# for 'a'.
#
# For real files, readability is verified first (aborting the program on
# failure) unless the mode is 'w'.
#
# Raises ArgumentError if '-' is combined with any other mode.
def open_file_or_std(file, mode = 'r')
  if file == '-'
    # `when x then y` instead of the `when x: y` shorthand, which Ruby 1.9
    # and later reject as a syntax error
    case mode
    when 'r' then STDIN
    when 'w' then STDOUT
    when 'a' then STDERR
    else raise ArgumentError, "don't know how to handle mode '#{mode}'"
    end
  else
    ensure_readable(file) unless mode == 'w'
    File.open(file, mode)
  end
end
|
57
|
+
|
58
|
+
# Determines the encoding used by the system, trying in order:
#
# 1. the SYSTEM_ENCODING environment variable,
# 2. the codeset part of LANG (<tt>language[_territory][.codeset][@modifier]</tt>),
# 3. the fallback object returned by system_encoding_not_found.
#
# An unset LANG is treated like one without a codeset, and a trailing
# <tt>@modifier</tt> (as in <tt>de_DE.ISO-8859-15@euro</tt>) is not part of
# the returned encoding name.
def determine_system_encoding
  ENV['SYSTEM_ENCODING'] ||
    ENV['LANG'].to_s[/\.([^@]+)/, 1] ||
    system_encoding_not_found
end
|
63
|
+
|
64
|
+
# Builds the fallback object returned when the system encoding can't be
# determined: a lambda that aborts the program with an explanatory message
# when called, and whose +to_s+ is 'NOT FOUND'.
def system_encoding_not_found
  handler = lambda do
    abort <<-EOT
Your system's encoding couldn't be determined automatically -- please specify it
explicitly via the SYSTEM_ENCODING environment variable or via the '-t' option.
    EOT
  end

  class << handler
    def to_s
      'NOT FOUND'
    end
  end

  handler
end
|
78
|
+
|
79
|
+
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'iconv'
|
30
|
+
|
31
|
+
require 'rubygems'
|
32
|
+
require 'htmlentities/string'
|
33
|
+
|
34
|
+
module CMess::DecodeEntities
|
35
|
+
|
36
|
+
extend self
|
37
|
+
|
38
|
+
# our version ;-)
|
39
|
+
VERSION = '0.0.2'
|
40
|
+
|
41
|
+
# HTMLEntities requires UTF-8
|
42
|
+
INTERMEDIATE_ENCODING = 'utf-8'
|
43
|
+
|
44
|
+
# Stand-in for an Iconv converter where no conversion is needed: its
# +iconv+ method hands the given string back unchanged.
ICONV_DUMMY = Object.new

def ICONV_DUMMY.iconv(string)
  string
end
|
53
|
+
|
54
|
+
# Decodes the HTML entities in every line of +input+ and writes the result
# to +output+.
#
# Each line is converted from +source_encoding+ to INTERMEDIATE_ENCODING,
# decoded, and converted to +target_encoding+ (which defaults to
# +source_encoding+). Whenever one side already equals
# INTERMEDIATE_ENCODING, ICONV_DUMMY is used in place of a real converter.
def decode(input, output, source_encoding, target_encoding = nil)
  target_encoding ||= source_encoding

  iconv_in =
    if source_encoding == INTERMEDIATE_ENCODING
      ICONV_DUMMY
    else
      Iconv.new(INTERMEDIATE_ENCODING, source_encoding)
    end

  iconv_out =
    if target_encoding == INTERMEDIATE_ENCODING
      ICONV_DUMMY
    else
      Iconv.new(target_encoding, INTERMEDIATE_ENCODING)
    end

  input.each do |line|
    intermediate = iconv_in.iconv(line)
    output.puts iconv_out.iconv(intermediate.decode_entities)
  end
end
|
67
|
+
|
68
|
+
end
|
@@ -0,0 +1,372 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# Contributors: #
|
14
|
+
# John Vorhauer <john@vorhauer.de> (idea and original implementation #
|
15
|
+
# for automatic encoding detection) #
|
16
|
+
# #
|
17
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
18
|
+
# terms of the GNU General Public License as published by the Free Software #
|
19
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
20
|
+
# version. #
|
21
|
+
# #
|
22
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
23
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
24
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
25
|
+
# details. #
|
26
|
+
# #
|
27
|
+
# You should have received a copy of the GNU General Public License along #
|
28
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
29
|
+
# #
|
30
|
+
###############################################################################
|
31
|
+
#++
|
32
|
+
|
33
|
+
require 'iconv'
|
34
|
+
|
35
|
+
# Outputs given string (or line), being encoded in target encoding, encoded in
|
36
|
+
# various test encodings, thus allowing you to identify the (seemingly) correct
|
37
|
+
# encoding by visually comparing the input string with its desired appearance.
|
38
|
+
#
|
39
|
+
# In addition to that manual procedure, may be used to detect the encoding
|
40
|
+
# automatically. Actually works pretty well -- for the supported encodings
|
41
|
+
# (see Automatic for details).
|
42
|
+
|
43
|
+
module CMess::GuessEncoding
|
44
|
+
|
45
|
+
# our version ;-)
|
46
|
+
VERSION = '0.0.5'
|
47
|
+
|
48
|
+
# Namespace for our encodings.
|
49
|
+
module Encoding

  # Every encoding name below becomes a constant holding the (frozen) name
  # itself. The constant's identifier is the name with dashes turned into
  # underscores and any remaining non-word characters removed, e.g.
  # 'ISO-8859-1' => ISO_8859_1 and 'ANSI_X3.4' => ANSI_X34.
  names = %w[
    UNKNOWN ASCII MACINTOSH
    ISO-8859-1 ISO-8859-2 ISO-8859-15
    CP1250 CP1251 CP1252 CP850 CP852 CP856
    UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
    UTF-7 UTF-EBCDIC SCSU BOCU-1
    ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
  ]

  names.each do |name|
    identifier = name.tr('-', '_').gsub(/\W/, '')
    const_set(identifier, name.freeze)
  end

end
|
64
|
+
|
65
|
+
module Manual
|
66
|
+
|
67
|
+
extend self
|
68
|
+
|
69
|
+
include Encoding
|
70
|
+
|
71
|
+
# default encodings to try
|
72
|
+
ENCODINGS = [
|
73
|
+
ISO_8859_1,
|
74
|
+
ISO_8859_2,
|
75
|
+
ISO_8859_15,
|
76
|
+
CP1250,
|
77
|
+
CP1251,
|
78
|
+
CP1252,
|
79
|
+
CP850,
|
80
|
+
CP852,
|
81
|
+
CP856,
|
82
|
+
UTF_8
|
83
|
+
]
|
84
|
+
|
85
|
+
# likely candidates to suggest to the user
|
86
|
+
CANDIDATES = [
|
87
|
+
ANSI_X34,
|
88
|
+
EBCDIC_AT_DE,
|
89
|
+
EBCDIC_US,
|
90
|
+
EUC_JP,
|
91
|
+
KOI_8,
|
92
|
+
MACINTOSH,
|
93
|
+
MS_ANSI,
|
94
|
+
SHIFT_JIS,
|
95
|
+
UTF_7,
|
96
|
+
UTF_16,
|
97
|
+
UTF_16BE,
|
98
|
+
UTF_16LE,
|
99
|
+
UTF_32,
|
100
|
+
UTF_32BE,
|
101
|
+
UTF_32LE
|
102
|
+
]
|
103
|
+
|
104
|
+
# Prints +input+ once per candidate encoding, each time converted from that
# encoding to +target_encoding+, one "name : result" line per encoding with
# the names padded to a common width.
#
# The candidates are +encodings+ (ENCODINGS if +nil+) followed by
# +additional_encodings+, de-duplicated so that the last occurrence wins,
# with the target encoding moved to the front.
#
# Input that is illegal in a candidate encoding, or a candidate unknown to
# Iconv, is reported in place of the result. An invalid target encoding
# aborts the program.
def display(input, target_encoding, encodings = nil, additional_encodings = [])
  target = target_encoding

  candidates = (encodings || ENCODINGS) + additional_encodings
  candidates = candidates.reverse.uniq.reverse    # keep the last occurrence
  candidates = [target] + (candidates - [target]) # target goes first

  width = candidates.map { |name| name.length }.max

  candidates.each do |name|
    converted =
      begin
        Iconv.conv(target, name, input)
      rescue Iconv::IllegalSequence => err
        "ILLEGAL INPUT SEQUENCE: #{err}"
      rescue Iconv::InvalidEncoding
        abort "Invalid encoding: #{name}" if name == target
        "INVALID ENCODING!"
      end

    puts format("%-#{width}s : %s", name, converted)
  end
end
|
130
|
+
|
131
|
+
end
|
132
|
+
|
133
|
+
# Tries to detect the encoding of a given input by applying several
|
134
|
+
# heuristics to determine the <b>most likely</b> candidate. If no heuristic
|
135
|
+
# catches on, resorts to Encoding::UNKNOWN.
|
136
|
+
#
|
137
|
+
# If a BOM is found, it may determine the encoding directly.
|
138
|
+
class Automatic
|
139
|
+
|
140
|
+
extend Forwardable
|
141
|
+
|
142
|
+
def_delegators :@klass, :encoding_guessers, :supported_encoding?,
|
143
|
+
:bom_guessers, :supported_bom?
|
144
|
+
|
145
|
+
include Encoding
|
146
|
+
|
147
|
+
@supported_encodings = []
|
148
|
+
@encoding_guessers = []
|
149
|
+
@supported_boms = []
|
150
|
+
@bom_guessers = []
|
151
|
+
|
152
|
+
class << self
|
153
|
+
|
154
|
+
attr_reader :supported_encodings, :encoding_guessers,
|
155
|
+
:supported_boms, :bom_guessers
|
156
|
+
|
157
|
+
# Convenience entry point: builds a guesser for +input+ (read in chunks of
# +chunk_size+) and returns the result of its #guess, passing +ignore_bom+
# along.
def guess(input, chunk_size = nil, ignore_bom = false)
  guesser = new(input, chunk_size)
  guesser.guess(ignore_bom)
end
|
160
|
+
|
161
|
+
private
|
162
|
+
|
163
|
+
# Registers a heuristic for a single +encoding+: the encoding is the guess
# whenever +condition_block+, evaluated via instance_eval on whatever object
# runs the heuristic, returns a true value.
def encoding(encoding, &condition_block)
  guesser = lambda { instance_eval(&condition_block) ? encoding : nil }

  encodings(encoding, &guesser)
end
|
170
|
+
|
171
|
+
# Registers +encoding_block+ as a heuristic that may yield any of the given
# +encodings+: each name is added to the supported encodings, and the block
# is added to the list of guessers once per name.
def encodings(*encodings, &encoding_block)
  @supported_encodings.concat(encodings)
  @encoding_guessers.concat([encoding_block] * encodings.size)

  encodings
end
|
177
|
+
|
178
|
+
# Tells whether +encoding+ is among the registered supported encodings.
def supported_encoding?(encoding)
  supported_encodings.any? { |name| name == encoding }
end
|
181
|
+
|
182
|
+
# Registers a BOM check for +encoding+: the encoding is the guess whenever
# +condition_block+, evaluated via instance_eval on whatever object runs the
# check, returns a true value. Returns the list of BOM guessers.
def bom_encoding(encoding, &condition_block)
  guesser = lambda { instance_eval(&condition_block) ? encoding : nil }

  @supported_boms.push(encoding)
  @bom_guessers.push(guesser)
end
|
190
|
+
|
191
|
+
# Tells whether +encoding+ is among the encodings with a registered BOM check.
def supported_bom?(encoding)
  supported_boms.any? { |name| name == encoding }
end
|
194
|
+
|
195
|
+
end
|
196
|
+
|
197
|
+
attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte
|
198
|
+
|
199
|
+
# Sets up a guesser for +input+, to be read in chunks of +chunk_size+.
# Also remembers the guesser's own class in @klass.
def initialize(input, chunk_size = nil)
  @klass = self.class

  @input, @chunk_size = input, chunk_size
end
|
205
|
+
|
206
|
+
# Returns the most likely encoding of the input.
#
# A detected BOM decides the matter right away, unless +ignore_bom+ is set.
# Otherwise the input is read chunk by chunk; after each chunk the
# registered heuristics are tried in order, and the first one to yield a
# supported encoding wins. If the input is exhausted without any heuristic
# catching on, UNKNOWN is returned.
def guess(ignore_bom = false)
  return bom if bom && !ignore_bom

  while read
    encoding_guessers.each { |block|
      # instance_exec rather than instance_eval: the latter passes the
      # receiver as a block argument, which the zero-argument lambdas among
      # the guessers reject with an ArgumentError on Ruby 1.9 and later.
      encoding = instance_exec(&block)
      return encoding if encoding && supported_encoding?(encoding)
    }
  end

  # nothing suitable found :-(
  UNKNOWN
end
|
219
|
+
|
220
|
+
# The encoding indicated by the input's BOM, if any. A positive result is
# cached; as long as none has been found, each call checks again.
def bom
  @bom = check_bom unless @bom
  @bom
end
|
223
|
+
|
224
|
+
private
|
225
|
+
|
226
|
+
# Whether the input has been read to its end (delegates to the input).
def eof?
  input.eof?
end
|
229
|
+
|
230
|
+
# Runs the registered BOM checks against the input, in order, and returns
# the encoding of the first one that matches (and is a supported BOM), or
# +nil+ if none does or the input is empty.
#
# After a failed check the input is rewound, so every check starts at the
# beginning. After a successful one it is left where the check stopped
# reading.
def check_bom
  return if eof?

  bom_guessers.each { |block|
    # instance_exec rather than instance_eval: the latter passes the
    # receiver as a block argument, which the zero-argument lambdas created
    # by bom_encoding reject with an ArgumentError on Ruby 1.9 and later.
    encoding = instance_exec(&block)
    return encoding if encoding && supported_bom?(encoding)

    # read bytes don't build a BOM, so rewind...
    input.rewind
  }

  # nothing suitable found :-(
  nil
end
|
244
|
+
|
245
|
+
# Reads a single byte from the input and returns its numeric value, or
# +nil+ once the input is exhausted (so that checks against inputs shorter
# than the byte sequence they look for simply fail instead of raising).
def next_byte
  chunk = input.read(1)
  chunk.unpack('C').first if chunk
end
|
248
|
+
|
249
|
+
# Compares the next bytes of the input, one by one, with the given +bytes+.
# Stops reading at the first mismatch. True if all of them match (or none
# were given).
def starts_with?(*bytes)
  bytes.each { |expected| return false unless next_byte == expected }
  true
end
|
254
|
+
|
255
|
+
# Reads the next byte of the input and tells whether it is one of +bytes+.
def next_one_of?(*bytes)
  actual = next_byte
  bytes.any? { |candidate| candidate == actual }
end
|
258
|
+
|
259
|
+
# Reads the next chunk of the input and updates the byte statistics:
# per-value counts in @byte_count, the running total in @byte_total and,
# once, the very first byte in @first_byte.
#
# +chunk_size+ defaults to the guesser's configured chunk size; +nil+ reads
# everything that is left.
#
# Returns +nil+ if the input was already exhausted, otherwise whether any
# bytes were actually read.
def read(chunk_size = chunk_size())
  # The parentheses force a call to the #chunk_size reader. A bare
  # `chunk_size` would refer to the parameter itself on Ruby 1.9 and later
  # and therefore always be nil.

  # => initialize counters
  @byte_count ||= Hash.new(0)
  @byte_total ||= 0

  return if eof?

  bytes_before = @byte_total

  input.read(chunk_size).each_byte { |byte|
    @byte_count[byte] += 1
    @byte_total += 1

    @first_byte ||= byte
  }

  @byte_total > bytes_before
end
|
277
|
+
|
278
|
+
# Adds up how often the given byte values have been seen so far. The values
# may be passed as a plain list, a range or an array (or a mix of those).
def byte_count_sum(*bytes)
  # Expand each argument explicitly. `bytes = *bytes` unwrapped a lone
  # range/array argument on Ruby 1.8 only; on later versions it leaves it
  # wrapped, so the range itself was looked up in the counts (always 0).
  bytes.inject(0) { |sum, item|
    Array(item).inject(sum) { |subtotal, byte| subtotal + byte_count[byte] }
  }
end
|
282
|
+
|
283
|
+
# Expresses +count+ as a (floating point) fraction of all bytes read so far.
def relative_byte_count(count)
  share = count.to_f
  share / byte_total
end
|
286
|
+
|
287
|
+
### Definition of guessing heuristics. Order matters!
|
288
|
+
|
289
|
+
# ASCII, if all bytes are within the lower 128 bytes
|
290
|
+
# (Unfortunately, we have to read the *whole* file to make that decision)
|
291
|
+
encoding(ASCII) {
  # only decidable once everything has been read: no byte may lie outside
  # of 0x00..0x7f
  eof? && byte_count_sum(0x00..0x7f) == byte_total
}
|
294
|
+
|
295
|
+
# UTF-16, if lots of NULL bytes present
|
296
|
+
# Applies if more than a quarter of all bytes read so far are NULL bytes.
# The first byte of the input then selects the variant.
#
# NOTE: UTF_32 is not among the encodings this block is registered for. In
# the registrations visible here nothing else declares it as supported
# either, so that result does not pass the supported_encoding? check in
# #guess and the remaining heuristics get their turn.
encodings UTF_16BE, UTF_16LE, UTF_16 do
  if relative_byte_count(byte_count[0]) > 0.25
    # `when x then y` instead of the `when x: y` shorthand, which Ruby 1.9
    # and later reject as a syntax error
    case first_byte
    when 0   then UTF_32
    when 254 then UTF_16BE
    when 255 then UTF_16LE
    else          UTF_16
    end
  end
end
|
306
|
+
|
307
|
+
# UTF-8, if number of escape-bytes and following bytes
|
308
|
+
# is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
|
309
|
+
encoding UTF_8 do
  # Number of continuation bytes announced by the lead bytes seen so far.
  # Each line ends in its operator so that the expression continues across
  # the trailing comments. A `\` continuation followed by a comment line
  # ends the statement right there, which left the 3- and 4-byte sequences
  # out of the sum.
  esc_bytes = byte_count_sum(0xc0..0xdf) +     # => 110xxxxx 10xxxxxx
              byte_count_sum(0xe0..0xef) * 2 + # => 1110xxxx 10xxxxxx 10xxxxxx
              byte_count_sum(0xf0..0xf7) * 3   # => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

  # Number of continuation bytes actually present.
  fol_bytes = byte_count_sum(0x80..0xbf)       # => 10xxxxxx

  esc_bytes > 0 && esc_bytes == fol_bytes
end
|
321
|
+
|
322
|
+
# Analyse statistical appearance of German umlauts (=> ÄäÖöÜüß)
|
323
|
+
encodings MACINTOSH, ISO_8859_1 do
  # byte values of the German umlauts (in the order ÄäÖöÜüß) per encoding
  umlaut_bytes = {
    MACINTOSH  => [0x80, 0x8a, 0x85, 0x9a, 0x86, 0x9f, 0xa7],
    ISO_8859_1 => [0xc4, 0xe4, 0xd6, 0xf6, 0xdc, 0xfc, 0xdf]
  }

  # the first encoding whose umlauts make up more than 0.1% of all bytes
  # read so far wins; without a winner, the value of #each (the hash) is
  # what the block returns
  umlaut_bytes.each do |name, umlauts|
    break name if relative_byte_count(byte_count_sum(umlauts)) > 0.001
  end
end
|
331
|
+
|
332
|
+
### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
|
333
|
+
|
334
|
+
# The checks run in the order they are declared here and the first match
# wins, so a signature that starts with another signature has to come
# first: UTF-32LE (FF FE 00 00) is checked before UTF-16LE (FF FE), which
# would otherwise claim every UTF-32LE input.

bom_encoding UTF_8 do
  starts_with?(0xef, 0xbb, 0xbf)
end

bom_encoding UTF_32BE do
  starts_with?(0x00, 0x00, 0xfe, 0xff)
end

bom_encoding UTF_32LE do
  starts_with?(0xff, 0xfe, 0x00, 0x00)
end

bom_encoding UTF_16BE do
  starts_with?(0xfe, 0xff)
end

bom_encoding UTF_16LE do
  starts_with?(0xff, 0xfe)
end

bom_encoding SCSU do
  starts_with?(0x0e, 0xfe, 0xff)
end

bom_encoding UTF_7 do
  starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
end

bom_encoding UTF_EBCDIC do
  starts_with?(0xdd, 0x73, 0x66, 0x73)
end

bom_encoding BOCU_1 do
  starts_with?(0xfb, 0xee, 0x28)
end
|
369
|
+
|
370
|
+
end
|
371
|
+
|
372
|
+
end
|