cmess 0.0.8.274 → 0.0.9.276
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +7 -0
- data/README +1 -1
- data/lib/cmess/guess_encoding.rb +14 -398
- data/lib/cmess/guess_encoding/automatic.rb +341 -0
- data/lib/cmess/guess_encoding/encoding.rb +61 -0
- data/lib/cmess/guess_encoding/manual.rb +105 -0
- data/lib/cmess/version.rb +1 -1
- metadata +5 -2
data/ChangeLog
CHANGED
@@ -1,5 +1,12 @@
|
|
1
1
|
= Revision history for cmess
|
2
2
|
|
3
|
+
== 0.0.9 [2008-08-15]
|
4
|
+
|
5
|
+
* Reorganized file structure for guess_encoding
|
6
|
+
* Added shortcuts GuessEncoding.manual/.automatic
|
7
|
+
* GuessEncoding::Automatic now also takes a String
|
8
|
+
as input (will be converted to a StringIO)
|
9
|
+
|
3
10
|
== 0.0.8 [2008-08-14]
|
4
11
|
|
5
12
|
* Require 'cmess' inside libs, so the user doesn't have to
|
data/README
CHANGED
data/lib/cmess/guess_encoding.rb
CHANGED
@@ -30,416 +30,32 @@
|
|
30
30
|
###############################################################################
|
31
31
|
#++
|
32
32
|
|
33
|
-
$KCODE = 'u'
|
34
|
-
|
35
|
-
require 'yaml'
|
36
|
-
require 'iconv'
|
37
|
-
require 'forwardable'
|
38
|
-
|
39
33
|
require 'cmess'
|
40
34
|
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
# In addition to that manual procedure, may be used to detect the encoding
|
46
|
-
# automatically. Works actually pretty good -- for the supported encodings
|
47
|
-
# (see Automatic for details).
|
35
|
+
# Allows guessing an input's encoding either manually or automatically.
|
36
|
+
# Actually works pretty well -- for the supported encodings. See Manual
|
37
|
+
# and Automatic for details.
|
48
38
|
|
49
39
|
module CMess::GuessEncoding
|
50
40
|
|
51
41
|
# our version ;-)
|
52
|
-
VERSION = '0.0.
|
53
|
-
|
54
|
-
# Namespace for our encodings.
|
55
|
-
module Encoding
|
56
|
-
|
57
|
-
extend self
|
58
|
-
|
59
|
-
def const_name_for(encoding)
|
60
|
-
encoding.tr('-', '_').gsub(/\W/, '').upcase
|
61
|
-
end
|
62
|
-
|
63
|
-
def set_encoding_const(encoding, const = const_name_for(encoding))
|
64
|
-
const_set(const, encoding.freeze)
|
65
|
-
end
|
66
|
-
|
67
|
-
def get_or_set_encoding_const(encoding)
|
68
|
-
const_defined?(const = const_name_for(encoding)) ? const_get(const) :
|
69
|
-
set_encoding_const(encoding, const)
|
70
|
-
end
|
71
|
-
|
72
|
-
%w[
|
73
|
-
UNKNOWN ASCII MACINTOSH
|
74
|
-
ISO-8859-1 ISO-8859-2 ISO-8859-15
|
75
|
-
CP1250 CP1251 CP1252 CP850 CP852 CP856
|
76
|
-
UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
|
77
|
-
UTF-7 UTF-EBCDIC SCSU BOCU-1
|
78
|
-
ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
|
79
|
-
].each { |encoding| set_encoding_const(encoding) }
|
80
|
-
|
81
|
-
end
|
82
|
-
|
83
|
-
module Manual
|
84
|
-
|
85
|
-
extend self
|
86
|
-
|
87
|
-
include Encoding
|
88
|
-
|
89
|
-
# default encodings to try
|
90
|
-
ENCODINGS = [
|
91
|
-
ISO_8859_1,
|
92
|
-
ISO_8859_2,
|
93
|
-
ISO_8859_15,
|
94
|
-
CP1250,
|
95
|
-
CP1251,
|
96
|
-
CP1252,
|
97
|
-
CP850,
|
98
|
-
CP852,
|
99
|
-
CP856,
|
100
|
-
UTF_8
|
101
|
-
]
|
102
|
-
|
103
|
-
# likely candidates to suggest to the user
|
104
|
-
CANDIDATES = [
|
105
|
-
ANSI_X34,
|
106
|
-
EBCDIC_AT_DE,
|
107
|
-
EBCDIC_US,
|
108
|
-
EUC_JP,
|
109
|
-
KOI_8,
|
110
|
-
MACINTOSH,
|
111
|
-
MS_ANSI,
|
112
|
-
SHIFT_JIS,
|
113
|
-
UTF_7,
|
114
|
-
UTF_16,
|
115
|
-
UTF_16BE,
|
116
|
-
UTF_16LE,
|
117
|
-
UTF_32,
|
118
|
-
UTF_32BE,
|
119
|
-
UTF_32LE
|
120
|
-
]
|
121
|
-
|
122
|
-
def display(input, target_encoding, encodings = nil, additional_encodings = [])
|
123
|
-
target = target_encoding
|
124
|
-
|
125
|
-
encodings = (encodings || ENCODINGS) + additional_encodings
|
126
|
-
encodings = encodings.reverse.uniq.reverse # uniq with additional encodings
|
127
|
-
# staying at the end
|
128
|
-
encodings = [target] + (encodings - [target]) # move target encoding to front
|
129
|
-
|
130
|
-
max_length = encodings.map { |encoding| encoding.length }.max
|
131
|
-
|
132
|
-
encodings.each { |encoding|
|
133
|
-
converted = begin
|
134
|
-
Iconv.conv(target, encoding, input)
|
135
|
-
rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
|
136
|
-
"ILLEGAL INPUT SEQUENCE: #{err}"
|
137
|
-
rescue Iconv::InvalidEncoding
|
138
|
-
if encoding == target
|
139
|
-
abort "Invalid encoding: #{encoding}"
|
140
|
-
else
|
141
|
-
"INVALID ENCODING!"
|
142
|
-
end
|
143
|
-
end
|
144
|
-
|
145
|
-
puts "%-#{max_length}s : %s" % [encoding, converted]
|
146
|
-
}
|
147
|
-
end
|
148
|
-
|
149
|
-
end
|
150
|
-
|
151
|
-
# Tries to detect the encoding of a given input by applying several
|
152
|
-
# heuristics to determine the <b>most likely</b> candidate. If no heuristic
|
153
|
-
# catches on, resorts to Encoding::UNKNOWN.
|
154
|
-
#
|
155
|
-
# If a BOM is found, it may determine the encoding directly.
|
156
|
-
class Automatic
|
157
|
-
|
158
|
-
extend Forwardable
|
159
|
-
|
160
|
-
def_delegators self, :encoding_guessers, :supported_encoding?,
|
161
|
-
:bom_guessers, :supported_bom?
|
162
|
-
|
163
|
-
include Encoding
|
164
|
-
|
165
|
-
# Creates a converter for desired encoding (from UTF-8)
|
166
|
-
ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
|
167
|
-
|
168
|
-
# Single-byte encodings to test statistically by TEST_CHARS
|
169
|
-
TEST_ENCODINGS = [
|
170
|
-
MACINTOSH,
|
171
|
-
ISO_8859_1,
|
172
|
-
ISO_8859_15,
|
173
|
-
CP1252,
|
174
|
-
CP850,
|
175
|
-
MS_ANSI
|
176
|
-
]
|
177
|
-
|
178
|
-
# Certain (non-ASCII) chars to test for in TEST_ENCODINGS
|
179
|
-
CHARS_TO_TEST = (
|
180
|
-
'€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
|
181
|
-
'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
|
182
|
-
).split(//)
|
183
|
-
|
184
|
-
# Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
|
185
|
-
TEST_CHARS = Hash.new { |hash, encoding|
|
186
|
-
encoding = Encoding.get_or_set_encoding_const(encoding)
|
187
|
-
encchars = CHARS_TO_TEST.map { |char|
|
188
|
-
begin
|
189
|
-
byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
|
190
|
-
rescue Iconv::IllegalSequence
|
191
|
-
end
|
192
|
-
}.compact
|
193
|
-
|
194
|
-
TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
|
195
|
-
hash[encoding] = encchars
|
196
|
-
}.update(YAML.load_file(
|
197
|
-
File.join(File.dirname(__FILE__), '..', '..', 'data', 'test_chars.yaml')
|
198
|
-
))
|
199
|
-
|
200
|
-
# Relative count of TEST_CHARS must exceed this threshold to yield
|
201
|
-
# a direct match
|
202
|
-
TEST_THRESHOLD_DIRECT = 0.1
|
203
|
-
|
204
|
-
# Relative count of TEST_CHARS must exceed this threshold to yield
|
205
|
-
# an approximate match
|
206
|
-
TEST_THRESHOLD_APPROX = 0.0004
|
207
|
-
|
208
|
-
@supported_encodings = []
|
209
|
-
@encoding_guessers = []
|
210
|
-
@supported_boms = []
|
211
|
-
@bom_guessers = []
|
212
|
-
|
213
|
-
class << self
|
214
|
-
|
215
|
-
attr_reader :supported_encodings, :encoding_guessers,
|
216
|
-
:supported_boms, :bom_guessers
|
217
|
-
|
218
|
-
def guess(input, chunk_size = nil, ignore_bom = false)
|
219
|
-
new(input, chunk_size).guess(ignore_bom)
|
220
|
-
end
|
221
|
-
|
222
|
-
private
|
223
|
-
|
224
|
-
def encoding(encoding, &condition_block)
|
225
|
-
encoding_block = lambda {
|
226
|
-
encoding if instance_eval(&condition_block)
|
227
|
-
}
|
228
|
-
|
229
|
-
encodings(encoding, &encoding_block)
|
230
|
-
end
|
231
|
-
|
232
|
-
def encodings(*encodings, &encoding_block)
|
233
|
-
encodings.each { |encoding|
|
234
|
-
@supported_encodings << encoding
|
235
|
-
@encoding_guessers << encoding_block \
|
236
|
-
unless @encoding_guessers.include?(encoding_block)
|
237
|
-
}
|
238
|
-
end
|
239
|
-
|
240
|
-
def supported_encoding?(encoding)
|
241
|
-
supported_encodings.include?(encoding)
|
242
|
-
end
|
243
|
-
|
244
|
-
def bom_encoding(encoding, &condition_block)
|
245
|
-
encoding_block = lambda {
|
246
|
-
encoding if instance_eval(&condition_block)
|
247
|
-
}
|
248
|
-
|
249
|
-
@supported_boms << encoding
|
250
|
-
@bom_guessers << encoding_block \
|
251
|
-
unless @bom_guessers.include?(encoding_block)
|
252
|
-
end
|
253
|
-
|
254
|
-
def supported_bom?(encoding)
|
255
|
-
supported_boms.include?(encoding)
|
256
|
-
end
|
257
|
-
|
258
|
-
end
|
259
|
-
|
260
|
-
attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte
|
261
|
-
|
262
|
-
def initialize(input, chunk_size = nil)
|
263
|
-
@input = input
|
264
|
-
@chunk_size = chunk_size
|
265
|
-
end
|
266
|
-
|
267
|
-
def guess(ignore_bom = false)
|
268
|
-
return bom if bom && !ignore_bom
|
269
|
-
|
270
|
-
while read
|
271
|
-
encoding_guessers.each { |block|
|
272
|
-
encoding = instance_eval(&block)
|
273
|
-
return encoding if encoding && supported_encoding?(encoding)
|
274
|
-
}
|
275
|
-
end
|
42
|
+
VERSION = '0.0.7'
|
276
43
|
|
277
|
-
|
278
|
-
UNKNOWN
|
279
|
-
end
|
280
|
-
|
281
|
-
def bom
|
282
|
-
@bom ||= check_bom
|
283
|
-
end
|
284
|
-
|
285
|
-
private
|
286
|
-
|
287
|
-
def eof?
|
288
|
-
input.eof?
|
289
|
-
end
|
290
|
-
|
291
|
-
def check_bom
|
292
|
-
return if eof?
|
293
|
-
|
294
|
-
# prevent "Illegal seek" error inside a pipe
|
295
|
-
begin
|
296
|
-
input.pos
|
297
|
-
rescue Errno::ESPIPE
|
298
|
-
return
|
299
|
-
end
|
300
|
-
|
301
|
-
bom_guessers.each { |block|
|
302
|
-
encoding = instance_eval(&block)
|
303
|
-
return encoding if encoding && supported_bom?(encoding)
|
304
|
-
|
305
|
-
# read bytes don't build a BOM, so rewind...
|
306
|
-
input.rewind
|
307
|
-
}
|
308
|
-
|
309
|
-
# nothing suitable found :-(
|
310
|
-
nil
|
311
|
-
end
|
312
|
-
|
313
|
-
def next_byte
|
314
|
-
input.read(1).unpack('C').first
|
315
|
-
end
|
316
|
-
|
317
|
-
def starts_with?(*bytes)
|
318
|
-
bytes.all? { |byte|
|
319
|
-
next_byte == byte
|
320
|
-
}
|
321
|
-
end
|
322
|
-
|
323
|
-
def next_one_of?(*bytes)
|
324
|
-
bytes.include?(next_byte)
|
325
|
-
end
|
326
|
-
|
327
|
-
def read(chunk_size = chunk_size)
|
328
|
-
# => initialize counters
|
329
|
-
@byte_count ||= Hash.new(0)
|
330
|
-
@byte_total ||= 0
|
331
|
-
|
332
|
-
return if eof?
|
333
|
-
|
334
|
-
bytes_before = @byte_total
|
335
|
-
|
336
|
-
input.read(chunk_size).each_byte { |byte|
|
337
|
-
@byte_count[byte] += 1
|
338
|
-
@byte_total += 1
|
339
|
-
|
340
|
-
@first_byte ||= byte
|
341
|
-
}
|
342
|
-
|
343
|
-
@byte_total > bytes_before
|
344
|
-
end
|
345
|
-
|
346
|
-
def byte_count_sum(*bytes)
|
347
|
-
bytes = *bytes # treat arrays/ranges and lists alike
|
348
|
-
bytes.inject(0) { |sum, n| sum + byte_count[n] }
|
349
|
-
end
|
350
|
-
|
351
|
-
def relative_byte_count(count)
|
352
|
-
count.to_f / byte_total
|
353
|
-
end
|
354
|
-
|
355
|
-
### Definition of guessing heuristics. Order matters!
|
356
|
-
|
357
|
-
# ASCII, if all bytes are within the lower 128 bytes
|
358
|
-
# (Unfortunately, we have to read the *whole* file to make that decision)
|
359
|
-
encoding ASCII do
|
360
|
-
eof? && byte_count_sum(0x0..0x7f) == byte_total
|
361
|
-
end
|
362
|
-
|
363
|
-
# UTF-16, if lots of NULL bytes present
|
364
|
-
encodings UTF_16BE, UTF_16LE, UTF_16 do
|
365
|
-
if relative_byte_count(byte_count[0]) > 0.25
|
366
|
-
case first_byte
|
367
|
-
when 0x0: UTF_32
|
368
|
-
when 0xfe: UTF_16BE
|
369
|
-
when 0xff: UTF_16LE
|
370
|
-
else UTF_16
|
371
|
-
end
|
372
|
-
end
|
373
|
-
end
|
374
|
-
|
375
|
-
# UTF-8, if number of escape-bytes and following bytes
|
376
|
-
# is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
|
377
|
-
encoding UTF_8 do
|
378
|
-
esc_bytes = byte_count_sum(0xc0..0xdf) \
|
379
|
-
# => 110xxxxx 10xxxxxx
|
380
|
-
+ byte_count_sum(0xe0..0xef) * 2 \
|
381
|
-
# => 1110xxxx 10xxxxxx 10xxxxxx
|
382
|
-
+ byte_count_sum(0xf0..0xf7) * 3
|
383
|
-
# => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
384
|
-
fol_bytes = byte_count_sum(0x80..0xbf)
|
385
|
-
# => 10xxxxxx
|
386
|
-
|
387
|
-
esc_bytes > 0 && esc_bytes == fol_bytes
|
388
|
-
end
|
389
|
-
|
390
|
-
# Analyse statistical appearance of German umlauts and other accented
|
391
|
-
# letters (see TEST_CHARS)
|
392
|
-
encodings *TEST_ENCODINGS do
|
393
|
-
ratios = {}
|
394
|
-
|
395
|
-
TEST_ENCODINGS.find(lambda {
|
396
|
-
ratio, encoding = ratios.sort.last
|
397
|
-
encoding if ratio >= TEST_THRESHOLD_APPROX
|
398
|
-
}) { |encoding|
|
399
|
-
ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
|
400
|
-
#p [encoding, ratio]
|
401
|
-
ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
|
402
|
-
}
|
403
|
-
end
|
404
|
-
|
405
|
-
### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
|
406
|
-
|
407
|
-
bom_encoding UTF_8 do
|
408
|
-
starts_with?(0xef, 0xbb, 0xbf)
|
409
|
-
end
|
410
|
-
|
411
|
-
bom_encoding UTF_16BE do
|
412
|
-
starts_with?(0xfe, 0xff)
|
413
|
-
end
|
414
|
-
|
415
|
-
bom_encoding UTF_16LE do
|
416
|
-
starts_with?(0xff, 0xfe)
|
417
|
-
end
|
418
|
-
|
419
|
-
bom_encoding UTF_32BE do
|
420
|
-
starts_with?(0x00, 0x00, 0xfe, 0xff)
|
421
|
-
end
|
44
|
+
class << self
|
422
45
|
|
423
|
-
|
424
|
-
|
46
|
+
def manual(*args)
|
47
|
+
Manual.display(*args)
|
425
48
|
end
|
426
49
|
|
427
|
-
|
428
|
-
|
429
|
-
end
|
430
|
-
|
431
|
-
bom_encoding UTF_7 do
|
432
|
-
starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
|
433
|
-
end
|
434
|
-
|
435
|
-
bom_encoding UTF_EBCDIC do
|
436
|
-
starts_with?(0xdd, 0x73, 0x66, 0x73)
|
437
|
-
end
|
438
|
-
|
439
|
-
bom_encoding BOCU_1 do
|
440
|
-
starts_with?(0xfb, 0xee, 0x28)
|
50
|
+
def automatic(*args)
|
51
|
+
Automatic.guess(*args)
|
441
52
|
end
|
442
53
|
|
443
54
|
end
|
444
55
|
|
445
56
|
end
|
57
|
+
|
58
|
+
%w[encoding manual automatic].each { |lib|
|
59
|
+
lib = "cmess/guess_encoding/#{lib}"
|
60
|
+
require lib
|
61
|
+
}
|
@@ -0,0 +1,341 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# Contributors: #
|
14
|
+
# John Vorhauer <john@vorhauer.de> (idea and original implementation #
|
15
|
+
# for automatic encoding detection) #
|
16
|
+
# #
|
17
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
18
|
+
# terms of the GNU General Public License as published by the Free Software #
|
19
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
20
|
+
# version. #
|
21
|
+
# #
|
22
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
23
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
24
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
25
|
+
# details. #
|
26
|
+
# #
|
27
|
+
# You should have received a copy of the GNU General Public License along #
|
28
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
29
|
+
# #
|
30
|
+
###############################################################################
|
31
|
+
#++
|
32
|
+
|
33
|
+
$KCODE = 'u'
|
34
|
+
|
35
|
+
require 'yaml'
|
36
|
+
require 'iconv'
|
37
|
+
require 'stringio'
|
38
|
+
require 'forwardable'
|
39
|
+
|
40
|
+
# Tries to detect the encoding of a given input by applying several
|
41
|
+
# heuristics to determine the <b>most likely</b> candidate. If no heuristic
|
42
|
+
# catches on, resorts to Encoding::UNKNOWN.
|
43
|
+
#
|
44
|
+
# If a BOM is found, it may determine the encoding directly.
|
45
|
+
|
46
|
+
class CMess::GuessEncoding::Automatic
|
47
|
+
|
48
|
+
extend Forwardable
|
49
|
+
|
50
|
+
def_delegators self, :encoding_guessers, :supported_encoding?,
|
51
|
+
:bom_guessers, :supported_bom?
|
52
|
+
|
53
|
+
include CMess::GuessEncoding::Encoding
|
54
|
+
|
55
|
+
# Creates a converter for desired encoding (from UTF-8)
|
56
|
+
ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
|
57
|
+
|
58
|
+
# Single-byte encodings to test statistically by TEST_CHARS
|
59
|
+
TEST_ENCODINGS = [
|
60
|
+
MACINTOSH,
|
61
|
+
ISO_8859_1,
|
62
|
+
ISO_8859_15,
|
63
|
+
CP1252,
|
64
|
+
CP850,
|
65
|
+
MS_ANSI
|
66
|
+
]
|
67
|
+
|
68
|
+
# Certain (non-ASCII) chars to test for in TEST_ENCODINGS
|
69
|
+
CHARS_TO_TEST = (
|
70
|
+
'€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
|
71
|
+
'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
|
72
|
+
).split(//)
|
73
|
+
|
74
|
+
# Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
|
75
|
+
TEST_CHARS = Hash.new { |hash, encoding|
|
76
|
+
encoding = Encoding.get_or_set_encoding_const(encoding)
|
77
|
+
encchars = CHARS_TO_TEST.map { |char|
|
78
|
+
begin
|
79
|
+
byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
|
80
|
+
rescue Iconv::IllegalSequence
|
81
|
+
end
|
82
|
+
}.compact
|
83
|
+
|
84
|
+
TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
|
85
|
+
hash[encoding] = encchars
|
86
|
+
}.update(YAML.load_file(
|
87
|
+
File.join(File.dirname(__FILE__), *%w[.. .. .. data test_chars.yaml])
|
88
|
+
))
|
89
|
+
|
90
|
+
# Relative count of TEST_CHARS must exceed this threshold to yield
|
91
|
+
# a direct match
|
92
|
+
TEST_THRESHOLD_DIRECT = 0.1
|
93
|
+
|
94
|
+
# Relative count of TEST_CHARS must exceed this threshold to yield
|
95
|
+
# an approximate match
|
96
|
+
TEST_THRESHOLD_APPROX = 0.0004
|
97
|
+
|
98
|
+
@supported_encodings = []
|
99
|
+
@encoding_guessers = []
|
100
|
+
@supported_boms = []
|
101
|
+
@bom_guessers = []
|
102
|
+
|
103
|
+
class << self
|
104
|
+
|
105
|
+
attr_reader :supported_encodings, :encoding_guessers,
|
106
|
+
:supported_boms, :bom_guessers
|
107
|
+
|
108
|
+
def guess(input, chunk_size = nil, ignore_bom = false)
|
109
|
+
new(input, chunk_size).guess(ignore_bom)
|
110
|
+
end
|
111
|
+
|
112
|
+
private
|
113
|
+
|
114
|
+
def encoding(encoding, &condition_block)
|
115
|
+
encoding_block = lambda {
|
116
|
+
encoding if instance_eval(&condition_block)
|
117
|
+
}
|
118
|
+
|
119
|
+
encodings(encoding, &encoding_block)
|
120
|
+
end
|
121
|
+
|
122
|
+
def encodings(*encodings, &encoding_block)
|
123
|
+
encodings.each { |encoding|
|
124
|
+
@supported_encodings << encoding
|
125
|
+
@encoding_guessers << encoding_block \
|
126
|
+
unless @encoding_guessers.include?(encoding_block)
|
127
|
+
}
|
128
|
+
end
|
129
|
+
|
130
|
+
def supported_encoding?(encoding)
|
131
|
+
supported_encodings.include?(encoding)
|
132
|
+
end
|
133
|
+
|
134
|
+
def bom_encoding(encoding, &condition_block)
|
135
|
+
encoding_block = lambda {
|
136
|
+
encoding if instance_eval(&condition_block)
|
137
|
+
}
|
138
|
+
|
139
|
+
@supported_boms << encoding
|
140
|
+
@bom_guessers << encoding_block \
|
141
|
+
unless @bom_guessers.include?(encoding_block)
|
142
|
+
end
|
143
|
+
|
144
|
+
def supported_bom?(encoding)
|
145
|
+
supported_boms.include?(encoding)
|
146
|
+
end
|
147
|
+
|
148
|
+
end
|
149
|
+
|
150
|
+
attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte
|
151
|
+
|
152
|
+
def initialize(input, chunk_size = nil)
|
153
|
+
@input = case input
|
154
|
+
when IO # that's what we want
|
155
|
+
input
|
156
|
+
when String # convert it to an IO
|
157
|
+
StringIO.new(input)
|
158
|
+
else # um, what's that...?
|
159
|
+
raise ArgumentError, "don't know how to handle input of type #{input.class}"
|
160
|
+
end
|
161
|
+
|
162
|
+
@chunk_size = chunk_size
|
163
|
+
end
|
164
|
+
|
165
|
+
def guess(ignore_bom = false)
|
166
|
+
return bom if bom && !ignore_bom
|
167
|
+
|
168
|
+
while read
|
169
|
+
encoding_guessers.each { |block|
|
170
|
+
encoding = instance_eval(&block)
|
171
|
+
return encoding if encoding && supported_encoding?(encoding)
|
172
|
+
}
|
173
|
+
end
|
174
|
+
|
175
|
+
# nothing suitable found :-(
|
176
|
+
UNKNOWN
|
177
|
+
end
|
178
|
+
|
179
|
+
def bom
|
180
|
+
@bom ||= check_bom
|
181
|
+
end
|
182
|
+
|
183
|
+
private
|
184
|
+
|
185
|
+
def eof?
|
186
|
+
input.eof?
|
187
|
+
end
|
188
|
+
|
189
|
+
def check_bom
|
190
|
+
return if eof?
|
191
|
+
|
192
|
+
# prevent "Illegal seek" error inside a pipe
|
193
|
+
begin
|
194
|
+
input.pos
|
195
|
+
rescue Errno::ESPIPE
|
196
|
+
return
|
197
|
+
end
|
198
|
+
|
199
|
+
bom_guessers.each { |block|
|
200
|
+
encoding = instance_eval(&block)
|
201
|
+
return encoding if encoding && supported_bom?(encoding)
|
202
|
+
|
203
|
+
# read bytes don't build a BOM, so rewind...
|
204
|
+
input.rewind
|
205
|
+
}
|
206
|
+
|
207
|
+
# nothing suitable found :-(
|
208
|
+
nil
|
209
|
+
end
|
210
|
+
|
211
|
+
def next_byte
|
212
|
+
input.read(1).unpack('C').first
|
213
|
+
end
|
214
|
+
|
215
|
+
def starts_with?(*bytes)
|
216
|
+
bytes.all? { |byte|
|
217
|
+
next_byte == byte
|
218
|
+
}
|
219
|
+
end
|
220
|
+
|
221
|
+
def next_one_of?(*bytes)
|
222
|
+
bytes.include?(next_byte)
|
223
|
+
end
|
224
|
+
|
225
|
+
def read(chunk_size = chunk_size)
|
226
|
+
# => initialize counters
|
227
|
+
@byte_count ||= Hash.new(0)
|
228
|
+
@byte_total ||= 0
|
229
|
+
|
230
|
+
return if eof?
|
231
|
+
|
232
|
+
bytes_before = @byte_total
|
233
|
+
|
234
|
+
input.read(chunk_size).each_byte { |byte|
|
235
|
+
@byte_count[byte] += 1
|
236
|
+
@byte_total += 1
|
237
|
+
|
238
|
+
@first_byte ||= byte
|
239
|
+
}
|
240
|
+
|
241
|
+
@byte_total > bytes_before
|
242
|
+
end
|
243
|
+
|
244
|
+
def byte_count_sum(*bytes)
|
245
|
+
bytes = *bytes # treat arrays/ranges and lists alike
|
246
|
+
bytes.inject(0) { |sum, n| sum + byte_count[n] }
|
247
|
+
end
|
248
|
+
|
249
|
+
def relative_byte_count(count)
|
250
|
+
count.to_f / byte_total
|
251
|
+
end
|
252
|
+
|
253
|
+
### Definition of guessing heuristics. Order matters!
|
254
|
+
|
255
|
+
# ASCII, if all bytes are within the lower 128 bytes
|
256
|
+
# (Unfortunately, we have to read the *whole* file to make that decision)
|
257
|
+
encoding ASCII do
|
258
|
+
eof? && byte_count_sum(0x0..0x7f) == byte_total
|
259
|
+
end
|
260
|
+
|
261
|
+
# UTF-16, if lots of NULL bytes present
|
262
|
+
encodings UTF_16BE, UTF_16LE, UTF_16 do
|
263
|
+
if relative_byte_count(byte_count[0]) > 0.25
|
264
|
+
case first_byte
|
265
|
+
when 0x0: UTF_32
|
266
|
+
when 0xfe: UTF_16BE
|
267
|
+
when 0xff: UTF_16LE
|
268
|
+
else UTF_16
|
269
|
+
end
|
270
|
+
end
|
271
|
+
end
|
272
|
+
|
273
|
+
# UTF-8, if number of escape-bytes and following bytes
|
274
|
+
# is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
|
275
|
+
encoding UTF_8 do
|
276
|
+
esc_bytes = byte_count_sum(0xc0..0xdf) \
|
277
|
+
# => 110xxxxx 10xxxxxx
|
278
|
+
+ byte_count_sum(0xe0..0xef) * 2 \
|
279
|
+
# => 1110xxxx 10xxxxxx 10xxxxxx
|
280
|
+
+ byte_count_sum(0xf0..0xf7) * 3
|
281
|
+
# => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
282
|
+
fol_bytes = byte_count_sum(0x80..0xbf)
|
283
|
+
# => 10xxxxxx
|
284
|
+
|
285
|
+
esc_bytes > 0 && esc_bytes == fol_bytes
|
286
|
+
end
|
287
|
+
|
288
|
+
# Analyse statistical appearance of German umlauts and other accented
|
289
|
+
# letters (see TEST_CHARS)
|
290
|
+
encodings *TEST_ENCODINGS do
|
291
|
+
ratios = {}
|
292
|
+
|
293
|
+
TEST_ENCODINGS.find(lambda {
|
294
|
+
ratio, encoding = ratios.sort.last
|
295
|
+
encoding if ratio >= TEST_THRESHOLD_APPROX
|
296
|
+
}) { |encoding|
|
297
|
+
ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
|
298
|
+
#p [encoding, ratio]
|
299
|
+
ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
|
300
|
+
}
|
301
|
+
end
|
302
|
+
|
303
|
+
### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
|
304
|
+
|
305
|
+
bom_encoding UTF_8 do
|
306
|
+
starts_with?(0xef, 0xbb, 0xbf)
|
307
|
+
end
|
308
|
+
|
309
|
+
bom_encoding UTF_16BE do
|
310
|
+
starts_with?(0xfe, 0xff)
|
311
|
+
end
|
312
|
+
|
313
|
+
bom_encoding UTF_16LE do
|
314
|
+
starts_with?(0xff, 0xfe)
|
315
|
+
end
|
316
|
+
|
317
|
+
bom_encoding UTF_32BE do
|
318
|
+
starts_with?(0x00, 0x00, 0xfe, 0xff)
|
319
|
+
end
|
320
|
+
|
321
|
+
bom_encoding UTF_32LE do
|
322
|
+
starts_with?(0xff, 0xfe, 0x00, 0x00)
|
323
|
+
end
|
324
|
+
|
325
|
+
bom_encoding SCSU do
|
326
|
+
starts_with?(0x0e, 0xfe, 0xff)
|
327
|
+
end
|
328
|
+
|
329
|
+
bom_encoding UTF_7 do
|
330
|
+
starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
|
331
|
+
end
|
332
|
+
|
333
|
+
bom_encoding UTF_EBCDIC do
|
334
|
+
starts_with?(0xdd, 0x73, 0x66, 0x73)
|
335
|
+
end
|
336
|
+
|
337
|
+
bom_encoding BOCU_1 do
|
338
|
+
starts_with?(0xfb, 0xee, 0x28)
|
339
|
+
end
|
340
|
+
|
341
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# Contributors: #
|
14
|
+
# John Vorhauer <john@vorhauer.de> (idea and original implementation #
|
15
|
+
# for automatic encoding detection) #
|
16
|
+
# #
|
17
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
18
|
+
# terms of the GNU General Public License as published by the Free Software #
|
19
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
20
|
+
# version. #
|
21
|
+
# #
|
22
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
23
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
24
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
25
|
+
# details. #
|
26
|
+
# #
|
27
|
+
# You should have received a copy of the GNU General Public License along #
|
28
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
29
|
+
# #
|
30
|
+
###############################################################################
|
31
|
+
#++
|
32
|
+
|
33
|
+
# Namespace for our encodings.
|
34
|
+
|
35
|
+
module CMess::GuessEncoding::Encoding
|
36
|
+
|
37
|
+
extend self
|
38
|
+
|
39
|
+
def const_name_for(encoding)
|
40
|
+
encoding.tr('-', '_').gsub(/\W/, '').upcase
|
41
|
+
end
|
42
|
+
|
43
|
+
def set_encoding_const(encoding, const = const_name_for(encoding))
|
44
|
+
const_set(const, encoding.freeze)
|
45
|
+
end
|
46
|
+
|
47
|
+
def get_or_set_encoding_const(encoding)
|
48
|
+
const_defined?(const = const_name_for(encoding)) ?
|
49
|
+
const_get(const) : set_encoding_const(encoding, const)
|
50
|
+
end
|
51
|
+
|
52
|
+
%w[
|
53
|
+
UNKNOWN ASCII MACINTOSH
|
54
|
+
ISO-8859-1 ISO-8859-2 ISO-8859-15
|
55
|
+
CP1250 CP1251 CP1252 CP850 CP852 CP856
|
56
|
+
UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
|
57
|
+
UTF-7 UTF-EBCDIC SCSU BOCU-1
|
58
|
+
ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
|
59
|
+
].each { |encoding| set_encoding_const(encoding) }
|
60
|
+
|
61
|
+
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# Contributors: #
|
14
|
+
# John Vorhauer <john@vorhauer.de> (idea and original implementation #
|
15
|
+
# for automatic encoding detection) #
|
16
|
+
# #
|
17
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
18
|
+
# terms of the GNU General Public License as published by the Free Software #
|
19
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
20
|
+
# version. #
|
21
|
+
# #
|
22
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
23
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
24
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
25
|
+
# details. #
|
26
|
+
# #
|
27
|
+
# You should have received a copy of the GNU General Public License along #
|
28
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
29
|
+
# #
|
30
|
+
###############################################################################
|
31
|
+
#++
|
32
|
+
|
33
|
+
require 'iconv'
|
34
|
+
|
35
|
+
# Outputs the given string (or line) converted into the target encoding from
|
36
|
+
# each of various test encodings, thus allowing you to identify the (seemingly)
|
37
|
+
# correct encoding by visually comparing the output with its desired appearance.
|
38
|
+
|
39
|
+
module CMess::GuessEncoding::Manual
|
40
|
+
|
41
|
+
extend self
|
42
|
+
|
43
|
+
include CMess::GuessEncoding::Encoding
|
44
|
+
|
45
|
+
# default encodings to try
|
46
|
+
ENCODINGS = [
|
47
|
+
ISO_8859_1,
|
48
|
+
ISO_8859_2,
|
49
|
+
ISO_8859_15,
|
50
|
+
CP1250,
|
51
|
+
CP1251,
|
52
|
+
CP1252,
|
53
|
+
CP850,
|
54
|
+
CP852,
|
55
|
+
CP856,
|
56
|
+
UTF_8
|
57
|
+
]
|
58
|
+
|
59
|
+
# likely candidates to suggest to the user
|
60
|
+
CANDIDATES = [
|
61
|
+
ANSI_X34,
|
62
|
+
EBCDIC_AT_DE,
|
63
|
+
EBCDIC_US,
|
64
|
+
EUC_JP,
|
65
|
+
KOI_8,
|
66
|
+
MACINTOSH,
|
67
|
+
MS_ANSI,
|
68
|
+
SHIFT_JIS,
|
69
|
+
UTF_7,
|
70
|
+
UTF_16,
|
71
|
+
UTF_16BE,
|
72
|
+
UTF_16LE,
|
73
|
+
UTF_32,
|
74
|
+
UTF_32BE,
|
75
|
+
UTF_32LE
|
76
|
+
]
|
77
|
+
|
78
|
+
def display(input, target_encoding, encodings = nil, additional_encodings = [])
|
79
|
+
target = target_encoding
|
80
|
+
|
81
|
+
encodings = (encodings || ENCODINGS) + additional_encodings
|
82
|
+
encodings = encodings.reverse.uniq.reverse # uniq with additional encodings
|
83
|
+
# staying at the end
|
84
|
+
encodings = [target] + (encodings - [target]) # move target encoding to front
|
85
|
+
|
86
|
+
max_length = encodings.map { |encoding| encoding.length }.max
|
87
|
+
|
88
|
+
encodings.each { |encoding|
|
89
|
+
converted = begin
|
90
|
+
Iconv.conv(target, encoding, input)
|
91
|
+
rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
|
92
|
+
"ILLEGAL INPUT SEQUENCE: #{err}"
|
93
|
+
rescue Iconv::InvalidEncoding
|
94
|
+
if encoding == target
|
95
|
+
abort "Invalid encoding: #{encoding}"
|
96
|
+
else
|
97
|
+
"INVALID ENCODING!"
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
puts "%-#{max_length}s : %s" % [encoding, converted]
|
102
|
+
}
|
103
|
+
end
|
104
|
+
|
105
|
+
end
|
data/lib/cmess/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cmess
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.8.274
|
4
|
+
version: 0.0.9.276
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jens Wille
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-08-14 00:00:00 +02:00
|
12
|
+
date: 2008-08-15 00:00:00 +02:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -51,6 +51,9 @@ files:
|
|
51
51
|
- lib/cmess/cli.rb
|
52
52
|
- lib/cmess/cinderella.rb
|
53
53
|
- lib/cmess/decode_entities.rb
|
54
|
+
- lib/cmess/guess_encoding/manual.rb
|
55
|
+
- lib/cmess/guess_encoding/encoding.rb
|
56
|
+
- lib/cmess/guess_encoding/automatic.rb
|
54
57
|
- bin/cinderella
|
55
58
|
- bin/decode_entities
|
56
59
|
- bin/guess_encoding
|