cmess 0.0.8.274 → 0.0.9.276
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +7 -0
- data/README +1 -1
- data/lib/cmess/guess_encoding.rb +14 -398
- data/lib/cmess/guess_encoding/automatic.rb +341 -0
- data/lib/cmess/guess_encoding/encoding.rb +61 -0
- data/lib/cmess/guess_encoding/manual.rb +105 -0
- data/lib/cmess/version.rb +1 -1
- metadata +5 -2
data/ChangeLog
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
= Revision history for cmess
|
|
2
2
|
|
|
3
|
+
== 0.0.9 [2008-08-15]
|
|
4
|
+
|
|
5
|
+
* Reorganized file structure for guess_encoding
|
|
6
|
+
* Added shortcuts GuessEncoding.manual/.automatic
|
|
7
|
+
* GuessEncoding::Automatic now also takes a String
|
|
8
|
+
as input (will be converted to a StringIO)
|
|
9
|
+
|
|
3
10
|
== 0.0.8 [2008-08-14]
|
|
4
11
|
|
|
5
12
|
* Require 'cmess' inside libs, so the user doesn't have to
|
data/README
CHANGED
data/lib/cmess/guess_encoding.rb
CHANGED
|
@@ -30,416 +30,32 @@
|
|
|
30
30
|
###############################################################################
|
|
31
31
|
#++
|
|
32
32
|
|
|
33
|
-
$KCODE = 'u'
|
|
34
|
-
|
|
35
|
-
require 'yaml'
|
|
36
|
-
require 'iconv'
|
|
37
|
-
require 'forwardable'
|
|
38
|
-
|
|
39
33
|
require 'cmess'
|
|
40
34
|
|
|
41
|
-
#
|
|
42
|
-
#
|
|
43
|
-
#
|
|
44
|
-
#
|
|
45
|
-
# In addition to that manual procedure, may be used to detect the encoding
|
|
46
|
-
# automatically. Works actually pretty good -- for the supported encodings
|
|
47
|
-
# (see Automatic for details).
|
|
35
|
+
# Allows guessing an input's encoding either manually or automatically.
|
|
36
|
+
# Actually works pretty well -- for the supported encodings. See Manual
|
|
37
|
+
# and Automatic for details.
|
|
48
38
|
|
|
49
39
|
module CMess::GuessEncoding
|
|
50
40
|
|
|
51
41
|
# our version ;-)
|
|
52
|
-
VERSION = '0.0.
|
|
53
|
-
|
|
54
|
-
# Namespace for our encodings.
|
|
55
|
-
module Encoding
|
|
56
|
-
|
|
57
|
-
extend self
|
|
58
|
-
|
|
59
|
-
def const_name_for(encoding)
|
|
60
|
-
encoding.tr('-', '_').gsub(/\W/, '').upcase
|
|
61
|
-
end
|
|
62
|
-
|
|
63
|
-
def set_encoding_const(encoding, const = const_name_for(encoding))
|
|
64
|
-
const_set(const, encoding.freeze)
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
def get_or_set_encoding_const(encoding)
|
|
68
|
-
const_defined?(const = const_name_for(encoding)) ? const_get(const) :
|
|
69
|
-
set_encoding_const(encoding, const)
|
|
70
|
-
end
|
|
71
|
-
|
|
72
|
-
%w[
|
|
73
|
-
UNKNOWN ASCII MACINTOSH
|
|
74
|
-
ISO-8859-1 ISO-8859-2 ISO-8859-15
|
|
75
|
-
CP1250 CP1251 CP1252 CP850 CP852 CP856
|
|
76
|
-
UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
|
|
77
|
-
UTF-7 UTF-EBCDIC SCSU BOCU-1
|
|
78
|
-
ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
|
|
79
|
-
].each { |encoding| set_encoding_const(encoding) }
|
|
80
|
-
|
|
81
|
-
end
|
|
82
|
-
|
|
83
|
-
module Manual
|
|
84
|
-
|
|
85
|
-
extend self
|
|
86
|
-
|
|
87
|
-
include Encoding
|
|
88
|
-
|
|
89
|
-
# default encodings to try
|
|
90
|
-
ENCODINGS = [
|
|
91
|
-
ISO_8859_1,
|
|
92
|
-
ISO_8859_2,
|
|
93
|
-
ISO_8859_15,
|
|
94
|
-
CP1250,
|
|
95
|
-
CP1251,
|
|
96
|
-
CP1252,
|
|
97
|
-
CP850,
|
|
98
|
-
CP852,
|
|
99
|
-
CP856,
|
|
100
|
-
UTF_8
|
|
101
|
-
]
|
|
102
|
-
|
|
103
|
-
# likely candidates to suggest to the user
|
|
104
|
-
CANDIDATES = [
|
|
105
|
-
ANSI_X34,
|
|
106
|
-
EBCDIC_AT_DE,
|
|
107
|
-
EBCDIC_US,
|
|
108
|
-
EUC_JP,
|
|
109
|
-
KOI_8,
|
|
110
|
-
MACINTOSH,
|
|
111
|
-
MS_ANSI,
|
|
112
|
-
SHIFT_JIS,
|
|
113
|
-
UTF_7,
|
|
114
|
-
UTF_16,
|
|
115
|
-
UTF_16BE,
|
|
116
|
-
UTF_16LE,
|
|
117
|
-
UTF_32,
|
|
118
|
-
UTF_32BE,
|
|
119
|
-
UTF_32LE
|
|
120
|
-
]
|
|
121
|
-
|
|
122
|
-
def display(input, target_encoding, encodings = nil, additional_encodings = [])
|
|
123
|
-
target = target_encoding
|
|
124
|
-
|
|
125
|
-
encodings = (encodings || ENCODINGS) + additional_encodings
|
|
126
|
-
encodings = encodings.reverse.uniq.reverse # uniq with additional encodings
|
|
127
|
-
# staying at the end
|
|
128
|
-
encodings = [target] + (encodings - [target]) # move target encoding to front
|
|
129
|
-
|
|
130
|
-
max_length = encodings.map { |encoding| encoding.length }.max
|
|
131
|
-
|
|
132
|
-
encodings.each { |encoding|
|
|
133
|
-
converted = begin
|
|
134
|
-
Iconv.conv(target, encoding, input)
|
|
135
|
-
rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
|
|
136
|
-
"ILLEGAL INPUT SEQUENCE: #{err}"
|
|
137
|
-
rescue Iconv::InvalidEncoding
|
|
138
|
-
if encoding == target
|
|
139
|
-
abort "Invalid encoding: #{encoding}"
|
|
140
|
-
else
|
|
141
|
-
"INVALID ENCODING!"
|
|
142
|
-
end
|
|
143
|
-
end
|
|
144
|
-
|
|
145
|
-
puts "%-#{max_length}s : %s" % [encoding, converted]
|
|
146
|
-
}
|
|
147
|
-
end
|
|
148
|
-
|
|
149
|
-
end
|
|
150
|
-
|
|
151
|
-
# Tries to detect the encoding of a given input by applying several
|
|
152
|
-
# heuristics to determine the <b>most likely</b> candidate. If no heuristic
|
|
153
|
-
# catches on, resorts to Encoding::UNKNOWN.
|
|
154
|
-
#
|
|
155
|
-
# If a BOM is found, it may determine the encoding directly.
|
|
156
|
-
class Automatic
|
|
157
|
-
|
|
158
|
-
extend Forwardable
|
|
159
|
-
|
|
160
|
-
def_delegators self, :encoding_guessers, :supported_encoding?,
|
|
161
|
-
:bom_guessers, :supported_bom?
|
|
162
|
-
|
|
163
|
-
include Encoding
|
|
164
|
-
|
|
165
|
-
# Creates a converter for desired encoding (from UTF-8)
|
|
166
|
-
ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
|
|
167
|
-
|
|
168
|
-
# Single-byte encodings to test statistically by TEST_CHARS
|
|
169
|
-
TEST_ENCODINGS = [
|
|
170
|
-
MACINTOSH,
|
|
171
|
-
ISO_8859_1,
|
|
172
|
-
ISO_8859_15,
|
|
173
|
-
CP1252,
|
|
174
|
-
CP850,
|
|
175
|
-
MS_ANSI
|
|
176
|
-
]
|
|
177
|
-
|
|
178
|
-
# Certain (non-ASCII) chars to test for in TEST_ENCODINGS
|
|
179
|
-
CHARS_TO_TEST = (
|
|
180
|
-
'€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
|
|
181
|
-
'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
|
|
182
|
-
).split(//)
|
|
183
|
-
|
|
184
|
-
# Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
|
|
185
|
-
TEST_CHARS = Hash.new { |hash, encoding|
|
|
186
|
-
encoding = Encoding.get_or_set_encoding_const(encoding)
|
|
187
|
-
encchars = CHARS_TO_TEST.map { |char|
|
|
188
|
-
begin
|
|
189
|
-
byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
|
|
190
|
-
rescue Iconv::IllegalSequence
|
|
191
|
-
end
|
|
192
|
-
}.compact
|
|
193
|
-
|
|
194
|
-
TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
|
|
195
|
-
hash[encoding] = encchars
|
|
196
|
-
}.update(YAML.load_file(
|
|
197
|
-
File.join(File.dirname(__FILE__), '..', '..', 'data', 'test_chars.yaml')
|
|
198
|
-
))
|
|
199
|
-
|
|
200
|
-
# Relative count of TEST_CHARS must exceed this threshold to yield
|
|
201
|
-
# a direct match
|
|
202
|
-
TEST_THRESHOLD_DIRECT = 0.1
|
|
203
|
-
|
|
204
|
-
# Relative count of TEST_CHARS must exceed this threshold to yield
|
|
205
|
-
# an approximate match
|
|
206
|
-
TEST_THRESHOLD_APPROX = 0.0004
|
|
207
|
-
|
|
208
|
-
@supported_encodings = []
|
|
209
|
-
@encoding_guessers = []
|
|
210
|
-
@supported_boms = []
|
|
211
|
-
@bom_guessers = []
|
|
212
|
-
|
|
213
|
-
class << self
|
|
214
|
-
|
|
215
|
-
attr_reader :supported_encodings, :encoding_guessers,
|
|
216
|
-
:supported_boms, :bom_guessers
|
|
217
|
-
|
|
218
|
-
def guess(input, chunk_size = nil, ignore_bom = false)
|
|
219
|
-
new(input, chunk_size).guess(ignore_bom)
|
|
220
|
-
end
|
|
221
|
-
|
|
222
|
-
private
|
|
223
|
-
|
|
224
|
-
def encoding(encoding, &condition_block)
|
|
225
|
-
encoding_block = lambda {
|
|
226
|
-
encoding if instance_eval(&condition_block)
|
|
227
|
-
}
|
|
228
|
-
|
|
229
|
-
encodings(encoding, &encoding_block)
|
|
230
|
-
end
|
|
231
|
-
|
|
232
|
-
def encodings(*encodings, &encoding_block)
|
|
233
|
-
encodings.each { |encoding|
|
|
234
|
-
@supported_encodings << encoding
|
|
235
|
-
@encoding_guessers << encoding_block \
|
|
236
|
-
unless @encoding_guessers.include?(encoding_block)
|
|
237
|
-
}
|
|
238
|
-
end
|
|
239
|
-
|
|
240
|
-
def supported_encoding?(encoding)
|
|
241
|
-
supported_encodings.include?(encoding)
|
|
242
|
-
end
|
|
243
|
-
|
|
244
|
-
def bom_encoding(encoding, &condition_block)
|
|
245
|
-
encoding_block = lambda {
|
|
246
|
-
encoding if instance_eval(&condition_block)
|
|
247
|
-
}
|
|
248
|
-
|
|
249
|
-
@supported_boms << encoding
|
|
250
|
-
@bom_guessers << encoding_block \
|
|
251
|
-
unless @bom_guessers.include?(encoding_block)
|
|
252
|
-
end
|
|
253
|
-
|
|
254
|
-
def supported_bom?(encoding)
|
|
255
|
-
supported_boms.include?(encoding)
|
|
256
|
-
end
|
|
257
|
-
|
|
258
|
-
end
|
|
259
|
-
|
|
260
|
-
attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte
|
|
261
|
-
|
|
262
|
-
def initialize(input, chunk_size = nil)
|
|
263
|
-
@input = input
|
|
264
|
-
@chunk_size = chunk_size
|
|
265
|
-
end
|
|
266
|
-
|
|
267
|
-
def guess(ignore_bom = false)
|
|
268
|
-
return bom if bom && !ignore_bom
|
|
269
|
-
|
|
270
|
-
while read
|
|
271
|
-
encoding_guessers.each { |block|
|
|
272
|
-
encoding = instance_eval(&block)
|
|
273
|
-
return encoding if encoding && supported_encoding?(encoding)
|
|
274
|
-
}
|
|
275
|
-
end
|
|
42
|
+
VERSION = '0.0.7'
|
|
276
43
|
|
|
277
|
-
|
|
278
|
-
UNKNOWN
|
|
279
|
-
end
|
|
280
|
-
|
|
281
|
-
def bom
|
|
282
|
-
@bom ||= check_bom
|
|
283
|
-
end
|
|
284
|
-
|
|
285
|
-
private
|
|
286
|
-
|
|
287
|
-
def eof?
|
|
288
|
-
input.eof?
|
|
289
|
-
end
|
|
290
|
-
|
|
291
|
-
def check_bom
|
|
292
|
-
return if eof?
|
|
293
|
-
|
|
294
|
-
# prevent "Illegal seek" error inside a pipe
|
|
295
|
-
begin
|
|
296
|
-
input.pos
|
|
297
|
-
rescue Errno::ESPIPE
|
|
298
|
-
return
|
|
299
|
-
end
|
|
300
|
-
|
|
301
|
-
bom_guessers.each { |block|
|
|
302
|
-
encoding = instance_eval(&block)
|
|
303
|
-
return encoding if encoding && supported_bom?(encoding)
|
|
304
|
-
|
|
305
|
-
# read bytes don't build a BOM, so rewind...
|
|
306
|
-
input.rewind
|
|
307
|
-
}
|
|
308
|
-
|
|
309
|
-
# nothing suitable found :-(
|
|
310
|
-
nil
|
|
311
|
-
end
|
|
312
|
-
|
|
313
|
-
def next_byte
|
|
314
|
-
input.read(1).unpack('C').first
|
|
315
|
-
end
|
|
316
|
-
|
|
317
|
-
def starts_with?(*bytes)
|
|
318
|
-
bytes.all? { |byte|
|
|
319
|
-
next_byte == byte
|
|
320
|
-
}
|
|
321
|
-
end
|
|
322
|
-
|
|
323
|
-
def next_one_of?(*bytes)
|
|
324
|
-
bytes.include?(next_byte)
|
|
325
|
-
end
|
|
326
|
-
|
|
327
|
-
def read(chunk_size = chunk_size)
|
|
328
|
-
# => initialize counters
|
|
329
|
-
@byte_count ||= Hash.new(0)
|
|
330
|
-
@byte_total ||= 0
|
|
331
|
-
|
|
332
|
-
return if eof?
|
|
333
|
-
|
|
334
|
-
bytes_before = @byte_total
|
|
335
|
-
|
|
336
|
-
input.read(chunk_size).each_byte { |byte|
|
|
337
|
-
@byte_count[byte] += 1
|
|
338
|
-
@byte_total += 1
|
|
339
|
-
|
|
340
|
-
@first_byte ||= byte
|
|
341
|
-
}
|
|
342
|
-
|
|
343
|
-
@byte_total > bytes_before
|
|
344
|
-
end
|
|
345
|
-
|
|
346
|
-
def byte_count_sum(*bytes)
|
|
347
|
-
bytes = *bytes # treat arrays/ranges and lists alike
|
|
348
|
-
bytes.inject(0) { |sum, n| sum + byte_count[n] }
|
|
349
|
-
end
|
|
350
|
-
|
|
351
|
-
def relative_byte_count(count)
|
|
352
|
-
count.to_f / byte_total
|
|
353
|
-
end
|
|
354
|
-
|
|
355
|
-
### Definition of guessing heuristics. Order matters!
|
|
356
|
-
|
|
357
|
-
# ASCII, if all bytes are within the lower 128 bytes
|
|
358
|
-
# (Unfortunately, we have to read the *whole* file to make that decision)
|
|
359
|
-
encoding ASCII do
|
|
360
|
-
eof? && byte_count_sum(0x0..0x7f) == byte_total
|
|
361
|
-
end
|
|
362
|
-
|
|
363
|
-
# UTF-16, if lots of NULL bytes present
|
|
364
|
-
encodings UTF_16BE, UTF_16LE, UTF_16 do
|
|
365
|
-
if relative_byte_count(byte_count[0]) > 0.25
|
|
366
|
-
case first_byte
|
|
367
|
-
when 0x0: UTF_32
|
|
368
|
-
when 0xfe: UTF_16BE
|
|
369
|
-
when 0xff: UTF_16LE
|
|
370
|
-
else UTF_16
|
|
371
|
-
end
|
|
372
|
-
end
|
|
373
|
-
end
|
|
374
|
-
|
|
375
|
-
# UTF-8, if number of escape-bytes and following bytes
|
|
376
|
-
# is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
|
|
377
|
-
encoding UTF_8 do
|
|
378
|
-
esc_bytes = byte_count_sum(0xc0..0xdf) \
|
|
379
|
-
# => 110xxxxx 10xxxxxx
|
|
380
|
-
+ byte_count_sum(0xe0..0xef) * 2 \
|
|
381
|
-
# => 1110xxxx 10xxxxxx 10xxxxxx
|
|
382
|
-
+ byte_count_sum(0xf0..0xf7) * 3
|
|
383
|
-
# => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
384
|
-
fol_bytes = byte_count_sum(0x80..0xbf)
|
|
385
|
-
# => 10xxxxxx
|
|
386
|
-
|
|
387
|
-
esc_bytes > 0 && esc_bytes == fol_bytes
|
|
388
|
-
end
|
|
389
|
-
|
|
390
|
-
# Analyse statistical appearance of German umlauts and other accented
|
|
391
|
-
# letters (see TEST_CHARS)
|
|
392
|
-
encodings *TEST_ENCODINGS do
|
|
393
|
-
ratios = {}
|
|
394
|
-
|
|
395
|
-
TEST_ENCODINGS.find(lambda {
|
|
396
|
-
ratio, encoding = ratios.sort.last
|
|
397
|
-
encoding if ratio >= TEST_THRESHOLD_APPROX
|
|
398
|
-
}) { |encoding|
|
|
399
|
-
ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
|
|
400
|
-
#p [encoding, ratio]
|
|
401
|
-
ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
|
|
402
|
-
}
|
|
403
|
-
end
|
|
404
|
-
|
|
405
|
-
### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
|
|
406
|
-
|
|
407
|
-
bom_encoding UTF_8 do
|
|
408
|
-
starts_with?(0xef, 0xbb, 0xbf)
|
|
409
|
-
end
|
|
410
|
-
|
|
411
|
-
bom_encoding UTF_16BE do
|
|
412
|
-
starts_with?(0xfe, 0xff)
|
|
413
|
-
end
|
|
414
|
-
|
|
415
|
-
bom_encoding UTF_16LE do
|
|
416
|
-
starts_with?(0xff, 0xfe)
|
|
417
|
-
end
|
|
418
|
-
|
|
419
|
-
bom_encoding UTF_32BE do
|
|
420
|
-
starts_with?(0x00, 0x00, 0xfe, 0xff)
|
|
421
|
-
end
|
|
44
|
+
class << self
|
|
422
45
|
|
|
423
|
-
|
|
424
|
-
|
|
46
|
+
def manual(*args)
|
|
47
|
+
Manual.display(*args)
|
|
425
48
|
end
|
|
426
49
|
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
end
|
|
430
|
-
|
|
431
|
-
bom_encoding UTF_7 do
|
|
432
|
-
starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
|
|
433
|
-
end
|
|
434
|
-
|
|
435
|
-
bom_encoding UTF_EBCDIC do
|
|
436
|
-
starts_with?(0xdd, 0x73, 0x66, 0x73)
|
|
437
|
-
end
|
|
438
|
-
|
|
439
|
-
bom_encoding BOCU_1 do
|
|
440
|
-
starts_with?(0xfb, 0xee, 0x28)
|
|
50
|
+
def automatic(*args)
|
|
51
|
+
Automatic.guess(*args)
|
|
441
52
|
end
|
|
442
53
|
|
|
443
54
|
end
|
|
444
55
|
|
|
445
56
|
end
|
|
57
|
+
|
|
58
|
+
%w[encoding manual automatic].each { |lib|
|
|
59
|
+
lib = "cmess/guess_encoding/#{lib}"
|
|
60
|
+
require lib
|
|
61
|
+
}
|
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
#--
|
|
2
|
+
###############################################################################
|
|
3
|
+
# #
|
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
|
5
|
+
# #
|
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
|
7
|
+
# Albertus-Magnus-Platz, #
|
|
8
|
+
# 50932 Cologne, Germany #
|
|
9
|
+
# #
|
|
10
|
+
# Authors: #
|
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
|
12
|
+
# #
|
|
13
|
+
# Contributors: #
|
|
14
|
+
# John Vorhauer <john@vorhauer.de> (idea and original implementation #
|
|
15
|
+
# for automatic encoding detection) #
|
|
16
|
+
# #
|
|
17
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
|
18
|
+
# terms of the GNU General Public License as published by the Free Software #
|
|
19
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
|
20
|
+
# version. #
|
|
21
|
+
# #
|
|
22
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
23
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
24
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
|
25
|
+
# details. #
|
|
26
|
+
# #
|
|
27
|
+
# You should have received a copy of the GNU General Public License along #
|
|
28
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
|
29
|
+
# #
|
|
30
|
+
###############################################################################
|
|
31
|
+
#++
|
|
32
|
+
|
|
33
|
+
$KCODE = 'u'
|
|
34
|
+
|
|
35
|
+
require 'yaml'
|
|
36
|
+
require 'iconv'
|
|
37
|
+
require 'stringio'
|
|
38
|
+
require 'forwardable'
|
|
39
|
+
|
|
40
|
+
# Tries to detect the encoding of a given input by applying several
|
|
41
|
+
# heuristics to determine the <b>most likely</b> candidate. If no heuristic
|
|
42
|
+
# catches on, resorts to Encoding::UNKNOWN.
|
|
43
|
+
#
|
|
44
|
+
# If a BOM is found, it may determine the encoding directly.
|
|
45
|
+
|
|
46
|
+
class CMess::GuessEncoding::Automatic
|
|
47
|
+
|
|
48
|
+
extend Forwardable
|
|
49
|
+
|
|
50
|
+
def_delegators self, :encoding_guessers, :supported_encoding?,
|
|
51
|
+
:bom_guessers, :supported_bom?
|
|
52
|
+
|
|
53
|
+
include CMess::GuessEncoding::Encoding
|
|
54
|
+
|
|
55
|
+
# Creates a converter for desired encoding (from UTF-8)
|
|
56
|
+
ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
|
|
57
|
+
|
|
58
|
+
# Single-byte encodings to test statistically by TEST_CHARS
|
|
59
|
+
TEST_ENCODINGS = [
|
|
60
|
+
MACINTOSH,
|
|
61
|
+
ISO_8859_1,
|
|
62
|
+
ISO_8859_15,
|
|
63
|
+
CP1252,
|
|
64
|
+
CP850,
|
|
65
|
+
MS_ANSI
|
|
66
|
+
]
|
|
67
|
+
|
|
68
|
+
# Certain (non-ASCII) chars to test for in TEST_ENCODINGS
|
|
69
|
+
# The second literal walks through the accented letters in code point order,
# so the multiplication sign and the O with stroke sit between the O with
# diaeresis and the U with grave. The two literals are joined with + so that
# neither literal is mutated in place.
CHARS_TO_TEST = (
  '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' +
  'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
).split(//)
|
|
73
|
+
|
|
74
|
+
# Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
|
|
75
|
+
TEST_CHARS = Hash.new { |hash, encoding|
|
|
76
|
+
encoding = Encoding.get_or_set_encoding_const(encoding)
|
|
77
|
+
encchars = CHARS_TO_TEST.map { |char|
|
|
78
|
+
begin
|
|
79
|
+
byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
|
|
80
|
+
rescue Iconv::IllegalSequence
|
|
81
|
+
end
|
|
82
|
+
}.compact
|
|
83
|
+
|
|
84
|
+
TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
|
|
85
|
+
hash[encoding] = encchars
|
|
86
|
+
}.update(YAML.load_file(
|
|
87
|
+
File.join(File.dirname(__FILE__), *%w[.. .. .. data test_chars.yaml])
|
|
88
|
+
))
|
|
89
|
+
|
|
90
|
+
# Relative count of TEST_CHARS must exceed this threshold to yield
|
|
91
|
+
# a direct match
|
|
92
|
+
TEST_THRESHOLD_DIRECT = 0.1
|
|
93
|
+
|
|
94
|
+
# Relative count of TEST_CHARS must exceed this threshold to yield
|
|
95
|
+
# an approximate match
|
|
96
|
+
TEST_THRESHOLD_APPROX = 0.0004
|
|
97
|
+
|
|
98
|
+
@supported_encodings = []
|
|
99
|
+
@encoding_guessers = []
|
|
100
|
+
@supported_boms = []
|
|
101
|
+
@bom_guessers = []
|
|
102
|
+
|
|
103
|
+
class << self
|
|
104
|
+
|
|
105
|
+
attr_reader :supported_encodings, :encoding_guessers,
|
|
106
|
+
:supported_boms, :bom_guessers
|
|
107
|
+
|
|
108
|
+
def guess(input, chunk_size = nil, ignore_bom = false)
|
|
109
|
+
new(input, chunk_size).guess(ignore_bom)
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
private
|
|
113
|
+
|
|
114
|
+
def encoding(encoding, &condition_block)
|
|
115
|
+
encoding_block = lambda {
|
|
116
|
+
encoding if instance_eval(&condition_block)
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
encodings(encoding, &encoding_block)
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
def encodings(*encodings, &encoding_block)
|
|
123
|
+
encodings.each { |encoding|
|
|
124
|
+
@supported_encodings << encoding
|
|
125
|
+
@encoding_guessers << encoding_block \
|
|
126
|
+
unless @encoding_guessers.include?(encoding_block)
|
|
127
|
+
}
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
def supported_encoding?(encoding)
|
|
131
|
+
supported_encodings.include?(encoding)
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
def bom_encoding(encoding, &condition_block)
|
|
135
|
+
encoding_block = lambda {
|
|
136
|
+
encoding if instance_eval(&condition_block)
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
@supported_boms << encoding
|
|
140
|
+
@bom_guessers << encoding_block \
|
|
141
|
+
unless @bom_guessers.include?(encoding_block)
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
def supported_bom?(encoding)
|
|
145
|
+
supported_boms.include?(encoding)
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte
|
|
151
|
+
|
|
152
|
+
def initialize(input, chunk_size = nil)
|
|
153
|
+
@input = case input
|
|
154
|
+
when IO # that's what we want
|
|
155
|
+
input
|
|
156
|
+
when String # convert it to an IO
|
|
157
|
+
StringIO.new(input)
|
|
158
|
+
else # um, what's that...?
|
|
159
|
+
raise ArgumentError, "don't know how to handle input of type #{input.class}"
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
@chunk_size = chunk_size
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
def guess(ignore_bom = false)
|
|
166
|
+
return bom if bom && !ignore_bom
|
|
167
|
+
|
|
168
|
+
while read
|
|
169
|
+
encoding_guessers.each { |block|
|
|
170
|
+
encoding = instance_eval(&block)
|
|
171
|
+
return encoding if encoding && supported_encoding?(encoding)
|
|
172
|
+
}
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
# nothing suitable found :-(
|
|
176
|
+
UNKNOWN
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
def bom
|
|
180
|
+
@bom ||= check_bom
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
private
|
|
184
|
+
|
|
185
|
+
def eof?
|
|
186
|
+
input.eof?
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
def check_bom
|
|
190
|
+
return if eof?
|
|
191
|
+
|
|
192
|
+
# prevent "Illegal seek" error inside a pipe
|
|
193
|
+
begin
|
|
194
|
+
input.pos
|
|
195
|
+
rescue Errno::ESPIPE
|
|
196
|
+
return
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
bom_guessers.each { |block|
|
|
200
|
+
encoding = instance_eval(&block)
|
|
201
|
+
return encoding if encoding && supported_bom?(encoding)
|
|
202
|
+
|
|
203
|
+
# read bytes don't build a BOM, so rewind...
|
|
204
|
+
input.rewind
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
# nothing suitable found :-(
|
|
208
|
+
nil
|
|
209
|
+
end
|
|
210
|
+
|
|
211
|
+
# Reads a single byte from the input and returns it as an Integer.
#
# Returns nil once the input is exhausted: read(1) yields nil at EOF, and
# calling unpack on that would raise. A nil result simply fails to match in
# starts_with? and next_one_of? when the input is shorter than a BOM.
def next_byte
  chunk = input.read(1)
  chunk.unpack('C').first if chunk
end
|
|
214
|
+
|
|
215
|
+
# Consumes bytes from the input via next_byte, one per expected value, and
# reports whether they equal +bytes+ in the given order. Reading stops at the
# first mismatch.
def starts_with?(*bytes)
  bytes.each { |expected|
    return false unless next_byte == expected
  }

  true
end
|
|
220
|
+
|
|
221
|
+
def next_one_of?(*bytes)
|
|
222
|
+
bytes.include?(next_byte)
|
|
223
|
+
end
|
|
224
|
+
|
|
225
|
+
# Reads the next chunk from the input and updates the byte statistics
# (@byte_count per byte value, @byte_total, and @first_byte).
#
# chunk_size:: number of bytes to read, handed to input.read as is;
#              defaults to the instance's chunk_size
#
# Returns nil when the input is at EOF, otherwise whether any bytes were
# counted during this call.
def read(chunk_size = chunk_size())
  # The parentheses in the default force a call to the chunk_size reader;
  # a bare +chunk_size+ there refers to the parameter itself on Ruby >= 1.9
  # and would always be nil.

  # => initialize counters
  @byte_count ||= Hash.new(0)
  @byte_total ||= 0

  return if eof?

  bytes_before = @byte_total

  input.read(chunk_size).each_byte { |byte|
    @byte_count[byte] += 1
    @byte_total += 1

    # remember the very first byte seen across all chunks
    @first_byte ||= byte
  }

  @byte_total > bytes_before
end
|
|
243
|
+
|
|
244
|
+
# Sums up the counts in byte_count for the given byte values.
#
# bytes:: any mix of single byte values and collections of them (Array,
#         Range, or anything else responding to +each+)
#
# Returns the total as an Integer (0 if no bytes are given).
def byte_count_sum(*bytes)
  bytes.inject(0) { |sum, item|
    # Treat arrays/ranges and lists alike. Relying on "bytes = *bytes" to
    # unwrap a single collection only works on Ruby 1.8; later versions
    # keep the wrapping array and would look up the collection itself.
    group = item.respond_to?(:each) ? item : [item]
    group.inject(sum) { |acc, n| acc + byte_count[n] }
  }
end
|
|
248
|
+
|
|
249
|
+
def relative_byte_count(count)
|
|
250
|
+
count.to_f / byte_total
|
|
251
|
+
end
|
|
252
|
+
|
|
253
|
+
### Definition of guessing heuristics. Order matters!
|
|
254
|
+
|
|
255
|
+
# ASCII, if all bytes are within the lower 128 bytes
|
|
256
|
+
# (Unfortunately, we have to read the *whole* file to make that decision)
|
|
257
|
+
encoding ASCII do
|
|
258
|
+
eof? && byte_count_sum(0x0..0x7f) == byte_total
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
# UTF-16, if lots of NULL bytes present
|
|
262
|
+
encodings UTF_16BE, UTF_16LE, UTF_16 do
  # only consider these if more than a quarter of the bytes read are NULL
  if relative_byte_count(byte_count[0]) > 0.25
    # "when ... then" replaces the "when ...:" form, which Ruby 1.9 and
    # later no longer parse; the branches themselves are unchanged.
    #
    # NOTE(review): UTF_32 is not among the encodings registered by this
    # call -- presumably guess skips it via supported_encoding?; confirm
    # that this is intended.
    case first_byte
      when 0x0  then UTF_32
      when 0xfe then UTF_16BE
      when 0xff then UTF_16LE
      else           UTF_16
    end
  end
end
|
|
272
|
+
|
|
273
|
+
# UTF-8, if number of escape-bytes and following bytes
|
|
274
|
+
# is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
|
|
275
|
+
encoding UTF_8 do
|
|
276
|
+
esc_bytes = byte_count_sum(0xc0..0xdf) \
|
|
277
|
+
# => 110xxxxx 10xxxxxx
|
|
278
|
+
+ byte_count_sum(0xe0..0xef) * 2 \
|
|
279
|
+
# => 1110xxxx 10xxxxxx 10xxxxxx
|
|
280
|
+
+ byte_count_sum(0xf0..0xf7) * 3
|
|
281
|
+
# => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
282
|
+
fol_bytes = byte_count_sum(0x80..0xbf)
|
|
283
|
+
# => 10xxxxxx
|
|
284
|
+
|
|
285
|
+
esc_bytes > 0 && esc_bytes == fol_bytes
|
|
286
|
+
end
|
|
287
|
+
|
|
288
|
+
# Analyse statistical appearance of German umlauts and other accented
|
|
289
|
+
# letters (see TEST_CHARS)
|
|
290
|
+
encodings *TEST_ENCODINGS do
|
|
291
|
+
ratios = {}
|
|
292
|
+
|
|
293
|
+
TEST_ENCODINGS.find(lambda {
|
|
294
|
+
ratio, encoding = ratios.sort.last
|
|
295
|
+
encoding if ratio >= TEST_THRESHOLD_APPROX
|
|
296
|
+
}) { |encoding|
|
|
297
|
+
ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
|
|
298
|
+
#p [encoding, ratio]
|
|
299
|
+
ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
|
|
300
|
+
}
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
|
|
304
|
+
|
|
305
|
+
bom_encoding UTF_8 do
|
|
306
|
+
starts_with?(0xef, 0xbb, 0xbf)
|
|
307
|
+
end
|
|
308
|
+
|
|
309
|
+
bom_encoding UTF_16BE do
|
|
310
|
+
starts_with?(0xfe, 0xff)
|
|
311
|
+
end
|
|
312
|
+
|
|
313
|
+
bom_encoding UTF_16LE do
|
|
314
|
+
starts_with?(0xff, 0xfe)
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
bom_encoding UTF_32BE do
|
|
318
|
+
starts_with?(0x00, 0x00, 0xfe, 0xff)
|
|
319
|
+
end
|
|
320
|
+
|
|
321
|
+
bom_encoding UTF_32LE do
|
|
322
|
+
starts_with?(0xff, 0xfe, 0x00, 0x00)
|
|
323
|
+
end
|
|
324
|
+
|
|
325
|
+
bom_encoding SCSU do
|
|
326
|
+
starts_with?(0x0e, 0xfe, 0xff)
|
|
327
|
+
end
|
|
328
|
+
|
|
329
|
+
bom_encoding UTF_7 do
|
|
330
|
+
starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
|
|
331
|
+
end
|
|
332
|
+
|
|
333
|
+
bom_encoding UTF_EBCDIC do
|
|
334
|
+
starts_with?(0xdd, 0x73, 0x66, 0x73)
|
|
335
|
+
end
|
|
336
|
+
|
|
337
|
+
bom_encoding BOCU_1 do
|
|
338
|
+
starts_with?(0xfb, 0xee, 0x28)
|
|
339
|
+
end
|
|
340
|
+
|
|
341
|
+
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
#--
|
|
2
|
+
###############################################################################
|
|
3
|
+
# #
|
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
|
5
|
+
# #
|
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
|
7
|
+
# Albertus-Magnus-Platz, #
|
|
8
|
+
# 50932 Cologne, Germany #
|
|
9
|
+
# #
|
|
10
|
+
# Authors: #
|
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
|
12
|
+
# #
|
|
13
|
+
# Contributors: #
|
|
14
|
+
# John Vorhauer <john@vorhauer.de> (idea and original implementation #
|
|
15
|
+
# for automatic encoding detection) #
|
|
16
|
+
# #
|
|
17
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
|
18
|
+
# terms of the GNU General Public License as published by the Free Software #
|
|
19
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
|
20
|
+
# version. #
|
|
21
|
+
# #
|
|
22
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
23
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
24
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
|
25
|
+
# details. #
|
|
26
|
+
# #
|
|
27
|
+
# You should have received a copy of the GNU General Public License along #
|
|
28
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
|
29
|
+
# #
|
|
30
|
+
###############################################################################
|
|
31
|
+
#++
|
|
32
|
+
|
|
33
|
+
# Namespace for our encodings.
|
|
34
|
+
|
|
35
|
+
module CMess::GuessEncoding::Encoding
|
|
36
|
+
|
|
37
|
+
extend self
|
|
38
|
+
|
|
39
|
+
# Derives a constant name from an encoding name: dashes become underscores,
# any remaining non-word characters are dropped, and the result is upcased.
def const_name_for(encoding)
  underscored = encoding.tr('-', '_')
  word_chars  = underscored.gsub(/\W/, '')

  word_chars.upcase
end
|
|
42
|
+
|
|
43
|
+
def set_encoding_const(encoding, const = const_name_for(encoding))
|
|
44
|
+
const_set(const, encoding.freeze)
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def get_or_set_encoding_const(encoding)
|
|
48
|
+
const_defined?(const = const_name_for(encoding)) ?
|
|
49
|
+
const_get(const) : set_encoding_const(encoding, const)
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
%w[
|
|
53
|
+
UNKNOWN ASCII MACINTOSH
|
|
54
|
+
ISO-8859-1 ISO-8859-2 ISO-8859-15
|
|
55
|
+
CP1250 CP1251 CP1252 CP850 CP852 CP856
|
|
56
|
+
UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
|
|
57
|
+
UTF-7 UTF-EBCDIC SCSU BOCU-1
|
|
58
|
+
ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
|
|
59
|
+
].each { |encoding| set_encoding_const(encoding) }
|
|
60
|
+
|
|
61
|
+
end
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
#--
|
|
2
|
+
###############################################################################
|
|
3
|
+
# #
|
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
|
5
|
+
# #
|
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
|
7
|
+
# Albertus-Magnus-Platz, #
|
|
8
|
+
# 50932 Cologne, Germany #
|
|
9
|
+
# #
|
|
10
|
+
# Authors: #
|
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
|
12
|
+
# #
|
|
13
|
+
# Contributors: #
|
|
14
|
+
# John Vorhauer <john@vorhauer.de> (idea and original implementation #
|
|
15
|
+
# for automatic encoding detection) #
|
|
16
|
+
# #
|
|
17
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
|
18
|
+
# terms of the GNU General Public License as published by the Free Software #
|
|
19
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
|
20
|
+
# version. #
|
|
21
|
+
# #
|
|
22
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
23
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
24
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
|
25
|
+
# details. #
|
|
26
|
+
# #
|
|
27
|
+
# You should have received a copy of the GNU General Public License along #
|
|
28
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
|
29
|
+
# #
|
|
30
|
+
###############################################################################
|
|
31
|
+
#++
|
|
32
|
+
|
|
33
|
+
require 'iconv'
|
|
34
|
+
|
|
35
|
+
# Outputs given string (or line), being encoded in target encoding, encoded in
|
|
36
|
+
# various test encodings, thus allowing to identify the (seemingly) correct
|
|
37
|
+
# encoding by visually comparing the input string with its desired appearance.
|
|
38
|
+
|
|
39
|
+
module CMess::GuessEncoding::Manual
|
|
40
|
+
|
|
41
|
+
extend self
|
|
42
|
+
|
|
43
|
+
include CMess::GuessEncoding::Encoding
|
|
44
|
+
|
|
45
|
+
# default encodings to try
|
|
46
|
+
ENCODINGS = [
|
|
47
|
+
ISO_8859_1,
|
|
48
|
+
ISO_8859_2,
|
|
49
|
+
ISO_8859_15,
|
|
50
|
+
CP1250,
|
|
51
|
+
CP1251,
|
|
52
|
+
CP1252,
|
|
53
|
+
CP850,
|
|
54
|
+
CP852,
|
|
55
|
+
CP856,
|
|
56
|
+
UTF_8
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
# likely candidates to suggest to the user
|
|
60
|
+
CANDIDATES = [
|
|
61
|
+
ANSI_X34,
|
|
62
|
+
EBCDIC_AT_DE,
|
|
63
|
+
EBCDIC_US,
|
|
64
|
+
EUC_JP,
|
|
65
|
+
KOI_8,
|
|
66
|
+
MACINTOSH,
|
|
67
|
+
MS_ANSI,
|
|
68
|
+
SHIFT_JIS,
|
|
69
|
+
UTF_7,
|
|
70
|
+
UTF_16,
|
|
71
|
+
UTF_16BE,
|
|
72
|
+
UTF_16LE,
|
|
73
|
+
UTF_32,
|
|
74
|
+
UTF_32BE,
|
|
75
|
+
UTF_32LE
|
|
76
|
+
]
|
|
77
|
+
|
|
78
|
+
def display(input, target_encoding, encodings = nil, additional_encodings = [])
|
|
79
|
+
target = target_encoding
|
|
80
|
+
|
|
81
|
+
encodings = (encodings || ENCODINGS) + additional_encodings
|
|
82
|
+
encodings = encodings.reverse.uniq.reverse # uniq with additional encodings
|
|
83
|
+
# staying at the end
|
|
84
|
+
encodings = [target] + (encodings - [target]) # move target encoding to front
|
|
85
|
+
|
|
86
|
+
max_length = encodings.map { |encoding| encoding.length }.max
|
|
87
|
+
|
|
88
|
+
encodings.each { |encoding|
|
|
89
|
+
converted = begin
|
|
90
|
+
Iconv.conv(target, encoding, input)
|
|
91
|
+
rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
|
|
92
|
+
"ILLEGAL INPUT SEQUENCE: #{err}"
|
|
93
|
+
rescue Iconv::InvalidEncoding
|
|
94
|
+
if encoding == target
|
|
95
|
+
abort "Invalid encoding: #{encoding}"
|
|
96
|
+
else
|
|
97
|
+
"INVALID ENCODING!"
|
|
98
|
+
end
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
puts "%-#{max_length}s : %s" % [encoding, converted]
|
|
102
|
+
}
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
end
|
data/lib/cmess/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: cmess
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.8.274
|
|
4
|
+
version: 0.0.9.276
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Jens Wille
|
|
@@ -9,7 +9,7 @@ autorequire:
|
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
11
|
|
|
12
|
-
date: 2008-08-14 00:00:00 +02:00
|
|
12
|
+
date: 2008-08-15 00:00:00 +02:00
|
|
13
13
|
default_executable:
|
|
14
14
|
dependencies:
|
|
15
15
|
- !ruby/object:Gem::Dependency
|
|
@@ -51,6 +51,9 @@ files:
|
|
|
51
51
|
- lib/cmess/cli.rb
|
|
52
52
|
- lib/cmess/cinderella.rb
|
|
53
53
|
- lib/cmess/decode_entities.rb
|
|
54
|
+
- lib/cmess/guess_encoding/manual.rb
|
|
55
|
+
- lib/cmess/guess_encoding/encoding.rb
|
|
56
|
+
- lib/cmess/guess_encoding/automatic.rb
|
|
54
57
|
- bin/cinderella
|
|
55
58
|
- bin/decode_entities
|
|
56
59
|
- bin/guess_encoding
|