cmess 0.0.8.274 → 0.0.9.276

Sign up to get free protection for your applications and to get access to all the features.
data/ChangeLog CHANGED
@@ -1,5 +1,12 @@
1
1
  = Revision history for cmess
2
2
 
3
+ == 0.0.9 [2008-08-15]
4
+
5
+ * Reorganized file structure for guess_encoding
6
+ * Added shortcuts GuessEncoding.manual/.automatic
7
+ * GuessEncoding::Automatic now also takes a String
8
+ as input (will be converted to a StringIO)
9
+
3
10
  == 0.0.8 [2008-08-14]
4
11
 
5
12
  * Require 'cmess' inside libs, so the user doesn't have to
data/README CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  == VERSION
4
4
 
5
- This documentation refers to cmess version 0.0.8
5
+ This documentation refers to cmess version 0.0.9
6
6
 
7
7
 
8
8
  == DESCRIPTION
@@ -30,416 +30,32 @@
30
30
  ###############################################################################
31
31
  #++
32
32
 
33
- $KCODE = 'u'
34
-
35
- require 'yaml'
36
- require 'iconv'
37
- require 'forwardable'
38
-
39
33
  require 'cmess'
40
34
 
41
- # Outputs given string (or line), being encoded in target encoding, encoded in
42
- # various test encodings, thus allowing to identify the (seemingly) correct
43
- # encoding by visually comparing the input string with its desired appearance.
44
- #
45
- # In addition to that manual procedure, may be used to detect the encoding
46
- # automatically. Works actually pretty good -- for the supported encodings
47
- # (see Automatic for details).
35
+ # Allows guessing an input's encoding either manually or automatically.
36
+ # Works pretty well -- for the supported encodings. See Manual
37
+ # and Automatic for details.
48
38
 
49
39
  module CMess::GuessEncoding
50
40
 
51
41
  # our version ;-)
52
- VERSION = '0.0.6'
53
-
54
- # Namespace for our encodings.
55
- module Encoding
56
-
57
- extend self
58
-
59
- def const_name_for(encoding)
60
- encoding.tr('-', '_').gsub(/\W/, '').upcase
61
- end
62
-
63
- def set_encoding_const(encoding, const = const_name_for(encoding))
64
- const_set(const, encoding.freeze)
65
- end
66
-
67
- def get_or_set_encoding_const(encoding)
68
- const_defined?(const = const_name_for(encoding)) ? const_get(const) :
69
- set_encoding_const(encoding, const)
70
- end
71
-
72
- %w[
73
- UNKNOWN ASCII MACINTOSH
74
- ISO-8859-1 ISO-8859-2 ISO-8859-15
75
- CP1250 CP1251 CP1252 CP850 CP852 CP856
76
- UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
77
- UTF-7 UTF-EBCDIC SCSU BOCU-1
78
- ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
79
- ].each { |encoding| set_encoding_const(encoding) }
80
-
81
- end
82
-
83
- module Manual
84
-
85
- extend self
86
-
87
- include Encoding
88
-
89
- # default encodings to try
90
- ENCODINGS = [
91
- ISO_8859_1,
92
- ISO_8859_2,
93
- ISO_8859_15,
94
- CP1250,
95
- CP1251,
96
- CP1252,
97
- CP850,
98
- CP852,
99
- CP856,
100
- UTF_8
101
- ]
102
-
103
- # likely candidates to suggest to the user
104
- CANDIDATES = [
105
- ANSI_X34,
106
- EBCDIC_AT_DE,
107
- EBCDIC_US,
108
- EUC_JP,
109
- KOI_8,
110
- MACINTOSH,
111
- MS_ANSI,
112
- SHIFT_JIS,
113
- UTF_7,
114
- UTF_16,
115
- UTF_16BE,
116
- UTF_16LE,
117
- UTF_32,
118
- UTF_32BE,
119
- UTF_32LE
120
- ]
121
-
122
- def display(input, target_encoding, encodings = nil, additional_encodings = [])
123
- target = target_encoding
124
-
125
- encodings = (encodings || ENCODINGS) + additional_encodings
126
- encodings = encodings.reverse.uniq.reverse # uniq with additional encodings
127
- # staying at the end
128
- encodings = [target] + (encodings - [target]) # move target encoding to front
129
-
130
- max_length = encodings.map { |encoding| encoding.length }.max
131
-
132
- encodings.each { |encoding|
133
- converted = begin
134
- Iconv.conv(target, encoding, input)
135
- rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
136
- "ILLEGAL INPUT SEQUENCE: #{err}"
137
- rescue Iconv::InvalidEncoding
138
- if encoding == target
139
- abort "Invalid encoding: #{encoding}"
140
- else
141
- "INVALID ENCODING!"
142
- end
143
- end
144
-
145
- puts "%-#{max_length}s : %s" % [encoding, converted]
146
- }
147
- end
148
-
149
- end
150
-
151
- # Tries to detect the encoding of a given input by applying several
152
- # heuristics to determine the <b>most likely</b> candidate. If no heuristic
153
- # catches on, resorts to Encoding::UNKNOWN.
154
- #
155
- # If a BOM is found, it may determine the encoding directly.
156
- class Automatic
157
-
158
- extend Forwardable
159
-
160
- def_delegators self, :encoding_guessers, :supported_encoding?,
161
- :bom_guessers, :supported_bom?
162
-
163
- include Encoding
164
-
165
- # Creates a converter for desired encoding (from UTF-8)
166
- ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
167
-
168
- # Single-byte encodings to test statistically by TEST_CHARS
169
- TEST_ENCODINGS = [
170
- MACINTOSH,
171
- ISO_8859_1,
172
- ISO_8859_15,
173
- CP1252,
174
- CP850,
175
- MS_ANSI
176
- ]
177
-
178
- # Certain (non-ASCII) chars to test for in TEST_ENCODINGS
179
- CHARS_TO_TEST = (
180
- '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
181
- 'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
182
- ).split(//)
183
-
184
- # Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
185
- TEST_CHARS = Hash.new { |hash, encoding|
186
- encoding = Encoding.get_or_set_encoding_const(encoding)
187
- encchars = CHARS_TO_TEST.map { |char|
188
- begin
189
- byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
190
- rescue Iconv::IllegalSequence
191
- end
192
- }.compact
193
-
194
- TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
195
- hash[encoding] = encchars
196
- }.update(YAML.load_file(
197
- File.join(File.dirname(__FILE__), '..', '..', 'data', 'test_chars.yaml')
198
- ))
199
-
200
- # Relative count of TEST_CHARS must exceed this threshold to yield
201
- # a direct match
202
- TEST_THRESHOLD_DIRECT = 0.1
203
-
204
- # Relative count of TEST_CHARS must exceed this threshold to yield
205
- # an approximate match
206
- TEST_THRESHOLD_APPROX = 0.0004
207
-
208
- @supported_encodings = []
209
- @encoding_guessers = []
210
- @supported_boms = []
211
- @bom_guessers = []
212
-
213
- class << self
214
-
215
- attr_reader :supported_encodings, :encoding_guessers,
216
- :supported_boms, :bom_guessers
217
-
218
- def guess(input, chunk_size = nil, ignore_bom = false)
219
- new(input, chunk_size).guess(ignore_bom)
220
- end
221
-
222
- private
223
-
224
- def encoding(encoding, &condition_block)
225
- encoding_block = lambda {
226
- encoding if instance_eval(&condition_block)
227
- }
228
-
229
- encodings(encoding, &encoding_block)
230
- end
231
-
232
- def encodings(*encodings, &encoding_block)
233
- encodings.each { |encoding|
234
- @supported_encodings << encoding
235
- @encoding_guessers << encoding_block \
236
- unless @encoding_guessers.include?(encoding_block)
237
- }
238
- end
239
-
240
- def supported_encoding?(encoding)
241
- supported_encodings.include?(encoding)
242
- end
243
-
244
- def bom_encoding(encoding, &condition_block)
245
- encoding_block = lambda {
246
- encoding if instance_eval(&condition_block)
247
- }
248
-
249
- @supported_boms << encoding
250
- @bom_guessers << encoding_block \
251
- unless @bom_guessers.include?(encoding_block)
252
- end
253
-
254
- def supported_bom?(encoding)
255
- supported_boms.include?(encoding)
256
- end
257
-
258
- end
259
-
260
- attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte
261
-
262
- def initialize(input, chunk_size = nil)
263
- @input = input
264
- @chunk_size = chunk_size
265
- end
266
-
267
- def guess(ignore_bom = false)
268
- return bom if bom && !ignore_bom
269
-
270
- while read
271
- encoding_guessers.each { |block|
272
- encoding = instance_eval(&block)
273
- return encoding if encoding && supported_encoding?(encoding)
274
- }
275
- end
42
+ VERSION = '0.0.7'
276
43
 
277
- # nothing suitable found :-(
278
- UNKNOWN
279
- end
280
-
281
- def bom
282
- @bom ||= check_bom
283
- end
284
-
285
- private
286
-
287
- def eof?
288
- input.eof?
289
- end
290
-
291
- def check_bom
292
- return if eof?
293
-
294
- # prevent "Illegal seek" error inside a pipe
295
- begin
296
- input.pos
297
- rescue Errno::ESPIPE
298
- return
299
- end
300
-
301
- bom_guessers.each { |block|
302
- encoding = instance_eval(&block)
303
- return encoding if encoding && supported_bom?(encoding)
304
-
305
- # read bytes don't build a BOM, so rewind...
306
- input.rewind
307
- }
308
-
309
- # nothing suitable found :-(
310
- nil
311
- end
312
-
313
- def next_byte
314
- input.read(1).unpack('C').first
315
- end
316
-
317
- def starts_with?(*bytes)
318
- bytes.all? { |byte|
319
- next_byte == byte
320
- }
321
- end
322
-
323
- def next_one_of?(*bytes)
324
- bytes.include?(next_byte)
325
- end
326
-
327
- def read(chunk_size = chunk_size)
328
- # => initialize counters
329
- @byte_count ||= Hash.new(0)
330
- @byte_total ||= 0
331
-
332
- return if eof?
333
-
334
- bytes_before = @byte_total
335
-
336
- input.read(chunk_size).each_byte { |byte|
337
- @byte_count[byte] += 1
338
- @byte_total += 1
339
-
340
- @first_byte ||= byte
341
- }
342
-
343
- @byte_total > bytes_before
344
- end
345
-
346
- def byte_count_sum(*bytes)
347
- bytes = *bytes # treat arrays/ranges and lists alike
348
- bytes.inject(0) { |sum, n| sum + byte_count[n] }
349
- end
350
-
351
- def relative_byte_count(count)
352
- count.to_f / byte_total
353
- end
354
-
355
- ### Definition of guessing heuristics. Order matters!
356
-
357
- # ASCII, if all bytes are within the lower 128 bytes
358
- # (Unfortunately, we have to read the *whole* file to make that decision)
359
- encoding ASCII do
360
- eof? && byte_count_sum(0x0..0x7f) == byte_total
361
- end
362
-
363
- # UTF-16, if lots of NULL bytes present
364
- encodings UTF_16BE, UTF_16LE, UTF_16 do
365
- if relative_byte_count(byte_count[0]) > 0.25
366
- case first_byte
367
- when 0x0: UTF_32
368
- when 0xfe: UTF_16BE
369
- when 0xff: UTF_16LE
370
- else UTF_16
371
- end
372
- end
373
- end
374
-
375
- # UTF-8, if number of escape-bytes and following bytes
376
- # is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
377
- encoding UTF_8 do
378
- esc_bytes = byte_count_sum(0xc0..0xdf) \
379
- # => 110xxxxx 10xxxxxx
380
- + byte_count_sum(0xe0..0xef) * 2 \
381
- # => 1110xxxx 10xxxxxx 10xxxxxx
382
- + byte_count_sum(0xf0..0xf7) * 3
383
- # => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
384
- fol_bytes = byte_count_sum(0x80..0xbf)
385
- # => 10xxxxxx
386
-
387
- esc_bytes > 0 && esc_bytes == fol_bytes
388
- end
389
-
390
- # Analyse statistical appearance of German umlauts and other accented
391
- # letters (see TEST_CHARS)
392
- encodings *TEST_ENCODINGS do
393
- ratios = {}
394
-
395
- TEST_ENCODINGS.find(lambda {
396
- ratio, encoding = ratios.sort.last
397
- encoding if ratio >= TEST_THRESHOLD_APPROX
398
- }) { |encoding|
399
- ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
400
- #p [encoding, ratio]
401
- ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
402
- }
403
- end
404
-
405
- ### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
406
-
407
- bom_encoding UTF_8 do
408
- starts_with?(0xef, 0xbb, 0xbf)
409
- end
410
-
411
- bom_encoding UTF_16BE do
412
- starts_with?(0xfe, 0xff)
413
- end
414
-
415
- bom_encoding UTF_16LE do
416
- starts_with?(0xff, 0xfe)
417
- end
418
-
419
- bom_encoding UTF_32BE do
420
- starts_with?(0x00, 0x00, 0xfe, 0xff)
421
- end
44
+ class << self
422
45
 
423
- bom_encoding UTF_32LE do
424
- starts_with?(0xff, 0xfe, 0x00, 0x00)
46
+ def manual(*args)
47
+ Manual.display(*args)
425
48
  end
426
49
 
427
- bom_encoding SCSU do
428
- starts_with?(0x0e, 0xfe, 0xff)
429
- end
430
-
431
- bom_encoding UTF_7 do
432
- starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
433
- end
434
-
435
- bom_encoding UTF_EBCDIC do
436
- starts_with?(0xdd, 0x73, 0x66, 0x73)
437
- end
438
-
439
- bom_encoding BOCU_1 do
440
- starts_with?(0xfb, 0xee, 0x28)
50
+ def automatic(*args)
51
+ Automatic.guess(*args)
441
52
  end
442
53
 
443
54
  end
444
55
 
445
56
  end
57
+
58
+ %w[encoding manual automatic].each { |lib|
59
+ lib = "cmess/guess_encoding/#{lib}"
60
+ require lib
61
+ }
@@ -0,0 +1,341 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007-2008 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # Contributors: #
14
+ # John Vorhauer <john@vorhauer.de> (idea and original implementation #
15
+ # for automatic encoding detection) #
16
+ # #
17
+ # cmess is free software; you can redistribute it and/or modify it under the #
18
+ # terms of the GNU General Public License as published by the Free Software #
19
+ # Foundation; either version 3 of the License, or (at your option) any later #
20
+ # version. #
21
+ # #
22
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
23
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
24
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
25
+ # details. #
26
+ # #
27
+ # You should have received a copy of the GNU General Public License along #
28
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
29
+ # #
30
+ ###############################################################################
31
+ #++
32
+
33
+ $KCODE = 'u'
34
+
35
+ require 'yaml'
36
+ require 'iconv'
37
+ require 'stringio'
38
+ require 'forwardable'
39
+
40
+ # Tries to detect the encoding of a given input by applying several
41
+ # heuristics to determine the <b>most likely</b> candidate. If no heuristic
42
+ # catches on, resorts to Encoding::UNKNOWN.
43
+ #
44
+ # If a BOM is found, it may determine the encoding directly.
45
+
46
+ class CMess::GuessEncoding::Automatic
47
+
48
+ extend Forwardable
49
+
50
+ def_delegators self, :encoding_guessers, :supported_encoding?,
51
+ :bom_guessers, :supported_bom?
52
+
53
+ include CMess::GuessEncoding::Encoding
54
+
55
+ # Creates a converter for desired encoding (from UTF-8)
56
+ ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
57
+
58
+ # Single-byte encodings to test statistically by TEST_CHARS
59
+ TEST_ENCODINGS = [
60
+ MACINTOSH,
61
+ ISO_8859_1,
62
+ ISO_8859_15,
63
+ CP1252,
64
+ CP850,
65
+ MS_ANSI
66
+ ]
67
+
68
+ # Certain (non-ASCII) chars to test for in TEST_ENCODINGS
69
+ CHARS_TO_TEST = (
70
+ '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
71
+ 'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
72
+ ).split(//)
73
+
74
+ # Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
75
+ TEST_CHARS = Hash.new { |hash, encoding|
76
+ encoding = Encoding.get_or_set_encoding_const(encoding)
77
+ encchars = CHARS_TO_TEST.map { |char|
78
+ begin
79
+ byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
80
+ rescue Iconv::IllegalSequence
81
+ end
82
+ }.compact
83
+
84
+ TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
85
+ hash[encoding] = encchars
86
+ }.update(YAML.load_file(
87
+ File.join(File.dirname(__FILE__), *%w[.. .. .. data test_chars.yaml])
88
+ ))
89
+
90
+ # Relative count of TEST_CHARS must exceed this threshold to yield
91
+ # a direct match
92
+ TEST_THRESHOLD_DIRECT = 0.1
93
+
94
+ # Relative count of TEST_CHARS must exceed this threshold to yield
95
+ # an approximate match
96
+ TEST_THRESHOLD_APPROX = 0.0004
97
+
98
+ @supported_encodings = []
99
+ @encoding_guessers = []
100
+ @supported_boms = []
101
+ @bom_guessers = []
102
+
103
+ class << self
104
+
105
+ attr_reader :supported_encodings, :encoding_guessers,
106
+ :supported_boms, :bom_guessers
107
+
108
+ def guess(input, chunk_size = nil, ignore_bom = false)
109
+ new(input, chunk_size).guess(ignore_bom)
110
+ end
111
+
112
+ private
113
+
114
+ def encoding(encoding, &condition_block)
115
+ encoding_block = lambda {
116
+ encoding if instance_eval(&condition_block)
117
+ }
118
+
119
+ encodings(encoding, &encoding_block)
120
+ end
121
+
122
+ def encodings(*encodings, &encoding_block)
123
+ encodings.each { |encoding|
124
+ @supported_encodings << encoding
125
+ @encoding_guessers << encoding_block \
126
+ unless @encoding_guessers.include?(encoding_block)
127
+ }
128
+ end
129
+
130
+ def supported_encoding?(encoding)
131
+ supported_encodings.include?(encoding)
132
+ end
133
+
134
+ def bom_encoding(encoding, &condition_block)
135
+ encoding_block = lambda {
136
+ encoding if instance_eval(&condition_block)
137
+ }
138
+
139
+ @supported_boms << encoding
140
+ @bom_guessers << encoding_block \
141
+ unless @bom_guessers.include?(encoding_block)
142
+ end
143
+
144
+ def supported_bom?(encoding)
145
+ supported_boms.include?(encoding)
146
+ end
147
+
148
+ end
149
+
150
+ attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte
151
+
152
+ def initialize(input, chunk_size = nil)
153
+ @input = case input
154
+ when IO # that's what we want
155
+ input
156
+ when String # convert it to an IO
157
+ StringIO.new(input)
158
+ else # um, what's that...?
159
+ raise ArgumentError, "don't know how to handle input of type #{input.class}"
160
+ end
161
+
162
+ @chunk_size = chunk_size
163
+ end
164
+
165
+ def guess(ignore_bom = false)
166
+ return bom if bom && !ignore_bom
167
+
168
+ while read
169
+ encoding_guessers.each { |block|
170
+ encoding = instance_eval(&block)
171
+ return encoding if encoding && supported_encoding?(encoding)
172
+ }
173
+ end
174
+
175
+ # nothing suitable found :-(
176
+ UNKNOWN
177
+ end
178
+
179
+ def bom
180
+ @bom ||= check_bom
181
+ end
182
+
183
+ private
184
+
185
+ def eof?
186
+ input.eof?
187
+ end
188
+
189
+ def check_bom
190
+ return if eof?
191
+
192
+ # prevent "Illegal seek" error inside a pipe
193
+ begin
194
+ input.pos
195
+ rescue Errno::ESPIPE
196
+ return
197
+ end
198
+
199
+ bom_guessers.each { |block|
200
+ encoding = instance_eval(&block)
201
+ return encoding if encoding && supported_bom?(encoding)
202
+
203
+ # read bytes don't build a BOM, so rewind...
204
+ input.rewind
205
+ }
206
+
207
+ # nothing suitable found :-(
208
+ nil
209
+ end
210
+
211
+ def next_byte
212
+ input.read(1).unpack('C').first
213
+ end
214
+
215
+ def starts_with?(*bytes)
216
+ bytes.all? { |byte|
217
+ next_byte == byte
218
+ }
219
+ end
220
+
221
+ def next_one_of?(*bytes)
222
+ bytes.include?(next_byte)
223
+ end
224
+
225
+ def read(chunk_size = chunk_size)
226
+ # => initialize counters
227
+ @byte_count ||= Hash.new(0)
228
+ @byte_total ||= 0
229
+
230
+ return if eof?
231
+
232
+ bytes_before = @byte_total
233
+
234
+ input.read(chunk_size).each_byte { |byte|
235
+ @byte_count[byte] += 1
236
+ @byte_total += 1
237
+
238
+ @first_byte ||= byte
239
+ }
240
+
241
+ @byte_total > bytes_before
242
+ end
243
+
244
+ def byte_count_sum(*bytes)
245
+ bytes = *bytes # treat arrays/ranges and lists alike
246
+ bytes.inject(0) { |sum, n| sum + byte_count[n] }
247
+ end
248
+
249
+ def relative_byte_count(count)
250
+ count.to_f / byte_total
251
+ end
252
+
253
+ ### Definition of guessing heuristics. Order matters!
254
+
255
+ # ASCII, if all bytes are within the lower 128 bytes
256
+ # (Unfortunately, we have to read the *whole* file to make that decision)
257
+ encoding ASCII do
258
+ eof? && byte_count_sum(0x0..0x7f) == byte_total
259
+ end
260
+
261
+ # UTF-16, if lots of NULL bytes present
262
+ encodings UTF_16BE, UTF_16LE, UTF_16 do
263
+ if relative_byte_count(byte_count[0]) > 0.25
264
+ case first_byte
265
+ when 0x0: UTF_32
266
+ when 0xfe: UTF_16BE
267
+ when 0xff: UTF_16LE
268
+ else UTF_16
269
+ end
270
+ end
271
+ end
272
+
273
+ # UTF-8, if number of escape-bytes and following bytes
274
+ # is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
275
+ encoding UTF_8 do
276
+ esc_bytes = byte_count_sum(0xc0..0xdf) \
277
+ # => 110xxxxx 10xxxxxx
278
+ + byte_count_sum(0xe0..0xef) * 2 \
279
+ # => 1110xxxx 10xxxxxx 10xxxxxx
280
+ + byte_count_sum(0xf0..0xf7) * 3
281
+ # => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
282
+ fol_bytes = byte_count_sum(0x80..0xbf)
283
+ # => 10xxxxxx
284
+
285
+ esc_bytes > 0 && esc_bytes == fol_bytes
286
+ end
287
+
288
+ # Analyse statistical appearance of German umlauts and other accented
289
+ # letters (see TEST_CHARS)
290
+ encodings *TEST_ENCODINGS do
291
+ ratios = {}
292
+
293
+ TEST_ENCODINGS.find(lambda {
294
+ ratio, encoding = ratios.sort.last
295
+ encoding if ratio >= TEST_THRESHOLD_APPROX
296
+ }) { |encoding|
297
+ ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
298
+ #p [encoding, ratio]
299
+ ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
300
+ }
301
+ end
302
+
303
+ ### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
304
+
305
+ bom_encoding UTF_8 do
306
+ starts_with?(0xef, 0xbb, 0xbf)
307
+ end
308
+
309
+ bom_encoding UTF_16BE do
310
+ starts_with?(0xfe, 0xff)
311
+ end
312
+
313
+ bom_encoding UTF_16LE do
314
+ starts_with?(0xff, 0xfe)
315
+ end
316
+
317
+ bom_encoding UTF_32BE do
318
+ starts_with?(0x00, 0x00, 0xfe, 0xff)
319
+ end
320
+
321
+ bom_encoding UTF_32LE do
322
+ starts_with?(0xff, 0xfe, 0x00, 0x00)
323
+ end
324
+
325
+ bom_encoding SCSU do
326
+ starts_with?(0x0e, 0xfe, 0xff)
327
+ end
328
+
329
+ bom_encoding UTF_7 do
330
+ starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
331
+ end
332
+
333
+ bom_encoding UTF_EBCDIC do
334
+ starts_with?(0xdd, 0x73, 0x66, 0x73)
335
+ end
336
+
337
+ bom_encoding BOCU_1 do
338
+ starts_with?(0xfb, 0xee, 0x28)
339
+ end
340
+
341
+ end
@@ -0,0 +1,61 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007-2008 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # Contributors: #
14
+ # John Vorhauer <john@vorhauer.de> (idea and original implementation #
15
+ # for automatic encoding detection) #
16
+ # #
17
+ # cmess is free software; you can redistribute it and/or modify it under the #
18
+ # terms of the GNU General Public License as published by the Free Software #
19
+ # Foundation; either version 3 of the License, or (at your option) any later #
20
+ # version. #
21
+ # #
22
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
23
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
24
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
25
+ # details. #
26
+ # #
27
+ # You should have received a copy of the GNU General Public License along #
28
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
29
+ # #
30
+ ###############################################################################
31
+ #++
32
+
33
+ # Namespace for our encodings.
34
+
35
+ module CMess::GuessEncoding::Encoding
36
+
37
+ extend self
38
+
39
+ def const_name_for(encoding)
40
+ encoding.tr('-', '_').gsub(/\W/, '').upcase
41
+ end
42
+
43
+ def set_encoding_const(encoding, const = const_name_for(encoding))
44
+ const_set(const, encoding.freeze)
45
+ end
46
+
47
+ def get_or_set_encoding_const(encoding)
48
+ const_defined?(const = const_name_for(encoding)) ?
49
+ const_get(const) : set_encoding_const(encoding, const)
50
+ end
51
+
52
+ %w[
53
+ UNKNOWN ASCII MACINTOSH
54
+ ISO-8859-1 ISO-8859-2 ISO-8859-15
55
+ CP1250 CP1251 CP1252 CP850 CP852 CP856
56
+ UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
57
+ UTF-7 UTF-EBCDIC SCSU BOCU-1
58
+ ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
59
+ ].each { |encoding| set_encoding_const(encoding) }
60
+
61
+ end
@@ -0,0 +1,105 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007-2008 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # Contributors: #
14
+ # John Vorhauer <john@vorhauer.de> (idea and original implementation #
15
+ # for automatic encoding detection) #
16
+ # #
17
+ # cmess is free software; you can redistribute it and/or modify it under the #
18
+ # terms of the GNU General Public License as published by the Free Software #
19
+ # Foundation; either version 3 of the License, or (at your option) any later #
20
+ # version. #
21
+ # #
22
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
23
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
24
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
25
+ # details. #
26
+ # #
27
+ # You should have received a copy of the GNU General Public License along #
28
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
29
+ # #
30
+ ###############################################################################
31
+ #++
32
+
33
+ require 'iconv'
34
+
35
+ # Outputs given string (or line), being encoded in target encoding, encoded in
36
+ # various test encodings, thus allowing one to identify the (seemingly) correct
37
+ # encoding by visually comparing the input string with its desired appearance.
38
+
39
+ module CMess::GuessEncoding::Manual
40
+
41
+ extend self
42
+
43
+ include CMess::GuessEncoding::Encoding
44
+
45
+ # default encodings to try
46
+ ENCODINGS = [
47
+ ISO_8859_1,
48
+ ISO_8859_2,
49
+ ISO_8859_15,
50
+ CP1250,
51
+ CP1251,
52
+ CP1252,
53
+ CP850,
54
+ CP852,
55
+ CP856,
56
+ UTF_8
57
+ ]
58
+
59
+ # likely candidates to suggest to the user
60
+ CANDIDATES = [
61
+ ANSI_X34,
62
+ EBCDIC_AT_DE,
63
+ EBCDIC_US,
64
+ EUC_JP,
65
+ KOI_8,
66
+ MACINTOSH,
67
+ MS_ANSI,
68
+ SHIFT_JIS,
69
+ UTF_7,
70
+ UTF_16,
71
+ UTF_16BE,
72
+ UTF_16LE,
73
+ UTF_32,
74
+ UTF_32BE,
75
+ UTF_32LE
76
+ ]
77
+
78
+ def display(input, target_encoding, encodings = nil, additional_encodings = [])
79
+ target = target_encoding
80
+
81
+ encodings = (encodings || ENCODINGS) + additional_encodings
82
+ encodings = encodings.reverse.uniq.reverse # uniq with additional encodings
83
+ # staying at the end
84
+ encodings = [target] + (encodings - [target]) # move target encoding to front
85
+
86
+ max_length = encodings.map { |encoding| encoding.length }.max
87
+
88
+ encodings.each { |encoding|
89
+ converted = begin
90
+ Iconv.conv(target, encoding, input)
91
+ rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
92
+ "ILLEGAL INPUT SEQUENCE: #{err}"
93
+ rescue Iconv::InvalidEncoding
94
+ if encoding == target
95
+ abort "Invalid encoding: #{encoding}"
96
+ else
97
+ "INVALID ENCODING!"
98
+ end
99
+ end
100
+
101
+ puts "%-#{max_length}s : %s" % [encoding, converted]
102
+ }
103
+ end
104
+
105
+ end
@@ -30,7 +30,7 @@ module CMess::Version
30
30
 
31
31
  MAJOR = 0
32
32
  MINOR = 0
33
- TINY = 8
33
+ TINY = 9
34
34
 
35
35
  class << self
36
36
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cmess
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8.274
4
+ version: 0.0.9.276
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Wille
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-08-14 00:00:00 +02:00
12
+ date: 2008-08-15 00:00:00 +02:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -51,6 +51,9 @@ files:
51
51
  - lib/cmess/cli.rb
52
52
  - lib/cmess/cinderella.rb
53
53
  - lib/cmess/decode_entities.rb
54
+ - lib/cmess/guess_encoding/manual.rb
55
+ - lib/cmess/guess_encoding/encoding.rb
56
+ - lib/cmess/guess_encoding/automatic.rb
54
57
  - bin/cinderella
55
58
  - bin/decode_entities
56
59
  - bin/guess_encoding