cmess 0.0.8.274 → 0.0.9.276

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ChangeLog CHANGED
@@ -1,5 +1,12 @@
1
1
  = Revision history for cmess
2
2
 
3
+ == 0.0.9 [2008-08-15]
4
+
5
+ * Reorganized file structure for guess_encoding
6
+ * Added shortcuts GuessEncoding.manual/.automatic
7
+ * GuessEncoding::Automatic now also takes a String
8
+ as input (will be converted to a StringIO)
9
+
3
10
  == 0.0.8 [2008-08-14]
4
11
 
5
12
  * Require 'cmess' inside libs, so the user doesn't have to
data/README CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  == VERSION
4
4
 
5
- This documentation refers to cmess version 0.0.8
5
+ This documentation refers to cmess version 0.0.9
6
6
 
7
7
 
8
8
  == DESCRIPTION
@@ -30,416 +30,32 @@
30
30
  ###############################################################################
31
31
  #++
32
32
 
33
- $KCODE = 'u'
34
-
35
- require 'yaml'
36
- require 'iconv'
37
- require 'forwardable'
38
-
39
33
  require 'cmess'
40
34
 
41
- # Outputs given string (or line), being encoded in target encoding, encoded in
42
- # various test encodings, thus allowing to identify the (seemingly) correct
43
- # encoding by visually comparing the input string with its desired appearance.
44
- #
45
- # In addition to that manual procedure, may be used to detect the encoding
46
- # automatically. Works actually pretty good -- for the supported encodings
47
- # (see Automatic for details).
35
+ # Allows guessing an input's encoding either manually or automatically.
36
+ # Works pretty well in practice -- for the supported encodings. See Manual
37
+ # and Automatic for details.
48
38
 
49
39
  module CMess::GuessEncoding
50
40
 
51
41
  # our version ;-)
52
- VERSION = '0.0.6'
53
-
54
- # Namespace for our encodings.
55
- module Encoding
56
-
57
- extend self
58
-
59
- def const_name_for(encoding)
60
- encoding.tr('-', '_').gsub(/\W/, '').upcase
61
- end
62
-
63
- def set_encoding_const(encoding, const = const_name_for(encoding))
64
- const_set(const, encoding.freeze)
65
- end
66
-
67
- def get_or_set_encoding_const(encoding)
68
- const_defined?(const = const_name_for(encoding)) ? const_get(const) :
69
- set_encoding_const(encoding, const)
70
- end
71
-
72
- %w[
73
- UNKNOWN ASCII MACINTOSH
74
- ISO-8859-1 ISO-8859-2 ISO-8859-15
75
- CP1250 CP1251 CP1252 CP850 CP852 CP856
76
- UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
77
- UTF-7 UTF-EBCDIC SCSU BOCU-1
78
- ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
79
- ].each { |encoding| set_encoding_const(encoding) }
80
-
81
- end
82
-
83
- module Manual
84
-
85
- extend self
86
-
87
- include Encoding
88
-
89
- # default encodings to try
90
- ENCODINGS = [
91
- ISO_8859_1,
92
- ISO_8859_2,
93
- ISO_8859_15,
94
- CP1250,
95
- CP1251,
96
- CP1252,
97
- CP850,
98
- CP852,
99
- CP856,
100
- UTF_8
101
- ]
102
-
103
- # likely candidates to suggest to the user
104
- CANDIDATES = [
105
- ANSI_X34,
106
- EBCDIC_AT_DE,
107
- EBCDIC_US,
108
- EUC_JP,
109
- KOI_8,
110
- MACINTOSH,
111
- MS_ANSI,
112
- SHIFT_JIS,
113
- UTF_7,
114
- UTF_16,
115
- UTF_16BE,
116
- UTF_16LE,
117
- UTF_32,
118
- UTF_32BE,
119
- UTF_32LE
120
- ]
121
-
122
- def display(input, target_encoding, encodings = nil, additional_encodings = [])
123
- target = target_encoding
124
-
125
- encodings = (encodings || ENCODINGS) + additional_encodings
126
- encodings = encodings.reverse.uniq.reverse # uniq with additional encodings
127
- # staying at the end
128
- encodings = [target] + (encodings - [target]) # move target encoding to front
129
-
130
- max_length = encodings.map { |encoding| encoding.length }.max
131
-
132
- encodings.each { |encoding|
133
- converted = begin
134
- Iconv.conv(target, encoding, input)
135
- rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
136
- "ILLEGAL INPUT SEQUENCE: #{err}"
137
- rescue Iconv::InvalidEncoding
138
- if encoding == target
139
- abort "Invalid encoding: #{encoding}"
140
- else
141
- "INVALID ENCODING!"
142
- end
143
- end
144
-
145
- puts "%-#{max_length}s : %s" % [encoding, converted]
146
- }
147
- end
148
-
149
- end
150
-
151
- # Tries to detect the encoding of a given input by applying several
152
- # heuristics to determine the <b>most likely</b> candidate. If no heuristic
153
- # catches on, resorts to Encoding::UNKNOWN.
154
- #
155
- # If a BOM is found, it may determine the encoding directly.
156
- class Automatic
157
-
158
- extend Forwardable
159
-
160
- def_delegators self, :encoding_guessers, :supported_encoding?,
161
- :bom_guessers, :supported_bom?
162
-
163
- include Encoding
164
-
165
- # Creates a converter for desired encoding (from UTF-8)
166
- ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
167
-
168
- # Single-byte encodings to test statistically by TEST_CHARS
169
- TEST_ENCODINGS = [
170
- MACINTOSH,
171
- ISO_8859_1,
172
- ISO_8859_15,
173
- CP1252,
174
- CP850,
175
- MS_ANSI
176
- ]
177
-
178
- # Certain (non-ASCII) chars to test for in TEST_ENCODINGS
179
- CHARS_TO_TEST = (
180
- '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
181
- 'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
182
- ).split(//)
183
-
184
- # Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
185
- TEST_CHARS = Hash.new { |hash, encoding|
186
- encoding = Encoding.get_or_set_encoding_const(encoding)
187
- encchars = CHARS_TO_TEST.map { |char|
188
- begin
189
- byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
190
- rescue Iconv::IllegalSequence
191
- end
192
- }.compact
193
-
194
- TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
195
- hash[encoding] = encchars
196
- }.update(YAML.load_file(
197
- File.join(File.dirname(__FILE__), '..', '..', 'data', 'test_chars.yaml')
198
- ))
199
-
200
- # Relative count of TEST_CHARS must exceed this threshold to yield
201
- # a direct match
202
- TEST_THRESHOLD_DIRECT = 0.1
203
-
204
- # Relative count of TEST_CHARS must exceed this threshold to yield
205
- # an approximate match
206
- TEST_THRESHOLD_APPROX = 0.0004
207
-
208
- @supported_encodings = []
209
- @encoding_guessers = []
210
- @supported_boms = []
211
- @bom_guessers = []
212
-
213
- class << self
214
-
215
- attr_reader :supported_encodings, :encoding_guessers,
216
- :supported_boms, :bom_guessers
217
-
218
- def guess(input, chunk_size = nil, ignore_bom = false)
219
- new(input, chunk_size).guess(ignore_bom)
220
- end
221
-
222
- private
223
-
224
- def encoding(encoding, &condition_block)
225
- encoding_block = lambda {
226
- encoding if instance_eval(&condition_block)
227
- }
228
-
229
- encodings(encoding, &encoding_block)
230
- end
231
-
232
- def encodings(*encodings, &encoding_block)
233
- encodings.each { |encoding|
234
- @supported_encodings << encoding
235
- @encoding_guessers << encoding_block \
236
- unless @encoding_guessers.include?(encoding_block)
237
- }
238
- end
239
-
240
- def supported_encoding?(encoding)
241
- supported_encodings.include?(encoding)
242
- end
243
-
244
- def bom_encoding(encoding, &condition_block)
245
- encoding_block = lambda {
246
- encoding if instance_eval(&condition_block)
247
- }
248
-
249
- @supported_boms << encoding
250
- @bom_guessers << encoding_block \
251
- unless @bom_guessers.include?(encoding_block)
252
- end
253
-
254
- def supported_bom?(encoding)
255
- supported_boms.include?(encoding)
256
- end
257
-
258
- end
259
-
260
- attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte
261
-
262
- def initialize(input, chunk_size = nil)
263
- @input = input
264
- @chunk_size = chunk_size
265
- end
266
-
267
- def guess(ignore_bom = false)
268
- return bom if bom && !ignore_bom
269
-
270
- while read
271
- encoding_guessers.each { |block|
272
- encoding = instance_eval(&block)
273
- return encoding if encoding && supported_encoding?(encoding)
274
- }
275
- end
42
+ VERSION = '0.0.7'
276
43
 
277
- # nothing suitable found :-(
278
- UNKNOWN
279
- end
280
-
281
- def bom
282
- @bom ||= check_bom
283
- end
284
-
285
- private
286
-
287
- def eof?
288
- input.eof?
289
- end
290
-
291
- def check_bom
292
- return if eof?
293
-
294
- # prevent "Illegal seek" error inside a pipe
295
- begin
296
- input.pos
297
- rescue Errno::ESPIPE
298
- return
299
- end
300
-
301
- bom_guessers.each { |block|
302
- encoding = instance_eval(&block)
303
- return encoding if encoding && supported_bom?(encoding)
304
-
305
- # read bytes don't build a BOM, so rewind...
306
- input.rewind
307
- }
308
-
309
- # nothing suitable found :-(
310
- nil
311
- end
312
-
313
- def next_byte
314
- input.read(1).unpack('C').first
315
- end
316
-
317
- def starts_with?(*bytes)
318
- bytes.all? { |byte|
319
- next_byte == byte
320
- }
321
- end
322
-
323
- def next_one_of?(*bytes)
324
- bytes.include?(next_byte)
325
- end
326
-
327
- def read(chunk_size = chunk_size)
328
- # => initialize counters
329
- @byte_count ||= Hash.new(0)
330
- @byte_total ||= 0
331
-
332
- return if eof?
333
-
334
- bytes_before = @byte_total
335
-
336
- input.read(chunk_size).each_byte { |byte|
337
- @byte_count[byte] += 1
338
- @byte_total += 1
339
-
340
- @first_byte ||= byte
341
- }
342
-
343
- @byte_total > bytes_before
344
- end
345
-
346
- def byte_count_sum(*bytes)
347
- bytes = *bytes # treat arrays/ranges and lists alike
348
- bytes.inject(0) { |sum, n| sum + byte_count[n] }
349
- end
350
-
351
- def relative_byte_count(count)
352
- count.to_f / byte_total
353
- end
354
-
355
- ### Definition of guessing heuristics. Order matters!
356
-
357
- # ASCII, if all bytes are within the lower 128 bytes
358
- # (Unfortunately, we have to read the *whole* file to make that decision)
359
- encoding ASCII do
360
- eof? && byte_count_sum(0x0..0x7f) == byte_total
361
- end
362
-
363
- # UTF-16, if lots of NULL bytes present
364
- encodings UTF_16BE, UTF_16LE, UTF_16 do
365
- if relative_byte_count(byte_count[0]) > 0.25
366
- case first_byte
367
- when 0x0: UTF_32
368
- when 0xfe: UTF_16BE
369
- when 0xff: UTF_16LE
370
- else UTF_16
371
- end
372
- end
373
- end
374
-
375
- # UTF-8, if number of escape-bytes and following bytes
376
- # is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
377
- encoding UTF_8 do
378
- esc_bytes = byte_count_sum(0xc0..0xdf) \
379
- # => 110xxxxx 10xxxxxx
380
- + byte_count_sum(0xe0..0xef) * 2 \
381
- # => 1110xxxx 10xxxxxx 10xxxxxx
382
- + byte_count_sum(0xf0..0xf7) * 3
383
- # => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
384
- fol_bytes = byte_count_sum(0x80..0xbf)
385
- # => 10xxxxxx
386
-
387
- esc_bytes > 0 && esc_bytes == fol_bytes
388
- end
389
-
390
- # Analyse statistical appearance of German umlauts and other accented
391
- # letters (see TEST_CHARS)
392
- encodings *TEST_ENCODINGS do
393
- ratios = {}
394
-
395
- TEST_ENCODINGS.find(lambda {
396
- ratio, encoding = ratios.sort.last
397
- encoding if ratio >= TEST_THRESHOLD_APPROX
398
- }) { |encoding|
399
- ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
400
- #p [encoding, ratio]
401
- ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
402
- }
403
- end
404
-
405
- ### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
406
-
407
- bom_encoding UTF_8 do
408
- starts_with?(0xef, 0xbb, 0xbf)
409
- end
410
-
411
- bom_encoding UTF_16BE do
412
- starts_with?(0xfe, 0xff)
413
- end
414
-
415
- bom_encoding UTF_16LE do
416
- starts_with?(0xff, 0xfe)
417
- end
418
-
419
- bom_encoding UTF_32BE do
420
- starts_with?(0x00, 0x00, 0xfe, 0xff)
421
- end
44
+ class << self
422
45
 
423
- bom_encoding UTF_32LE do
424
- starts_with?(0xff, 0xfe, 0x00, 0x00)
46
+ def manual(*args)
47
+ Manual.display(*args)
425
48
  end
426
49
 
427
- bom_encoding SCSU do
428
- starts_with?(0x0e, 0xfe, 0xff)
429
- end
430
-
431
- bom_encoding UTF_7 do
432
- starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
433
- end
434
-
435
- bom_encoding UTF_EBCDIC do
436
- starts_with?(0xdd, 0x73, 0x66, 0x73)
437
- end
438
-
439
- bom_encoding BOCU_1 do
440
- starts_with?(0xfb, 0xee, 0x28)
50
+ def automatic(*args)
51
+ Automatic.guess(*args)
441
52
  end
442
53
 
443
54
  end
444
55
 
445
56
  end
57
+
58
+ %w[encoding manual automatic].each { |lib|
59
+ lib = "cmess/guess_encoding/#{lib}"
60
+ require lib
61
+ }
@@ -0,0 +1,341 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007-2008 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # Contributors: #
14
+ # John Vorhauer <john@vorhauer.de> (idea and original implementation #
15
+ # for automatic encoding detection) #
16
+ # #
17
+ # cmess is free software; you can redistribute it and/or modify it under the #
18
+ # terms of the GNU General Public License as published by the Free Software #
19
+ # Foundation; either version 3 of the License, or (at your option) any later #
20
+ # version. #
21
+ # #
22
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
23
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
24
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
25
+ # details. #
26
+ # #
27
+ # You should have received a copy of the GNU General Public License along #
28
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
29
+ # #
30
+ ###############################################################################
31
+ #++
32
+
33
+ $KCODE = 'u'
34
+
35
+ require 'yaml'
36
+ require 'iconv'
37
+ require 'stringio'
38
+ require 'forwardable'
39
+
40
+ # Tries to detect the encoding of a given input by applying several
41
+ # heuristics to determine the <b>most likely</b> candidate. If no heuristic
42
+ # catches on, resorts to Encoding::UNKNOWN.
43
+ #
44
+ # If a BOM is found, it may determine the encoding directly.
45
+
46
+ class CMess::GuessEncoding::Automatic
47
+
48
+ extend Forwardable
49
+
50
+ def_delegators self, :encoding_guessers, :supported_encoding?,
51
+ :bom_guessers, :supported_bom?
52
+
53
+ include CMess::GuessEncoding::Encoding
54
+
55
+ # Creates a converter for desired encoding (from UTF-8)
56
+ ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
57
+
58
+ # Single-byte encodings to test statistically by TEST_CHARS
59
+ TEST_ENCODINGS = [
60
+ MACINTOSH,
61
+ ISO_8859_1,
62
+ ISO_8859_15,
63
+ CP1252,
64
+ CP850,
65
+ MS_ANSI
66
+ ]
67
+
68
+ # Certain (non-ASCII) chars to test for in TEST_ENCODINGS
69
+ CHARS_TO_TEST = (
70
+ '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
71
+ 'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
72
+ ).split(//)
73
+
74
+ # Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
75
+ TEST_CHARS = Hash.new { |hash, encoding|
76
+ encoding = Encoding.get_or_set_encoding_const(encoding)
77
+ encchars = CHARS_TO_TEST.map { |char|
78
+ begin
79
+ byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
80
+ rescue Iconv::IllegalSequence
81
+ end
82
+ }.compact
83
+
84
+ TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
85
+ hash[encoding] = encchars
86
+ }.update(YAML.load_file(
87
+ File.join(File.dirname(__FILE__), *%w[.. .. .. data test_chars.yaml])
88
+ ))
89
+
90
+ # Relative count of TEST_CHARS must exceed this threshold to yield
91
+ # a direct match
92
+ TEST_THRESHOLD_DIRECT = 0.1
93
+
94
+ # Relative count of TEST_CHARS must exceed this threshold to yield
95
+ # an approximate match
96
+ TEST_THRESHOLD_APPROX = 0.0004
97
+
98
+ @supported_encodings = []
99
+ @encoding_guessers = []
100
+ @supported_boms = []
101
+ @bom_guessers = []
102
+
103
+ class << self
104
+
105
+ attr_reader :supported_encodings, :encoding_guessers,
106
+ :supported_boms, :bom_guessers
107
+
108
+ def guess(input, chunk_size = nil, ignore_bom = false)
109
+ new(input, chunk_size).guess(ignore_bom)
110
+ end
111
+
112
+ private
113
+
114
+ def encoding(encoding, &condition_block)
115
+ encoding_block = lambda {
116
+ encoding if instance_eval(&condition_block)
117
+ }
118
+
119
+ encodings(encoding, &encoding_block)
120
+ end
121
+
122
+ def encodings(*encodings, &encoding_block)
123
+ encodings.each { |encoding|
124
+ @supported_encodings << encoding
125
+ @encoding_guessers << encoding_block \
126
+ unless @encoding_guessers.include?(encoding_block)
127
+ }
128
+ end
129
+
130
+ def supported_encoding?(encoding)
131
+ supported_encodings.include?(encoding)
132
+ end
133
+
134
+ def bom_encoding(encoding, &condition_block)
135
+ encoding_block = lambda {
136
+ encoding if instance_eval(&condition_block)
137
+ }
138
+
139
+ @supported_boms << encoding
140
+ @bom_guessers << encoding_block \
141
+ unless @bom_guessers.include?(encoding_block)
142
+ end
143
+
144
+ def supported_bom?(encoding)
145
+ supported_boms.include?(encoding)
146
+ end
147
+
148
+ end
149
+
150
+ attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte
151
+
152
+ def initialize(input, chunk_size = nil)
153
+ @input = case input
154
+ when IO # that's what we want
155
+ input
156
+ when String # convert it to an IO
157
+ StringIO.new(input)
158
+ else # um, what's that...?
159
+ raise ArgumentError, "don't know how to handle input of type #{input.class}"
160
+ end
161
+
162
+ @chunk_size = chunk_size
163
+ end
164
+
165
+ def guess(ignore_bom = false)
166
+ return bom if bom && !ignore_bom
167
+
168
+ while read
169
+ encoding_guessers.each { |block|
170
+ encoding = instance_eval(&block)
171
+ return encoding if encoding && supported_encoding?(encoding)
172
+ }
173
+ end
174
+
175
+ # nothing suitable found :-(
176
+ UNKNOWN
177
+ end
178
+
179
+ def bom
180
+ @bom ||= check_bom
181
+ end
182
+
183
+ private
184
+
185
+ def eof?
186
+ input.eof?
187
+ end
188
+
189
+ def check_bom
190
+ return if eof?
191
+
192
+ # prevent "Illegal seek" error inside a pipe
193
+ begin
194
+ input.pos
195
+ rescue Errno::ESPIPE
196
+ return
197
+ end
198
+
199
+ bom_guessers.each { |block|
200
+ encoding = instance_eval(&block)
201
+ return encoding if encoding && supported_bom?(encoding)
202
+
203
+ # the bytes read so far don't form a BOM, so rewind...
204
+ input.rewind
205
+ }
206
+
207
+ # nothing suitable found :-(
208
+ nil
209
+ end
210
+
211
+ def next_byte
212
+ input.read(1).unpack('C').first
213
+ end
214
+
215
+ def starts_with?(*bytes)
216
+ bytes.all? { |byte|
217
+ next_byte == byte
218
+ }
219
+ end
220
+
221
+ def next_one_of?(*bytes)
222
+ bytes.include?(next_byte)
223
+ end
224
+
225
+ def read(chunk_size = chunk_size)
226
+ # => initialize counters
227
+ @byte_count ||= Hash.new(0)
228
+ @byte_total ||= 0
229
+
230
+ return if eof?
231
+
232
+ bytes_before = @byte_total
233
+
234
+ input.read(chunk_size).each_byte { |byte|
235
+ @byte_count[byte] += 1
236
+ @byte_total += 1
237
+
238
+ @first_byte ||= byte
239
+ }
240
+
241
+ @byte_total > bytes_before
242
+ end
243
+
244
+ def byte_count_sum(*bytes)
245
+ bytes = *bytes # treat arrays/ranges and lists alike
246
+ bytes.inject(0) { |sum, n| sum + byte_count[n] }
247
+ end
248
+
249
+ def relative_byte_count(count)
250
+ count.to_f / byte_total
251
+ end
252
+
253
+ ### Definition of guessing heuristics. Order matters!
254
+
255
+ # ASCII, if all bytes are within the lower 128 bytes
256
+ # (Unfortunately, we have to read the *whole* file to make that decision)
257
+ encoding ASCII do
258
+ eof? && byte_count_sum(0x0..0x7f) == byte_total
259
+ end
260
+
261
+ # UTF-16, if lots of NULL bytes present
262
+ encodings UTF_16BE, UTF_16LE, UTF_16 do
263
+ if relative_byte_count(byte_count[0]) > 0.25
264
+ case first_byte
265
+ when 0x0: UTF_32
266
+ when 0xfe: UTF_16BE
267
+ when 0xff: UTF_16LE
268
+ else UTF_16
269
+ end
270
+ end
271
+ end
272
+
273
+ # UTF-8, if number of escape-bytes and following bytes
274
+ # is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
275
+ encoding UTF_8 do
276
+ esc_bytes = byte_count_sum(0xc0..0xdf) \
277
+ # => 110xxxxx 10xxxxxx
278
+ + byte_count_sum(0xe0..0xef) * 2 \
279
+ # => 1110xxxx 10xxxxxx 10xxxxxx
280
+ + byte_count_sum(0xf0..0xf7) * 3
281
+ # => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
282
+ fol_bytes = byte_count_sum(0x80..0xbf)
283
+ # => 10xxxxxx
284
+
285
+ esc_bytes > 0 && esc_bytes == fol_bytes
286
+ end
287
+
288
+ # Analyse statistical appearance of German umlauts and other accented
289
+ # letters (see TEST_CHARS)
290
+ encodings *TEST_ENCODINGS do
291
+ ratios = {}
292
+
293
+ TEST_ENCODINGS.find(lambda {
294
+ ratio, encoding = ratios.sort.last
295
+ encoding if ratio >= TEST_THRESHOLD_APPROX
296
+ }) { |encoding|
297
+ ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
298
+ #p [encoding, ratio]
299
+ ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
300
+ }
301
+ end
302
+
303
+ ### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
304
+
305
+ bom_encoding UTF_8 do
306
+ starts_with?(0xef, 0xbb, 0xbf)
307
+ end
308
+
309
+ bom_encoding UTF_16BE do
310
+ starts_with?(0xfe, 0xff)
311
+ end
312
+
313
+ bom_encoding UTF_16LE do
314
+ starts_with?(0xff, 0xfe)
315
+ end
316
+
317
+ bom_encoding UTF_32BE do
318
+ starts_with?(0x00, 0x00, 0xfe, 0xff)
319
+ end
320
+
321
+ bom_encoding UTF_32LE do
322
+ starts_with?(0xff, 0xfe, 0x00, 0x00)
323
+ end
324
+
325
+ bom_encoding SCSU do
326
+ starts_with?(0x0e, 0xfe, 0xff)
327
+ end
328
+
329
+ bom_encoding UTF_7 do
330
+ starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
331
+ end
332
+
333
+ bom_encoding UTF_EBCDIC do
334
+ starts_with?(0xdd, 0x73, 0x66, 0x73)
335
+ end
336
+
337
+ bom_encoding BOCU_1 do
338
+ starts_with?(0xfb, 0xee, 0x28)
339
+ end
340
+
341
+ end
@@ -0,0 +1,61 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007-2008 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # Contributors: #
14
+ # John Vorhauer <john@vorhauer.de> (idea and original implementation #
15
+ # for automatic encoding detection) #
16
+ # #
17
+ # cmess is free software; you can redistribute it and/or modify it under the #
18
+ # terms of the GNU General Public License as published by the Free Software #
19
+ # Foundation; either version 3 of the License, or (at your option) any later #
20
+ # version. #
21
+ # #
22
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
23
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
24
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
25
+ # details. #
26
+ # #
27
+ # You should have received a copy of the GNU General Public License along #
28
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
29
+ # #
30
+ ###############################################################################
31
+ #++
32
+
33
+ # Namespace for our encodings.
34
+
35
+ module CMess::GuessEncoding::Encoding
36
+
37
+ extend self
38
+
39
+ def const_name_for(encoding)
40
+ encoding.tr('-', '_').gsub(/\W/, '').upcase
41
+ end
42
+
43
+ def set_encoding_const(encoding, const = const_name_for(encoding))
44
+ const_set(const, encoding.freeze)
45
+ end
46
+
47
+ def get_or_set_encoding_const(encoding)
48
+ const_defined?(const = const_name_for(encoding)) ?
49
+ const_get(const) : set_encoding_const(encoding, const)
50
+ end
51
+
52
+ %w[
53
+ UNKNOWN ASCII MACINTOSH
54
+ ISO-8859-1 ISO-8859-2 ISO-8859-15
55
+ CP1250 CP1251 CP1252 CP850 CP852 CP856
56
+ UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
57
+ UTF-7 UTF-EBCDIC SCSU BOCU-1
58
+ ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
59
+ ].each { |encoding| set_encoding_const(encoding) }
60
+
61
+ end
@@ -0,0 +1,105 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007-2008 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # Contributors: #
14
+ # John Vorhauer <john@vorhauer.de> (idea and original implementation #
15
+ # for automatic encoding detection) #
16
+ # #
17
+ # cmess is free software; you can redistribute it and/or modify it under the #
18
+ # terms of the GNU General Public License as published by the Free Software #
19
+ # Foundation; either version 3 of the License, or (at your option) any later #
20
+ # version. #
21
+ # #
22
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
23
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
24
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
25
+ # details. #
26
+ # #
27
+ # You should have received a copy of the GNU General Public License along #
28
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
29
+ # #
30
+ ###############################################################################
31
+ #++
32
+
33
+ require 'iconv'
34
+
35
+ # Outputs the given string (or line), converted to the target encoding from
36
+ # each of various test encodings, thus making it possible to identify the
37
+ # (seemingly) correct encoding by visually comparing each result with the
+ # string's desired appearance.
38
+
39
+ module CMess::GuessEncoding::Manual
40
+
41
+ extend self
42
+
43
+ include CMess::GuessEncoding::Encoding
44
+
45
+ # default encodings to try
46
+ ENCODINGS = [
47
+ ISO_8859_1,
48
+ ISO_8859_2,
49
+ ISO_8859_15,
50
+ CP1250,
51
+ CP1251,
52
+ CP1252,
53
+ CP850,
54
+ CP852,
55
+ CP856,
56
+ UTF_8
57
+ ]
58
+
59
+ # likely candidates to suggest to the user
60
+ CANDIDATES = [
61
+ ANSI_X34,
62
+ EBCDIC_AT_DE,
63
+ EBCDIC_US,
64
+ EUC_JP,
65
+ KOI_8,
66
+ MACINTOSH,
67
+ MS_ANSI,
68
+ SHIFT_JIS,
69
+ UTF_7,
70
+ UTF_16,
71
+ UTF_16BE,
72
+ UTF_16LE,
73
+ UTF_32,
74
+ UTF_32BE,
75
+ UTF_32LE
76
+ ]
77
+
78
+ def display(input, target_encoding, encodings = nil, additional_encodings = [])
79
+ target = target_encoding
80
+
81
+ encodings = (encodings || ENCODINGS) + additional_encodings
82
+ encodings = encodings.reverse.uniq.reverse # uniq with additional encodings
83
+ # staying at the end
84
+ encodings = [target] + (encodings - [target]) # move target encoding to front
85
+
86
+ max_length = encodings.map { |encoding| encoding.length }.max
87
+
88
+ encodings.each { |encoding|
89
+ converted = begin
90
+ Iconv.conv(target, encoding, input)
91
+ rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
92
+ "ILLEGAL INPUT SEQUENCE: #{err}"
93
+ rescue Iconv::InvalidEncoding
94
+ if encoding == target
95
+ abort "Invalid encoding: #{encoding}"
96
+ else
97
+ "INVALID ENCODING!"
98
+ end
99
+ end
100
+
101
+ puts "%-#{max_length}s : %s" % [encoding, converted]
102
+ }
103
+ end
104
+
105
+ end
@@ -30,7 +30,7 @@ module CMess::Version
30
30
 
31
31
  MAJOR = 0
32
32
  MINOR = 0
33
- TINY = 8
33
+ TINY = 9
34
34
 
35
35
  class << self
36
36
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cmess
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8.274
4
+ version: 0.0.9.276
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Wille
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-08-14 00:00:00 +02:00
12
+ date: 2008-08-15 00:00:00 +02:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -51,6 +51,9 @@ files:
51
51
  - lib/cmess/cli.rb
52
52
  - lib/cmess/cinderella.rb
53
53
  - lib/cmess/decode_entities.rb
54
+ - lib/cmess/guess_encoding/manual.rb
55
+ - lib/cmess/guess_encoding/encoding.rb
56
+ - lib/cmess/guess_encoding/automatic.rb
54
57
  - bin/cinderella
55
58
  - bin/decode_entities
56
59
  - bin/guess_encoding