cmess 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,7 @@
5
5
  # #
6
6
  # A component of cmess, the encoding tool-box. #
7
7
  # #
8
- # Copyright (C) 2007-2010 University of Cologne, #
8
+ # Copyright (C) 2007-2011 University of Cologne, #
9
9
  # Albertus-Magnus-Platz, #
10
10
  # 50923 Cologne, Germany #
11
11
  # #
@@ -17,49 +17,50 @@
17
17
  # for automatic encoding detection) #
18
18
  # #
19
19
  # cmess is free software; you can redistribute it and/or modify it under the #
20
- # terms of the GNU General Public License as published by the Free Software #
21
- # Foundation; either version 3 of the License, or (at your option) any later #
22
- # version. #
20
+ # terms of the GNU Affero General Public License as published by the Free #
21
+ # Software Foundation; either version 3 of the License, or (at your option) #
22
+ # any later version. #
23
23
  # #
24
24
  # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
25
25
  # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
26
- # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
27
- # details. #
26
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
27
+ # more details. #
28
28
  # #
29
- # You should have received a copy of the GNU General Public License along #
30
- # with cmess. If not, see <http://www.gnu.org/licenses/>. #
29
+ # You should have received a copy of the GNU Affero General Public License #
30
+ # along with cmess. If not, see <http://www.gnu.org/licenses/>. #
31
31
  # #
32
32
  ###############################################################################
33
33
  #++
34
34
 
35
- $KCODE = 'u' unless RUBY_VERSION >= '1.9'
35
+ $KCODE = 'u' if RUBY_VERSION < '1.9'
36
+
37
+ require 'cmess/guess_encoding'
36
38
 
37
39
  require 'yaml'
38
- require 'iconv'
39
40
  require 'stringio'
40
41
  require 'forwardable'
41
42
 
42
43
  # Tries to detect the encoding of a given input by applying several
43
- # heuristics to determine the <b>most likely</b> candidate. If no heuristic
44
- # catches on, resorts to Encoding::UNKNOWN.
44
+ # heuristics to determine the <b>most likely</b> candidate. If no
45
+ # heuristic catches on, resorts to Encoding::UNKNOWN.
45
46
  #
46
47
  # If a BOM is found, it may determine the encoding directly.
48
+ #
49
+ # For supported encodings see EncodingGuessers and BOMGuessers.
47
50
 
48
- module CMess
49
- module GuessEncoding
50
- class Automatic
51
+ class CMess::GuessEncoding::Automatic
51
52
 
52
53
  extend Forwardable
53
54
 
54
55
  def_delegators self, :encoding_guessers, :supported_encoding?,
55
56
  :bom_guessers, :supported_bom?
56
57
 
57
- include Encoding
58
+ include CMess::GuessEncoding::Encoding
58
59
 
59
- # Creates a converter for desired encoding (from UTF-8)
60
+ # Creates a converter for desired encoding (from UTF-8).
60
61
  ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
61
62
 
62
- # Single-byte encodings to test statistically by TEST_CHARS
63
+ # Single-byte encodings to test statistically by TEST_CHARS.
63
64
  TEST_ENCODINGS = [
64
65
  MACINTOSH,
65
66
  ISO_8859_1,
@@ -82,15 +83,16 @@ module CMess
82
83
  MS_ANSI
83
84
  ]
84
85
 
85
- # Certain (non-ASCII) chars to test for in TEST_ENCODINGS
86
+ # Certain (non-ASCII) chars to test for in TEST_ENCODINGS.
86
87
  CHARS_TO_TEST = (
87
88
  '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
88
89
  'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
89
90
  ).split(//)
90
91
 
91
- # Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
92
+ # Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST.
92
93
  TEST_CHARS = Hash.new { |hash, encoding|
93
- encoding = get_or_set_encoding_const(encoding)
94
+ encoding = self[encoding]
95
+
94
96
  encchars = CHARS_TO_TEST.map { |char|
95
97
  begin
96
98
  byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
@@ -99,19 +101,21 @@ module CMess
99
101
  }.compact
100
102
 
101
103
  TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
104
+
102
105
  hash[encoding] = encchars
103
- }.update(YAML.load_file(
104
- File.join(File.dirname(__FILE__), *%w[.. .. .. data test_chars.yaml])
105
- ))
106
+ }.update(YAML.load_file(File.join(CMess::DATA_DIR, 'test_chars.yaml')))
106
107
 
107
108
  # Relative count of TEST_CHARS must exceed this threshold to yield
108
- # a direct match
109
+ # a direct match.
109
110
  TEST_THRESHOLD_DIRECT = 0.1
110
111
 
111
112
  # Relative count of TEST_CHARS must exceed this threshold to yield
112
- # an approximate match
113
+ # an approximate match.
113
114
  TEST_THRESHOLD_APPROX = 0.0004
114
115
 
116
+ # Pattern for method names in EncodingGuessers and BOMGuessers.
117
+ GUESS_METHOD_RE = %r{\A((?:bom_)?encoding)_\d+_(.+)\z}
118
+
115
119
  @supported_encodings = []
116
120
  @encoding_guessers = []
117
121
  @supported_boms = []
@@ -128,19 +132,12 @@ module CMess
128
132
 
129
133
  private
130
134
 
131
- def encoding(encoding, &condition_block)
132
- encoding_block = lambda {
133
- encoding if instance_eval(&condition_block)
134
- }
135
-
136
- encodings(encoding, &encoding_block)
137
- end
135
+ def encoding(*encodings, &block)
136
+ encodings.flatten.each { |encoding|
137
+ next if @supported_encodings.include?(encoding)
138
138
 
139
- def encodings(*encodings, &encoding_block)
140
- encodings.each { |encoding|
141
139
  @supported_encodings << encoding
142
- @encoding_guessers << encoding_block \
143
- unless @encoding_guessers.include?(encoding_block)
140
+ @encoding_guessers << block
144
141
  }
145
142
  end
146
143
 
@@ -148,14 +145,11 @@ module CMess
148
145
  supported_encodings.include?(encoding)
149
146
  end
150
147
 
151
- def bom_encoding(encoding, &condition_block)
152
- encoding_block = lambda {
153
- encoding if instance_eval(&condition_block)
154
- }
148
+ def bom_encoding(encoding, &block)
149
+ return if @supported_boms.include?(encoding)
155
150
 
156
151
  @supported_boms << encoding
157
- @bom_guessers << encoding_block \
158
- unless @bom_guessers.include?(encoding_block)
152
+ @bom_guessers << lambda { |*| encoding if instance_eval(&block) }
159
153
  end
160
154
 
161
155
  def supported_bom?(encoding)
@@ -168,11 +162,9 @@ module CMess
168
162
 
169
163
  def initialize(input, chunk_size = nil)
170
164
  @input = case input
171
- when IO # that's what we want
172
- input
173
- when String # convert it to an IO
174
- StringIO.new(input)
175
- else # um, what's that...?
165
+ when IO then input
166
+ when String then StringIO.new(input)
167
+ else
176
168
  raise ArgumentError, "don't know how to handle input of type #{input.class}"
177
169
  end
178
170
 
@@ -230,9 +222,7 @@ module CMess
230
222
  end
231
223
 
232
224
  def starts_with?(*bytes)
233
- bytes.all? { |byte|
234
- next_byte == byte
235
- }
225
+ bytes.all? { |byte| next_byte == byte }
236
226
  end
237
227
 
238
228
  def next_one_of?(*bytes)
@@ -240,7 +230,6 @@ module CMess
240
230
  end
241
231
 
242
232
  def read(chunk_size = chunk_size)
243
- # => initialize counters
244
233
  @byte_count ||= Hash.new(0)
245
234
  @byte_total ||= 0
246
235
 
@@ -258,103 +247,140 @@ module CMess
258
247
  @byte_total > bytes_before
259
248
  end
260
249
 
261
- def byte_count_sum(*bytes)
262
- bytes = *bytes # treat arrays/ranges and lists alike
263
- bytes.inject(0) { |sum, n| sum + byte_count[n] }
250
+ def byte_count_sum(bytes)
251
+ Array(bytes).inject(0) { |sum, n| sum + byte_count[n] }
264
252
  end
265
253
 
266
254
  def relative_byte_count(count)
267
255
  count.to_f / byte_total
268
256
  end
269
257
 
270
- ### Definition of guessing heuristics. Order matters!
258
+ # Definition of guessing heuristics. Order matters!
271
259
 
272
- # ASCII, if all bytes are within the lower 128 bytes
273
- # (Unfortunately, we have to read the *whole* file to make that decision)
274
- encoding ASCII do
275
- eof? && byte_count_sum(0x0..0x7f) == byte_total
276
- end
260
+ module EncodingGuessers
261
+
262
+ include CMess::GuessEncoding::Encoding
277
263
 
278
- # UTF-16, if lots of NULL bytes present
279
- encodings UTF_16BE, UTF_16LE, UTF_16 do
280
- if relative_byte_count(byte_count[0]) > 0.25
281
- case first_byte
282
- when 0x0 then UTF_32
283
- when 0xfe then UTF_16BE
284
- when 0xff then UTF_16LE
285
- else UTF_16
264
+ # ASCII[http://en.wikipedia.org/wiki/ASCII], if all bytes are
265
+ # within the lower 128 bytes. Unfortunately, we have to read
266
+ # the *whole* file to make that decision.
267
+ def encoding_01_ASCII
268
+ ASCII if eof? && byte_count_sum(0x00..0x7f) == byte_total
269
+ end
270
+
271
+ # UTF-16[http://en.wikipedia.org/wiki/UTF-16] /
272
+ # UTF-32[http://en.wikipedia.org/wiki/UTF-32], if lots of
273
+ # NULL[http://en.wikipedia.org/wiki/Null_character] bytes present.
274
+ def encoding_02_UTF_32_and_UTF_16BE_and_UTF_16LE_and_UTF_16
275
+ if relative_byte_count(byte_count[0]) > 0.25
276
+ case first_byte
277
+ when 0x00 then UTF_32
278
+ when 0xfe then UTF_16BE
279
+ when 0xff then UTF_16LE
280
+ else UTF_16
281
+ end
286
282
  end
287
283
  end
288
- end
289
284
 
290
- # UTF-8, if number of escape-bytes and following bytes
291
- # is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
292
- encoding UTF_8 do
293
- esc_bytes = byte_count_sum(0xc0..0xdf) +
294
- # => 110xxxxx 10xxxxxx
295
- byte_count_sum(0xe0..0xef) * 2 +
296
- # => 1110xxxx 10xxxxxx 10xxxxxx
297
- byte_count_sum(0xf0..0xf7) * 3
298
- # => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
299
- fol_bytes = byte_count_sum(0x80..0xbf)
300
- # => 10xxxxxx
301
-
302
- esc_bytes > 0 && esc_bytes == fol_bytes
303
- end
285
+ # UTF-8[http://en.wikipedia.org/wiki/UTF-8], if number of escape-bytes
286
+ # and following bytes is matching.
287
+ def encoding_03_UTF_8
288
+ esc_bytes = byte_count_sum(0xc0..0xdf) +
289
+ # => 110xxxxx 10xxxxxx
290
+ byte_count_sum(0xe0..0xef) * 2 +
291
+ # => 1110xxxx 10xxxxxx 10xxxxxx
292
+ byte_count_sum(0xf0..0xf7) * 3
293
+ # => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
294
+
295
+ UTF_8 if esc_bytes > 0 && esc_bytes == byte_count_sum(0x80..0xbf)
296
+ end
297
+
298
+ # TEST_ENCODINGS, if frequency of TEST_CHARS exceeds TEST_THRESHOLD_DIRECT
299
+ # (direct match) or TEST_THRESHOLD_APPROX (approximate match).
300
+ def encoding_04_TEST_ENCODINGS
301
+ ratios = {}
302
+
303
+ TEST_ENCODINGS.find(lambda {
304
+ ratio, encoding = ratios.sort.last
305
+ encoding if ratio >= TEST_THRESHOLD_APPROX
306
+ }) { |encoding|
307
+ ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
308
+ ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
309
+ }
310
+ end
304
311
 
305
- # Analyse statistical appearance of German umlauts and other accented
306
- # letters (see TEST_CHARS)
307
- encodings(*TEST_ENCODINGS) do
308
- ratios = {}
309
-
310
- TEST_ENCODINGS.find(lambda {
311
- ratio, encoding = ratios.sort.last
312
- encoding if ratio >= TEST_THRESHOLD_APPROX
313
- }) { |encoding|
314
- ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
315
- #p [encoding, ratio]
316
- ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
317
- }
318
312
  end
319
313
 
320
- ### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
314
+ # BOM[http://en.wikipedia.org/wiki/Byte_order_mark] detection.
321
315
 
322
- bom_encoding UTF_8 do
323
- starts_with?(0xef, 0xbb, 0xbf)
324
- end
316
+ module BOMGuessers
325
317
 
326
- bom_encoding UTF_16BE do
327
- starts_with?(0xfe, 0xff)
328
- end
318
+ # UTF-8[http://en.wikipedia.org/wiki/UTF-8]
319
+ def bom_encoding_01_UTF_8
320
+ starts_with?(0xef, 0xbb, 0xbf)
321
+ end
329
322
 
330
- bom_encoding UTF_16LE do
331
- starts_with?(0xff, 0xfe)
332
- end
323
+ # UTF-16[http://en.wikipedia.org/wiki/UTF-16] (Big Endian)
324
+ def bom_encoding_02_UTF_16BE
325
+ starts_with?(0xfe, 0xff)
326
+ end
333
327
 
334
- bom_encoding UTF_32BE do
335
- starts_with?(0x00, 0x00, 0xfe, 0xff)
336
- end
328
+ # UTF-16[http://en.wikipedia.org/wiki/UTF-16] (Little Endian)
329
+ def bom_encoding_03_UTF_16LE
330
+ starts_with?(0xff, 0xfe)
331
+ end
337
332
 
338
- bom_encoding UTF_32LE do
339
- starts_with?(0xff, 0xfe, 0x00, 0x00)
340
- end
333
+ # UTF-32[http://en.wikipedia.org/wiki/UTF-32] (Big Endian)
334
+ def bom_encoding_04_UTF_32BE
335
+ starts_with?(0x00, 0x00, 0xfe, 0xff)
336
+ end
341
337
 
342
- bom_encoding SCSU do
343
- starts_with?(0x0e, 0xfe, 0xff)
344
- end
338
+ # UTF-32[http://en.wikipedia.org/wiki/UTF-32] (Little Endian)
339
+ def bom_encoding_05_UTF_32LE
340
+ starts_with?(0xff, 0xfe, 0x00, 0x00)
341
+ end
345
342
 
346
- bom_encoding UTF_7 do
347
- starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
348
- end
343
+ # SCSU[http://en.wikipedia.org/wiki/Standard_Compression_Scheme_for_Unicode]
344
+ def bom_encoding_06_SCSU
345
+ starts_with?(0x0e, 0xfe, 0xff)
346
+ end
349
347
 
350
- bom_encoding UTF_EBCDIC do
351
- starts_with?(0xdd, 0x73, 0x66, 0x73)
352
- end
348
+ # UTF-7[http://en.wikipedia.org/wiki/UTF-7]
349
+ def bom_encoding_07_UTF_7
350
+ starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
351
+ end
353
352
 
354
- bom_encoding BOCU_1 do
355
- starts_with?(0xfb, 0xee, 0x28)
356
- end
353
+ # UTF-1[http://en.wikipedia.org/wiki/UTF-1]
354
+ def bom_encoding_08_UTF_1
355
+ starts_with?(0xf7, 0x64, 0x4c)
356
+ end
357
+
358
+ # UTF-EBCDIC[http://en.wikipedia.org/wiki/UTF-EBCDIC]
359
+ def bom_encoding_09_UTF_EBCDIC
360
+ starts_with?(0xdd, 0x73, 0x66, 0x73)
361
+ end
362
+
363
+ # BOCU-1[http://en.wikipedia.org/wiki/BOCU-1]
364
+ def bom_encoding_10_BOCU_1
365
+ starts_with?(0xfb, 0xee, 0x28)
366
+ end
357
367
 
368
+ # GB-18030[http://en.wikipedia.org/wiki/GB-18030]
369
+ def bom_encoding_11_GB_18030
370
+ starts_with?(0x84, 0x31, 0x95, 0x33)
358
371
  end
372
+
359
373
  end
374
+
375
+ [EncodingGuessers, BOMGuessers].each { |mod|
376
+ include mod
377
+
378
+ mod.instance_methods(false).sort.each { |method|
379
+ next unless method =~ GUESS_METHOD_RE
380
+ name, list = $1, $2.split('_and_')
381
+
382
+ send(name, *list.map { |encoding| const_get(encoding) }) { send(method) }
383
+ }
384
+ }
385
+
360
386
  end
@@ -3,7 +3,7 @@
3
3
  # #
4
4
  # A component of cmess, the encoding tool-box. #
5
5
  # #
6
- # Copyright (C) 2007-2010 University of Cologne, #
6
+ # Copyright (C) 2007-2011 University of Cologne, #
7
7
  # Albertus-Magnus-Platz, #
8
8
  # 50923 Cologne, Germany #
9
9
  # #
@@ -15,26 +15,26 @@
15
15
  # for automatic encoding detection) #
16
16
  # #
17
17
  # cmess is free software; you can redistribute it and/or modify it under the #
18
- # terms of the GNU General Public License as published by the Free Software #
19
- # Foundation; either version 3 of the License, or (at your option) any later #
20
- # version. #
18
+ # terms of the GNU Affero General Public License as published by the Free #
19
+ # Software Foundation; either version 3 of the License, or (at your option) #
20
+ # any later version. #
21
21
  # #
22
22
  # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
23
23
  # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
24
- # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
25
- # details. #
24
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
25
+ # more details. #
26
26
  # #
27
- # You should have received a copy of the GNU General Public License along #
28
- # with cmess. If not, see <http://www.gnu.org/licenses/>. #
27
+ # You should have received a copy of the GNU Affero General Public License #
28
+ # along with cmess. If not, see <http://www.gnu.org/licenses/>. #
29
29
  # #
30
30
  ###############################################################################
31
31
  #++
32
32
 
33
+ require 'cmess/guess_encoding'
34
+
33
35
  # Namespace for our encodings.
34
36
 
35
- module CMess
36
- module GuessEncoding
37
- module Encoding
37
+ module CMess::GuessEncoding::Encoding
38
38
 
39
39
  extend self
40
40
 
@@ -50,8 +50,8 @@ module CMess
50
50
  private
51
51
 
52
52
  def get_all_encodings
53
- %x{iconv -l}.split("\n").map { |e|
54
- get_or_set_encoding_const(e.sub(/\/*\z/, ''))
53
+ %x{iconv -l}.split($/).map { |encoding|
54
+ get_or_set_encoding_const(encoding.sub(%r{/*\z}, ''))
55
55
  }
56
56
  end
57
57
 
@@ -75,14 +75,12 @@ module CMess
75
75
  ISO-8859-11 ISO-8859-13 ISO-8859-14 ISO-8859-15 ISO-8859-16
76
76
  CP1250 CP1251 CP1252 CP850 CP852 CP856
77
77
  UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
78
- UTF-7 UTF-EBCDIC SCSU BOCU-1
78
+ UTF-7 UTF-1 UTF-EBCDIC SCSU BOCU-1 GB-18030
79
79
  ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
80
80
  ].each { |encoding| set_encoding_const(encoding) }
81
81
 
82
- def included(base)
83
- base.extend self
82
+ def self.included(base)
83
+ base.extend(self)
84
84
  end
85
85
 
86
- end
87
- end
88
86
  end