cmess 0.0.5.184 → 0.0.5.186

Sign up to get free protection for your applications and to get access to all the features.
@@ -32,6 +32,7 @@
32
32
 
33
33
  $KCODE = 'u'
34
34
 
35
+ require 'yaml'
35
36
  require 'iconv'
36
37
  require 'forwardable'
37
38
 
@@ -46,11 +47,26 @@ require 'forwardable'
46
47
  module CMess::GuessEncoding
47
48
 
48
49
  # our version ;-)
49
- VERSION = '0.0.5'
50
+ VERSION = '0.0.6'
50
51
 
51
52
  # Namespace for our encodings.
52
53
  module Encoding
53
54
 
55
+ extend self
56
+
57
+ def const_name_for(encoding)
58
+ encoding.tr('-', '_').gsub(/\W/, '').upcase
59
+ end
60
+
61
+ def set_encoding_const(encoding, const = const_name_for(encoding))
62
+ const_set(const, encoding.freeze)
63
+ end
64
+
65
+ def get_or_set_encoding_const(encoding)
66
+ const_defined?(const = const_name_for(encoding)) ? const_get(const) :
67
+ set_encoding_const(encoding, const)
68
+ end
69
+
54
70
  %w[
55
71
  UNKNOWN ASCII MACINTOSH
56
72
  ISO-8859-1 ISO-8859-2 ISO-8859-15
@@ -58,10 +74,7 @@ module CMess::GuessEncoding
58
74
  UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
59
75
  UTF-7 UTF-EBCDIC SCSU BOCU-1
60
76
  ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
61
- ].each { |encoding|
62
- const = encoding.tr('-', '_').gsub(/\W/, '')
63
- const_set(const, encoding.freeze)
64
- }
77
+ ].each { |encoding| set_encoding_const(encoding) }
65
78
 
66
79
  end
67
80
 
@@ -150,7 +163,7 @@ module CMess::GuessEncoding
150
163
  # Creates a converter for desired encoding (from UTF-8)
151
164
  ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
152
165
 
153
- # Encodings to test statistically by TEST_CHARS
166
+ # Single-byte encodings to test statistically by TEST_CHARS
154
167
  TEST_ENCODINGS = [
155
168
  MACINTOSH,
156
169
  ISO_8859_1,
@@ -160,17 +173,35 @@ module CMess::GuessEncoding
160
173
  MS_ANSI
161
174
  ]
162
175
 
163
- # Certain chars to test for in TEST_ENCODINGS
164
- TEST_CHARS = 'ÁÀÂÄÃÇÉÈÊËÍÌÎÏÑÓÒÔÖÚÙÛÜÆáàâäãçéèêëíìîïñóòôöúùûüæ'.
165
- split(//).inject(Hash.new { |h, k| h[k] = [] }) { |hash, char|
166
- TEST_ENCODINGS.each { |encoding|
167
- hash[encoding] += ICONV_FOR[encoding].iconv(char).unpack('C')
168
- }
169
- hash
170
- }
176
+ # Certain (non-ASCII) chars to test for in TEST_ENCODINGS
177
+ CHARS_TO_TEST = (
178
+ '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
179
+ 'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
180
+ ).split(//)
181
+
182
+ # Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
183
+ TEST_CHARS = Hash.new { |hash, encoding|
184
+ encoding = Encoding.get_or_set_encoding_const(encoding)
185
+ encchars = CHARS_TO_TEST.map { |char|
186
+ begin
187
+ byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
188
+ rescue Iconv::IllegalSequence
189
+ end
190
+ }.compact
191
+
192
+ TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
193
+ hash[encoding] = encchars
194
+ }.update(YAML.load_file(
195
+ File.join(File.dirname(__FILE__), '..', '..', 'data', 'test_chars.yaml')
196
+ ))
197
+
198
+ # Relative count of TEST_CHARS must reach or exceed this threshold to yield
199
+ # a direct match
200
+ TEST_THRESHOLD_DIRECT = 0.1
171
201
 
172
- # Relative count of TEST_CHARS must exceed this threshold to yield a match
173
- TEST_THRESHOLD = 0.0004
202
+ # Relative count of TEST_CHARS must reach or exceed this threshold to yield
203
+ # an approximate match
204
+ TEST_THRESHOLD_APPROX = 0.0004
174
205
 
175
206
  @supported_encodings = []
176
207
  @encoding_guessers = []
@@ -199,7 +230,8 @@ module CMess::GuessEncoding
199
230
  def encodings(*encodings, &encoding_block)
200
231
  encodings.each { |encoding|
201
232
  @supported_encodings << encoding
202
- @encoding_guessers << encoding_block
233
+ @encoding_guessers << encoding_block \
234
+ unless @encoding_guessers.include?(encoding_block)
203
235
  }
204
236
  end
205
237
 
@@ -213,7 +245,8 @@ module CMess::GuessEncoding
213
245
  }
214
246
 
215
247
  @supported_boms << encoding
216
- @bom_guessers << encoding_block
248
+ @bom_guessers << encoding_block \
249
+ unless @bom_guessers.include?(encoding_block)
217
250
  end
218
251
 
219
252
  def supported_bom?(encoding)
@@ -322,10 +355,10 @@ module CMess::GuessEncoding
322
355
  encodings UTF_16BE, UTF_16LE, UTF_16 do
323
356
  if relative_byte_count(byte_count[0]) > 0.25
324
357
  case first_byte
325
- when 0: UTF_32
326
- when 254: UTF_16BE
327
- when 255: UTF_16LE
328
- else UTF_16
358
+ when 0x0: UTF_32
359
+ when 0xfe: UTF_16BE
360
+ when 0xff: UTF_16LE
361
+ else UTF_16
329
362
  end
330
363
  end
331
364
  end
@@ -348,9 +381,15 @@ module CMess::GuessEncoding
348
381
  # Analyse statistical appearance of German umlauts and other accented
349
382
  # letters (see TEST_CHARS)
350
383
  encodings *TEST_ENCODINGS do
351
- TEST_ENCODINGS.each { |encoding|
352
- break encoding if
353
- relative_byte_count(byte_count_sum(TEST_CHARS[encoding])) > TEST_THRESHOLD
384
+ ratios = {}
385
+
386
+ TEST_ENCODINGS.find(lambda {
387
+ ratio, encoding = ratios.sort.last
388
+ encoding if ratio >= TEST_THRESHOLD_APPROX
389
+ }) { |encoding|
390
+ ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
391
+ #p [encoding, ratio]
392
+ ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
354
393
  }
355
394
  end
356
395
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cmess
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5.184
4
+ version: 0.0.5.186
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Wille
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-01-23 00:00:00 +01:00
12
+ date: 2008-01-24 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -56,11 +56,18 @@ files:
56
56
  - README
57
57
  - ChangeLog
58
58
  - Rakefile
59
- - example/crop
60
- - example/empty6-slash.txt
61
- - example/pot
62
- - example/crop_repaired
63
- - example/empty6-slash_repaired.txt
59
+ - example/cinderella
60
+ - example/cinderella/crop
61
+ - example/cinderella/empty6-slash.txt
62
+ - example/cinderella/pot
63
+ - example/cinderella/crop_repaired
64
+ - example/cinderella/empty6-slash_repaired.txt
65
+ - example/guess_encoding
66
+ - example/guess_encoding/de.utf-8.txt
67
+ - example/guess_encoding/fr.utf-8.txt
68
+ - example/guess_encoding/check_results
69
+ - example/guess_encoding/en.utf-8.txt
70
+ - example/guess_encoding/it.utf-8.txt
64
71
  - data/csets
65
72
  - data/csets/iso_8859-1.yaml
66
73
  - data/csets/iso_8859-15.yaml
@@ -81,17 +88,18 @@ files:
81
88
  - data/csets/unicode/latin_1_supplement.yaml
82
89
  - data/csets/unicode/basic_latin.yaml
83
90
  - data/csets/unicode/cyrillic.yaml
91
+ - data/test_chars.yaml
84
92
  has_rdoc: true
85
93
  homepage: http://prometheus.rubyforge.org/cmess
86
94
  post_install_message:
87
95
  rdoc_options:
88
96
  - --line-numbers
89
97
  - --inline-source
98
+ - --all
90
99
  - --charset
91
100
  - UTF-8
92
101
  - --main
93
102
  - README
94
- - --all
95
103
  - --title
96
104
  - cmess Application documentation
97
105
  require_paths: