cmess 0.0.5.184 → 0.0.5.186

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -32,6 +32,7 @@
32
32
 
33
33
  $KCODE = 'u'
34
34
 
35
+ require 'yaml'
35
36
  require 'iconv'
36
37
  require 'forwardable'
37
38
 
@@ -46,11 +47,26 @@ require 'forwardable'
46
47
  module CMess::GuessEncoding
47
48
 
48
49
  # our version ;-)
49
- VERSION = '0.0.5'
50
+ VERSION = '0.0.6'
50
51
 
51
52
  # Namespace for our encodings.
52
53
  module Encoding
53
54
 
55
+ extend self
56
+
57
+ def const_name_for(encoding)
58
+ encoding.tr('-', '_').gsub(/\W/, '').upcase
59
+ end
60
+
61
+ def set_encoding_const(encoding, const = const_name_for(encoding))
62
+ const_set(const, encoding.freeze)
63
+ end
64
+
65
+ def get_or_set_encoding_const(encoding)
66
+ const_defined?(const = const_name_for(encoding)) ? const_get(const) :
67
+ set_encoding_const(encoding, const)
68
+ end
69
+
54
70
  %w[
55
71
  UNKNOWN ASCII MACINTOSH
56
72
  ISO-8859-1 ISO-8859-2 ISO-8859-15
@@ -58,10 +74,7 @@ module CMess::GuessEncoding
58
74
  UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
59
75
  UTF-7 UTF-EBCDIC SCSU BOCU-1
60
76
  ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
61
- ].each { |encoding|
62
- const = encoding.tr('-', '_').gsub(/\W/, '')
63
- const_set(const, encoding.freeze)
64
- }
77
+ ].each { |encoding| set_encoding_const(encoding) }
65
78
 
66
79
  end
67
80
 
@@ -150,7 +163,7 @@ module CMess::GuessEncoding
150
163
  # Creates a converter for desired encoding (from UTF-8)
151
164
  ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
152
165
 
153
- # Encodings to test statistically by TEST_CHARS
166
+ # Single-byte encodings to test statistically by TEST_CHARS
154
167
  TEST_ENCODINGS = [
155
168
  MACINTOSH,
156
169
  ISO_8859_1,
@@ -160,17 +173,35 @@ module CMess::GuessEncoding
160
173
  MS_ANSI
161
174
  ]
162
175
 
163
- # Certain chars to test for in TEST_ENCODINGS
164
- TEST_CHARS = 'ÁÀÂÄÃÇÉÈÊËÍÌÎÏÑÓÒÔÖÚÙÛÜÆáàâäãçéèêëíìîïñóòôöúùûüæ'.
165
- split(//).inject(Hash.new { |h, k| h[k] = [] }) { |hash, char|
166
- TEST_ENCODINGS.each { |encoding|
167
- hash[encoding] += ICONV_FOR[encoding].iconv(char).unpack('C')
168
- }
169
- hash
170
- }
176
+ # Certain (non-ASCII) chars to test for in TEST_ENCODINGS
177
+ CHARS_TO_TEST = (
178
+ '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
179
+ 'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
180
+ ).split(//)
181
+
182
+ # Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
183
+ TEST_CHARS = Hash.new { |hash, encoding|
184
+ encoding = Encoding.get_or_set_encoding_const(encoding)
185
+ encchars = CHARS_TO_TEST.map { |char|
186
+ begin
187
+ byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
188
+ rescue Iconv::IllegalSequence
189
+ end
190
+ }.compact
191
+
192
+ TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
193
+ hash[encoding] = encchars
194
+ }.update(YAML.load_file(
195
+ File.join(File.dirname(__FILE__), '..', '..', 'data', 'test_chars.yaml')
196
+ ))
197
+
198
+ # Relative count of TEST_CHARS must exceed this threshold to yield
199
+ # a direct match
200
+ TEST_THRESHOLD_DIRECT = 0.1
171
201
 
172
- # Relative count of TEST_CHARS must exceed this threshold to yield a match
173
- TEST_THRESHOLD = 0.0004
202
+ # Relative count of TEST_CHARS must exceed this threshold to yield
203
+ # an approximate match
204
+ TEST_THRESHOLD_APPROX = 0.0004
174
205
 
175
206
  @supported_encodings = []
176
207
  @encoding_guessers = []
@@ -199,7 +230,8 @@ module CMess::GuessEncoding
199
230
  def encodings(*encodings, &encoding_block)
200
231
  encodings.each { |encoding|
201
232
  @supported_encodings << encoding
202
- @encoding_guessers << encoding_block
233
+ @encoding_guessers << encoding_block \
234
+ unless @encoding_guessers.include?(encoding_block)
203
235
  }
204
236
  end
205
237
 
@@ -213,7 +245,8 @@ module CMess::GuessEncoding
213
245
  }
214
246
 
215
247
  @supported_boms << encoding
216
- @bom_guessers << encoding_block
248
+ @bom_guessers << encoding_block \
249
+ unless @bom_guessers.include?(encoding_block)
217
250
  end
218
251
 
219
252
  def supported_bom?(encoding)
@@ -322,10 +355,10 @@ module CMess::GuessEncoding
322
355
  encodings UTF_16BE, UTF_16LE, UTF_16 do
323
356
  if relative_byte_count(byte_count[0]) > 0.25
324
357
  case first_byte
325
- when 0: UTF_32
326
- when 254: UTF_16BE
327
- when 255: UTF_16LE
328
- else UTF_16
358
+ when 0x0: UTF_32
359
+ when 0xfe: UTF_16BE
360
+ when 0xff: UTF_16LE
361
+ else UTF_16
329
362
  end
330
363
  end
331
364
  end
@@ -348,9 +381,15 @@ module CMess::GuessEncoding
348
381
  # Analyse statistical appearance of German umlauts and other accented
349
382
  # letters (see TEST_CHARS)
350
383
  encodings *TEST_ENCODINGS do
351
- TEST_ENCODINGS.each { |encoding|
352
- break encoding if
353
- relative_byte_count(byte_count_sum(TEST_CHARS[encoding])) > TEST_THRESHOLD
384
+ ratios = {}
385
+
386
+ TEST_ENCODINGS.find(lambda {
387
+ ratio, encoding = ratios.sort.last
388
+ encoding if ratio >= TEST_THRESHOLD_APPROX
389
+ }) { |encoding|
390
+ ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
391
+ #p [encoding, ratio]
392
+ ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
354
393
  }
355
394
  end
356
395
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cmess
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5.184
4
+ version: 0.0.5.186
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jens Wille
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-01-23 00:00:00 +01:00
12
+ date: 2008-01-24 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -56,11 +56,18 @@ files:
56
56
  - README
57
57
  - ChangeLog
58
58
  - Rakefile
59
- - example/crop
60
- - example/empty6-slash.txt
61
- - example/pot
62
- - example/crop_repaired
63
- - example/empty6-slash_repaired.txt
59
+ - example/cinderella
60
+ - example/cinderella/crop
61
+ - example/cinderella/empty6-slash.txt
62
+ - example/cinderella/pot
63
+ - example/cinderella/crop_repaired
64
+ - example/cinderella/empty6-slash_repaired.txt
65
+ - example/guess_encoding
66
+ - example/guess_encoding/de.utf-8.txt
67
+ - example/guess_encoding/fr.utf-8.txt
68
+ - example/guess_encoding/check_results
69
+ - example/guess_encoding/en.utf-8.txt
70
+ - example/guess_encoding/it.utf-8.txt
64
71
  - data/csets
65
72
  - data/csets/iso_8859-1.yaml
66
73
  - data/csets/iso_8859-15.yaml
@@ -81,17 +88,18 @@ files:
81
88
  - data/csets/unicode/latin_1_supplement.yaml
82
89
  - data/csets/unicode/basic_latin.yaml
83
90
  - data/csets/unicode/cyrillic.yaml
91
+ - data/test_chars.yaml
84
92
  has_rdoc: true
85
93
  homepage: http://prometheus.rubyforge.org/cmess
86
94
  post_install_message:
87
95
  rdoc_options:
88
96
  - --line-numbers
89
97
  - --inline-source
98
+ - --all
90
99
  - --charset
91
100
  - UTF-8
92
101
  - --main
93
102
  - README
94
- - --all
95
103
  - --title
96
104
  - cmess Application documentation
97
105
  require_paths: