cmess 0.0.5.184 → 0.0.5.186
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +43 -17
- data/bin/guess_encoding +8 -8
- data/data/test_chars.yaml +14 -0
- data/example/{crop → cinderella/crop} +0 -0
- data/example/{crop_repaired → cinderella/crop_repaired} +0 -0
- data/example/{empty6-slash.txt → cinderella/empty6-slash.txt} +0 -0
- data/example/{empty6-slash_repaired.txt → cinderella/empty6-slash_repaired.txt} +0 -0
- data/example/{pot → cinderella/pot} +0 -0
- data/example/guess_encoding/check_results +60 -0
- data/example/guess_encoding/de.utf-8.txt +10030 -0
- data/example/guess_encoding/en.utf-8.txt +10030 -0
- data/example/guess_encoding/fr.utf-8.txt +10030 -0
- data/example/guess_encoding/it.utf-8.txt +10030 -0
- data/lib/cmess/guess_encoding.rb +64 -25
- metadata +16 -8
data/lib/cmess/guess_encoding.rb
CHANGED
@@ -32,6 +32,7 @@
|
|
32
32
|
|
33
33
|
$KCODE = 'u'
|
34
34
|
|
35
|
+
require 'yaml'
|
35
36
|
require 'iconv'
|
36
37
|
require 'forwardable'
|
37
38
|
|
@@ -46,11 +47,26 @@ require 'forwardable'
|
|
46
47
|
module CMess::GuessEncoding
|
47
48
|
|
48
49
|
# our version ;-)
|
49
|
-
VERSION = '0.0.
|
50
|
+
VERSION = '0.0.6'
|
50
51
|
|
51
52
|
# Namespace for our encodings.
|
52
53
|
module Encoding
|
53
54
|
|
55
|
+
extend self
|
56
|
+
|
57
|
+
def const_name_for(encoding)
|
58
|
+
encoding.tr('-', '_').gsub(/\W/, '').upcase
|
59
|
+
end
|
60
|
+
|
61
|
+
def set_encoding_const(encoding, const = const_name_for(encoding))
|
62
|
+
const_set(const, encoding.freeze)
|
63
|
+
end
|
64
|
+
|
65
|
+
def get_or_set_encoding_const(encoding)
|
66
|
+
const_defined?(const = const_name_for(encoding)) ? const_get(const) :
|
67
|
+
set_encoding_const(encoding, const)
|
68
|
+
end
|
69
|
+
|
54
70
|
%w[
|
55
71
|
UNKNOWN ASCII MACINTOSH
|
56
72
|
ISO-8859-1 ISO-8859-2 ISO-8859-15
|
@@ -58,10 +74,7 @@ module CMess::GuessEncoding
|
|
58
74
|
UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
|
59
75
|
UTF-7 UTF-EBCDIC SCSU BOCU-1
|
60
76
|
ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
|
61
|
-
].each { |encoding|
|
62
|
-
const = encoding.tr('-', '_').gsub(/\W/, '')
|
63
|
-
const_set(const, encoding.freeze)
|
64
|
-
}
|
77
|
+
].each { |encoding| set_encoding_const(encoding) }
|
65
78
|
|
66
79
|
end
|
67
80
|
|
@@ -150,7 +163,7 @@ module CMess::GuessEncoding
|
|
150
163
|
# Creates a converter for desired encoding (from UTF-8)
|
151
164
|
ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
|
152
165
|
|
153
|
-
#
|
166
|
+
# Single-byte encodings to test statistically by TEST_CHARS
|
154
167
|
TEST_ENCODINGS = [
|
155
168
|
MACINTOSH,
|
156
169
|
ISO_8859_1,
|
@@ -160,17 +173,35 @@ module CMess::GuessEncoding
|
|
160
173
|
MS_ANSI
|
161
174
|
]
|
162
175
|
|
163
|
-
# Certain chars to test for in TEST_ENCODINGS
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
176
|
+
# Certain (non-ASCII) chars to test for in TEST_ENCODINGS
|
177
|
+
CHARS_TO_TEST = (
|
178
|
+
'€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
|
179
|
+
'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
|
180
|
+
).split(//)
|
181
|
+
|
182
|
+
# Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
|
183
|
+
TEST_CHARS = Hash.new { |hash, encoding|
|
184
|
+
encoding = Encoding.get_or_set_encoding_const(encoding)
|
185
|
+
encchars = CHARS_TO_TEST.map { |char|
|
186
|
+
begin
|
187
|
+
byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
|
188
|
+
rescue Iconv::IllegalSequence
|
189
|
+
end
|
190
|
+
}.compact
|
191
|
+
|
192
|
+
TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
|
193
|
+
hash[encoding] = encchars
|
194
|
+
}.update(YAML.load_file(
|
195
|
+
File.join(File.dirname(__FILE__), '..', '..', 'data', 'test_chars.yaml')
|
196
|
+
))
|
197
|
+
|
198
|
+
# Relative count of TEST_CHARS must exceed this threshold to yield
|
199
|
+
# a direct match
|
200
|
+
TEST_THRESHOLD_DIRECT = 0.1
|
171
201
|
|
172
|
-
# Relative count of TEST_CHARS must exceed this threshold to yield
|
173
|
-
|
202
|
+
# Relative count of TEST_CHARS must exceed this threshold to yield
|
203
|
+
# an approximate match
|
204
|
+
TEST_THRESHOLD_APPROX = 0.0004
|
174
205
|
|
175
206
|
@supported_encodings = []
|
176
207
|
@encoding_guessers = []
|
@@ -199,7 +230,8 @@ module CMess::GuessEncoding
|
|
199
230
|
def encodings(*encodings, &encoding_block)
|
200
231
|
encodings.each { |encoding|
|
201
232
|
@supported_encodings << encoding
|
202
|
-
@encoding_guessers << encoding_block
|
233
|
+
@encoding_guessers << encoding_block \
|
234
|
+
unless @encoding_guessers.include?(encoding_block)
|
203
235
|
}
|
204
236
|
end
|
205
237
|
|
@@ -213,7 +245,8 @@ module CMess::GuessEncoding
|
|
213
245
|
}
|
214
246
|
|
215
247
|
@supported_boms << encoding
|
216
|
-
@bom_guessers << encoding_block
|
248
|
+
@bom_guessers << encoding_block \
|
249
|
+
unless @bom_guessers.include?(encoding_block)
|
217
250
|
end
|
218
251
|
|
219
252
|
def supported_bom?(encoding)
|
@@ -322,10 +355,10 @@ module CMess::GuessEncoding
|
|
322
355
|
encodings UTF_16BE, UTF_16LE, UTF_16 do
|
323
356
|
if relative_byte_count(byte_count[0]) > 0.25
|
324
357
|
case first_byte
|
325
|
-
when
|
326
|
-
when
|
327
|
-
when
|
328
|
-
else
|
358
|
+
when 0x0: UTF_32
|
359
|
+
when 0xfe: UTF_16BE
|
360
|
+
when 0xff: UTF_16LE
|
361
|
+
else UTF_16
|
329
362
|
end
|
330
363
|
end
|
331
364
|
end
|
@@ -348,9 +381,15 @@ module CMess::GuessEncoding
|
|
348
381
|
# Analyse statistical appearance of German umlauts and other accented
|
349
382
|
# letters (see TEST_CHARS)
|
350
383
|
encodings *TEST_ENCODINGS do
|
351
|
-
|
352
|
-
|
353
|
-
|
384
|
+
ratios = {}
|
385
|
+
|
386
|
+
TEST_ENCODINGS.find(lambda {
|
387
|
+
ratio, encoding = ratios.sort.last
|
388
|
+
encoding if ratio >= TEST_THRESHOLD_APPROX
|
389
|
+
}) { |encoding|
|
390
|
+
ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
|
391
|
+
#p [encoding, ratio]
|
392
|
+
ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
|
354
393
|
}
|
355
394
|
end
|
356
395
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cmess
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5.184
|
4
|
+
version: 0.0.5.186
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jens Wille
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-01-
|
12
|
+
date: 2008-01-24 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -56,11 +56,18 @@ files:
|
|
56
56
|
- README
|
57
57
|
- ChangeLog
|
58
58
|
- Rakefile
|
59
|
-
- example/
|
60
|
-
- example/
|
61
|
-
- example/
|
62
|
-
- example/
|
63
|
-
- example/
|
59
|
+
- example/cinderella
|
60
|
+
- example/cinderella/crop
|
61
|
+
- example/cinderella/empty6-slash.txt
|
62
|
+
- example/cinderella/pot
|
63
|
+
- example/cinderella/crop_repaired
|
64
|
+
- example/cinderella/empty6-slash_repaired.txt
|
65
|
+
- example/guess_encoding
|
66
|
+
- example/guess_encoding/de.utf-8.txt
|
67
|
+
- example/guess_encoding/fr.utf-8.txt
|
68
|
+
- example/guess_encoding/check_results
|
69
|
+
- example/guess_encoding/en.utf-8.txt
|
70
|
+
- example/guess_encoding/it.utf-8.txt
|
64
71
|
- data/csets
|
65
72
|
- data/csets/iso_8859-1.yaml
|
66
73
|
- data/csets/iso_8859-15.yaml
|
@@ -81,17 +88,18 @@ files:
|
|
81
88
|
- data/csets/unicode/latin_1_supplement.yaml
|
82
89
|
- data/csets/unicode/basic_latin.yaml
|
83
90
|
- data/csets/unicode/cyrillic.yaml
|
91
|
+
- data/test_chars.yaml
|
84
92
|
has_rdoc: true
|
85
93
|
homepage: http://prometheus.rubyforge.org/cmess
|
86
94
|
post_install_message:
|
87
95
|
rdoc_options:
|
88
96
|
- --line-numbers
|
89
97
|
- --inline-source
|
98
|
+
- --all
|
90
99
|
- --charset
|
91
100
|
- UTF-8
|
92
101
|
- --main
|
93
102
|
- README
|
94
|
-
- --all
|
95
103
|
- --title
|
96
104
|
- cmess Application documentation
|
97
105
|
require_paths:
|