cmess 0.0.5.184 → 0.0.5.186
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +43 -17
- data/bin/guess_encoding +8 -8
- data/data/test_chars.yaml +14 -0
- data/example/{crop → cinderella/crop} +0 -0
- data/example/{crop_repaired → cinderella/crop_repaired} +0 -0
- data/example/{empty6-slash.txt → cinderella/empty6-slash.txt} +0 -0
- data/example/{empty6-slash_repaired.txt → cinderella/empty6-slash_repaired.txt} +0 -0
- data/example/{pot → cinderella/pot} +0 -0
- data/example/guess_encoding/check_results +60 -0
- data/example/guess_encoding/de.utf-8.txt +10030 -0
- data/example/guess_encoding/en.utf-8.txt +10030 -0
- data/example/guess_encoding/fr.utf-8.txt +10030 -0
- data/example/guess_encoding/it.utf-8.txt +10030 -0
- data/lib/cmess/guess_encoding.rb +64 -25
- metadata +16 -8
data/lib/cmess/guess_encoding.rb
CHANGED
@@ -32,6 +32,7 @@
|
|
32
32
|
|
33
33
|
$KCODE = 'u'
|
34
34
|
|
35
|
+
require 'yaml'
|
35
36
|
require 'iconv'
|
36
37
|
require 'forwardable'
|
37
38
|
|
@@ -46,11 +47,26 @@ require 'forwardable'
|
|
46
47
|
module CMess::GuessEncoding
|
47
48
|
|
48
49
|
# our version ;-)
|
49
|
-
VERSION = '0.0.
|
50
|
+
VERSION = '0.0.6'
|
50
51
|
|
51
52
|
# Namespace for our encodings.
|
52
53
|
module Encoding
|
53
54
|
|
55
|
+
extend self
|
56
|
+
|
57
|
+
def const_name_for(encoding)
|
58
|
+
encoding.tr('-', '_').gsub(/\W/, '').upcase
|
59
|
+
end
|
60
|
+
|
61
|
+
def set_encoding_const(encoding, const = const_name_for(encoding))
|
62
|
+
const_set(const, encoding.freeze)
|
63
|
+
end
|
64
|
+
|
65
|
+
def get_or_set_encoding_const(encoding)
|
66
|
+
const_defined?(const = const_name_for(encoding)) ? const_get(const) :
|
67
|
+
set_encoding_const(encoding, const)
|
68
|
+
end
|
69
|
+
|
54
70
|
%w[
|
55
71
|
UNKNOWN ASCII MACINTOSH
|
56
72
|
ISO-8859-1 ISO-8859-2 ISO-8859-15
|
@@ -58,10 +74,7 @@ module CMess::GuessEncoding
|
|
58
74
|
UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
|
59
75
|
UTF-7 UTF-EBCDIC SCSU BOCU-1
|
60
76
|
ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
|
61
|
-
].each { |encoding|
|
62
|
-
const = encoding.tr('-', '_').gsub(/\W/, '')
|
63
|
-
const_set(const, encoding.freeze)
|
64
|
-
}
|
77
|
+
].each { |encoding| set_encoding_const(encoding) }
|
65
78
|
|
66
79
|
end
|
67
80
|
|
@@ -150,7 +163,7 @@ module CMess::GuessEncoding
|
|
150
163
|
# Creates a converter for desired encoding (from UTF-8)
|
151
164
|
ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
|
152
165
|
|
153
|
-
#
|
166
|
+
# Single-byte encodings to test statistically by TEST_CHARS
|
154
167
|
TEST_ENCODINGS = [
|
155
168
|
MACINTOSH,
|
156
169
|
ISO_8859_1,
|
@@ -160,17 +173,35 @@ module CMess::GuessEncoding
|
|
160
173
|
MS_ANSI
|
161
174
|
]
|
162
175
|
|
163
|
-
# Certain chars to test for in TEST_ENCODINGS
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
176
|
+
# Certain (non-ASCII) chars to test for in TEST_ENCODINGS
|
177
|
+
CHARS_TO_TEST = (
|
178
|
+
'€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
|
179
|
+
'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
|
180
|
+
).split(//)
|
181
|
+
|
182
|
+
# Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
|
183
|
+
TEST_CHARS = Hash.new { |hash, encoding|
|
184
|
+
encoding = Encoding.get_or_set_encoding_const(encoding)
|
185
|
+
encchars = CHARS_TO_TEST.map { |char|
|
186
|
+
begin
|
187
|
+
byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
|
188
|
+
rescue Iconv::IllegalSequence
|
189
|
+
end
|
190
|
+
}.compact
|
191
|
+
|
192
|
+
TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
|
193
|
+
hash[encoding] = encchars
|
194
|
+
}.update(YAML.load_file(
|
195
|
+
File.join(File.dirname(__FILE__), '..', '..', 'data', 'test_chars.yaml')
|
196
|
+
))
|
197
|
+
|
198
|
+
# Relative count of TEST_CHARS must exceed this threshold to yield
|
199
|
+
# a direct match
|
200
|
+
TEST_THRESHOLD_DIRECT = 0.1
|
171
201
|
|
172
|
-
# Relative count of TEST_CHARS must exceed this threshold to yield
|
173
|
-
|
202
|
+
# Relative count of TEST_CHARS must exceed this threshold to yield
|
203
|
+
# an approximate match
|
204
|
+
TEST_THRESHOLD_APPROX = 0.0004
|
174
205
|
|
175
206
|
@supported_encodings = []
|
176
207
|
@encoding_guessers = []
|
@@ -199,7 +230,8 @@ module CMess::GuessEncoding
|
|
199
230
|
def encodings(*encodings, &encoding_block)
|
200
231
|
encodings.each { |encoding|
|
201
232
|
@supported_encodings << encoding
|
202
|
-
@encoding_guessers << encoding_block
|
233
|
+
@encoding_guessers << encoding_block \
|
234
|
+
unless @encoding_guessers.include?(encoding_block)
|
203
235
|
}
|
204
236
|
end
|
205
237
|
|
@@ -213,7 +245,8 @@ module CMess::GuessEncoding
|
|
213
245
|
}
|
214
246
|
|
215
247
|
@supported_boms << encoding
|
216
|
-
@bom_guessers << encoding_block
|
248
|
+
@bom_guessers << encoding_block \
|
249
|
+
unless @bom_guessers.include?(encoding_block)
|
217
250
|
end
|
218
251
|
|
219
252
|
def supported_bom?(encoding)
|
@@ -322,10 +355,10 @@ module CMess::GuessEncoding
|
|
322
355
|
encodings UTF_16BE, UTF_16LE, UTF_16 do
|
323
356
|
if relative_byte_count(byte_count[0]) > 0.25
|
324
357
|
case first_byte
|
325
|
-
when
|
326
|
-
when
|
327
|
-
when
|
328
|
-
else
|
358
|
+
when 0x0: UTF_32
|
359
|
+
when 0xfe: UTF_16BE
|
360
|
+
when 0xff: UTF_16LE
|
361
|
+
else UTF_16
|
329
362
|
end
|
330
363
|
end
|
331
364
|
end
|
@@ -348,9 +381,15 @@ module CMess::GuessEncoding
|
|
348
381
|
# Analyse statistical appearance of German umlauts and other accented
|
349
382
|
# letters (see TEST_CHARS)
|
350
383
|
encodings *TEST_ENCODINGS do
|
351
|
-
|
352
|
-
|
353
|
-
|
384
|
+
ratios = {}
|
385
|
+
|
386
|
+
TEST_ENCODINGS.find(lambda {
|
387
|
+
ratio, encoding = ratios.sort.last
|
388
|
+
encoding if ratio >= TEST_THRESHOLD_APPROX
|
389
|
+
}) { |encoding|
|
390
|
+
ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
|
391
|
+
#p [encoding, ratio]
|
392
|
+
ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
|
354
393
|
}
|
355
394
|
end
|
356
395
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cmess
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5.184
|
4
|
+
version: 0.0.5.186
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jens Wille
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-01-
|
12
|
+
date: 2008-01-24 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -56,11 +56,18 @@ files:
|
|
56
56
|
- README
|
57
57
|
- ChangeLog
|
58
58
|
- Rakefile
|
59
|
-
- example/
|
60
|
-
- example/
|
61
|
-
- example/
|
62
|
-
- example/
|
63
|
-
- example/
|
59
|
+
- example/cinderella
|
60
|
+
- example/cinderella/crop
|
61
|
+
- example/cinderella/empty6-slash.txt
|
62
|
+
- example/cinderella/pot
|
63
|
+
- example/cinderella/crop_repaired
|
64
|
+
- example/cinderella/empty6-slash_repaired.txt
|
65
|
+
- example/guess_encoding
|
66
|
+
- example/guess_encoding/de.utf-8.txt
|
67
|
+
- example/guess_encoding/fr.utf-8.txt
|
68
|
+
- example/guess_encoding/check_results
|
69
|
+
- example/guess_encoding/en.utf-8.txt
|
70
|
+
- example/guess_encoding/it.utf-8.txt
|
64
71
|
- data/csets
|
65
72
|
- data/csets/iso_8859-1.yaml
|
66
73
|
- data/csets/iso_8859-15.yaml
|
@@ -81,17 +88,18 @@ files:
|
|
81
88
|
- data/csets/unicode/latin_1_supplement.yaml
|
82
89
|
- data/csets/unicode/basic_latin.yaml
|
83
90
|
- data/csets/unicode/cyrillic.yaml
|
91
|
+
- data/test_chars.yaml
|
84
92
|
has_rdoc: true
|
85
93
|
homepage: http://prometheus.rubyforge.org/cmess
|
86
94
|
post_install_message:
|
87
95
|
rdoc_options:
|
88
96
|
- --line-numbers
|
89
97
|
- --inline-source
|
98
|
+
- --all
|
90
99
|
- --charset
|
91
100
|
- UTF-8
|
92
101
|
- --main
|
93
102
|
- README
|
94
|
-
- --all
|
95
103
|
- --title
|
96
104
|
- cmess Application documentation
|
97
105
|
require_paths:
|