blackwinter-cmess 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +676 -0
- data/ChangeLog +54 -0
- data/README +63 -0
- data/Rakefile +51 -0
- data/bin/bconv +130 -0
- data/bin/cinderella +190 -0
- data/bin/decode_entities +106 -0
- data/bin/guess_encoding +223 -0
- data/data/chartab.yaml +26724 -0
- data/data/csets/iso_8859-1.yaml +195 -0
- data/data/csets/iso_8859-15.yaml +204 -0
- data/data/csets/latin1.yaml +195 -0
- data/data/csets/unicode/basic_latin.yaml +97 -0
- data/data/csets/unicode/cyrillic-supplement.yaml +17 -0
- data/data/csets/unicode/cyrillic.yaml +256 -0
- data/data/csets/unicode/greek.yaml +129 -0
- data/data/csets/unicode/ipa_extensions.yaml +97 -0
- data/data/csets/unicode/latin-extended-c.yaml +18 -0
- data/data/csets/unicode/latin-extended-d.yaml +3 -0
- data/data/csets/unicode/latin_1_supplement.yaml +128 -0
- data/data/csets/unicode/latin_extended_a.yaml +129 -0
- data/data/csets/unicode/latin_extended_additional.yaml +247 -0
- data/data/csets/unicode/latin_extended_b.yaml +209 -0
- data/data/csets/unicode/letterlike_symbols.yaml +80 -0
- data/data/csets/unicode/spacing_modifier_letters.yaml +81 -0
- data/data/csets/utf-8.yaml +1504 -0
- data/data/csets/utf8.yaml +1504 -0
- data/data/test_chars.yaml +14 -0
- data/example/cinderella/crop +127 -0
- data/example/cinderella/crop_repaired +127 -0
- data/example/cinderella/empty6-slash.txt +1495 -0
- data/example/cinderella/empty6-slash_repaired.txt +1495 -0
- data/example/cinderella/pot +1368 -0
- data/example/guess_encoding/check_results +60 -0
- data/example/guess_encoding/de.utf-8.txt +10030 -0
- data/example/guess_encoding/en.utf-8.txt +10030 -0
- data/example/guess_encoding/fr.utf-8.txt +10030 -0
- data/example/guess_encoding/it.utf-8.txt +10030 -0
- data/lib/cmess/bconv.rb +169 -0
- data/lib/cmess/cinderella.rb +66 -0
- data/lib/cmess/cli.rb +120 -0
- data/lib/cmess/decode_entities.rb +69 -0
- data/lib/cmess/guess_encoding/automatic.rb +343 -0
- data/lib/cmess/guess_encoding/encoding.rb +78 -0
- data/lib/cmess/guess_encoding/manual.rb +108 -0
- data/lib/cmess/guess_encoding.rb +61 -0
- data/lib/cmess/version.rb +51 -0
- data/lib/cmess.rb +49 -0
- metadata +136 -0
@@ -0,0 +1,343 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# A component of cmess, the encoding tool-box. #
|
7
|
+
# #
|
8
|
+
# Copyright (C) 2007-2009 University of Cologne, #
|
9
|
+
# Albertus-Magnus-Platz, #
|
10
|
+
# 50932 Cologne, Germany #
|
11
|
+
# #
|
12
|
+
# Authors: #
|
13
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
14
|
+
# #
|
15
|
+
# Contributors: #
|
16
|
+
# John Vorhauer <john@vorhauer.de> (idea and original implementation #
|
17
|
+
# for automatic encoding detection) #
|
18
|
+
# #
|
19
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
20
|
+
# terms of the GNU General Public License as published by the Free Software #
|
21
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
22
|
+
# version. #
|
23
|
+
# #
|
24
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
25
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
26
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
27
|
+
# details. #
|
28
|
+
# #
|
29
|
+
# You should have received a copy of the GNU General Public License along #
|
30
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
31
|
+
# #
|
32
|
+
###############################################################################
|
33
|
+
#++
|
34
|
+
|
35
|
+
$KCODE = 'u' unless RUBY_VERSION >= '1.9'
|
36
|
+
|
37
|
+
require 'yaml'
|
38
|
+
require 'iconv'
|
39
|
+
require 'stringio'
|
40
|
+
require 'forwardable'
|
41
|
+
|
42
|
+
# Tries to detect the encoding of a given input by applying several
|
43
|
+
# heuristics to determine the <b>most likely</b> candidate. If no heuristic
|
44
|
+
# catches on, resorts to Encoding::UNKNOWN.
|
45
|
+
#
|
46
|
+
# If a BOM is found, it may determine the encoding directly.
|
47
|
+
|
48
|
+
class CMess::GuessEncoding::Automatic
|
49
|
+
|
50
|
+
extend Forwardable
|
51
|
+
|
52
|
+
def_delegators self, :encoding_guessers, :supported_encoding?,
|
53
|
+
:bom_guessers, :supported_bom?
|
54
|
+
|
55
|
+
include CMess::GuessEncoding::Encoding
|
56
|
+
|
57
|
+
# Creates (and caches) a converter from UTF-8 to the desired encoding.
ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }

# Single-byte encodings to test statistically by TEST_CHARS
TEST_ENCODINGS = [
  MACINTOSH,
  ISO_8859_1,
  ISO_8859_15,
  CP1252,
  CP850,
  MS_ANSI
]

# Certain (non-ASCII) chars to test for in TEST_ENCODINGS
#
# NOTE(review): the multiplication sign and O-with-stroke in the second row
# were restored from a garbled character. Invisible characters (NO-BREAK
# SPACE, SOFT HYPHEN) cannot be confirmed from this view -- verify the list
# against the released file.
CHARS_TO_TEST = (
  '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
  'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
).split(//)

# Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST. Entries loaded
# from data/test_chars.yaml take precedence; the default block computes (and
# stores) the byte list for any other encoding on first access and registers
# that encoding in TEST_ENCODINGS.
TEST_CHARS = Hash.new { |hash, encoding|
  # get_or_set_encoding_const is private, so it must not be called with an
  # explicit receiver; the fully qualified name also avoids picking up
  # Ruby's built-in ::Encoding class.
  encoding = CMess::GuessEncoding::Encoding.send(:get_or_set_encoding_const, encoding)

  encchars = CHARS_TO_TEST.map { |char|
    begin
      # first (and, for single-byte encodings, only) byte of the converted char
      ICONV_FOR[encoding].iconv(char).unpack('C').first
    rescue Iconv::IllegalSequence
      nil # char can't be converted to this encoding; dropped by compact
    end
  }.compact

  TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
  hash[encoding] = encchars
}.update(YAML.load_file(
  File.join(File.dirname(__FILE__), *%w[.. .. .. data test_chars.yaml])
))

# Relative count of TEST_CHARS must exceed this threshold to yield
# a direct match
TEST_THRESHOLD_DIRECT = 0.1

# Relative count of TEST_CHARS must exceed this threshold to yield
# an approximate match
TEST_THRESHOLD_APPROX = 0.0004

# Registries filled by the +encoding+/+encodings+/+bom_encoding+ declarations
@supported_encodings = []
@encoding_guessers   = []
@supported_boms      = []
@bom_guessers        = []
|
104
|
+
|
105
|
+
class << self

  attr_reader :supported_encodings, :encoding_guessers,
              :supported_boms, :bom_guessers

  # Convenience entry point: builds a guesser for +input+ (reading
  # +chunk_size+ bytes at a time) and returns the result of its #guess,
  # passing +ignore_bom+ through.
  def guess(input, chunk_size = nil, ignore_bom = false)
    new(input, chunk_size).guess(ignore_bom)
  end

  private

  # Registers a heuristic for a single +encoding+: the guesser yields
  # +encoding+ if +condition_block+ (evaluated in the context of the
  # guessing instance) is true, nil otherwise.
  def encoding(encoding, &condition_block)
    encodings(encoding, &guesser_for(encoding, condition_block))
  end

  # Registers +encoding_block+ as a heuristic that may yield any of
  # +encodings+. The block is stored only once, no matter how many
  # encodings it covers.
  def encodings(*encodings, &encoding_block)
    encodings.each { |encoding|
      @supported_encodings << encoding
      @encoding_guessers << encoding_block \
        unless @encoding_guessers.include?(encoding_block)
    }
  end

  # Has +encoding+ been registered via #encoding/#encodings?
  def supported_encoding?(encoding)
    supported_encodings.include?(encoding)
  end

  # Registers a BOM check for +encoding+; works like #encoding, but for
  # the BOM registries.
  def bom_encoding(encoding, &condition_block)
    encoding_block = guesser_for(encoding, condition_block)

    @supported_boms << encoding
    @bom_guessers << encoding_block \
      unless @bom_guessers.include?(encoding_block)
  end

  # Has +encoding+ been registered via #bom_encoding?
  def supported_bom?(encoding)
    supported_boms.include?(encoding)
  end

  # Wraps +condition_block+ into a guesser yielding +encoding+ or nil.
  #
  # This must be a plain Proc, not a lambda: guessers are run through
  # instance_eval, which passes the receiver as a block argument, and a
  # zero-arity lambda rejects that with an ArgumentError on Ruby >= 1.9.
  def guesser_for(encoding, condition_block)
    Proc.new { encoding if instance_eval(&condition_block) }
  end

end
|
151
|
+
|
152
|
+
attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte
|
153
|
+
|
154
|
+
def initialize(input, chunk_size = nil)
|
155
|
+
@input = case input
|
156
|
+
when IO # that's what we want
|
157
|
+
input
|
158
|
+
when String # convert it to an IO
|
159
|
+
StringIO.new(input)
|
160
|
+
else # um, what's that...?
|
161
|
+
raise ArgumentError, "don't know how to handle input of type #{input.class}"
|
162
|
+
end
|
163
|
+
|
164
|
+
@chunk_size = chunk_size
|
165
|
+
end
|
166
|
+
|
167
|
+
# Determines the encoding of the input.
#
# The BOM check always runs first; its result is returned unless
# +ignore_bom+ is set. Otherwise the input is read chunk by chunk, and after
# each chunk the registered heuristics are tried in order. The first
# supported encoding one of them yields wins; UNKNOWN is returned if the
# input is exhausted without a match.
def guess(ignore_bom = false)
  detected = bom
  return detected if detected && !ignore_bom

  loop do
    break unless read

    encoding_guessers.each do |guesser|
      candidate = instance_eval(&guesser)
      return candidate if candidate && supported_encoding?(candidate)
    end
  end

  # nothing suitable found :-(
  UNKNOWN
end
|
180
|
+
|
181
|
+
# Encoding indicated by the input's BOM, if any. A positive result is
# cached; as long as no BOM has been found, check_bom is run again on
# each call.
def bom
  return @bom if @bom

  @bom = check_bom
end
|
184
|
+
|
185
|
+
private
|
186
|
+
|
187
|
+
# Has the input been read to its end? Simply asks the underlying stream.
def eof?
  stream = input
  stream.eof?
end
|
190
|
+
|
191
|
+
# Runs the registered BOM checks, in order, against the start of the input.
#
# Returns the first supported encoding one of them yields (leaving the
# input positioned after the bytes that check consumed), or nil if the
# input is empty, not seekable, or no check matches. After every failed
# check the input is rewound.
def check_bom
  return if eof?

  # prevent "Illegal seek" error inside a pipe
  seekable = begin
    input.pos
    true
  rescue Errno::ESPIPE
    false
  end
  return unless seekable

  bom_guessers.each do |guesser|
    candidate = instance_eval(&guesser)
    return candidate if candidate && supported_bom?(candidate)

    # read bytes don't build a BOM, so rewind...
    input.rewind
  end

  # nothing suitable found :-(
  nil
end
|
212
|
+
|
213
|
+
# Reads a single byte from the input and returns it as an Integer, or nil
# if the input is exhausted (so that BOM checks on inputs shorter than the
# BOM simply fail to match instead of raising).
def next_byte
  byte = input.read(1)
  byte.unpack('C').first if byte
end
|
216
|
+
|
217
|
+
# Do the next bytes of the input equal +bytes+, in that order? Consumes one
# byte per comparison and stops reading at the first mismatch.
def starts_with?(*bytes)
  bytes.each { |expected| return false unless next_byte == expected }
  true
end
|
222
|
+
|
223
|
+
# Is the next byte of the input one of +bytes+? Always consumes that byte.
def next_one_of?(*bytes)
  actual = next_byte
  bytes.any? { |candidate| candidate == actual }
end
|
226
|
+
|
227
|
+
# Reads the next chunk of the input and updates the byte statistics
# (per-byte counts, total number of bytes, very first byte).
#
# chunk_size:: number of bytes to read; defaults to the size configured on
#              the instance (nil meaning "the rest of the input"). The
#              default reads the instance variable, because a parameter
#              default named like the parameter itself just evaluates to
#              nil on Ruby >= 1.9.
#
# Returns true if any bytes were read, false if the read yielded none, and
# nil if the input was already exhausted.
def read(chunk_size = @chunk_size)
  # => initialize counters
  @byte_count ||= Hash.new(0)
  @byte_total ||= 0

  return if eof?

  bytes_before = @byte_total

  input.read(chunk_size).each_byte { |byte|
    @byte_count[byte] += 1
    @byte_total += 1

    @first_byte ||= byte
  }

  @byte_total > bytes_before
end
|
245
|
+
|
246
|
+
# Sums up the counts of the given byte values. +bytes+ may be a list of
# Integers, Ranges and/or Arrays (arrays may be nested); they are all
# treated alike. Without arguments the sum is 0.
#
# Ranges and arrays are expanded explicitly instead of relying on splat
# unwrapping, which stopped unwrapping single-element arrays in Ruby 1.9.
def byte_count_sum(*bytes)
  bytes.inject(0) { |sum, item|
    case item
    when Array then sum + byte_count_sum(*item)
    when Range then sum + item.inject(0) { |subtotal, n| subtotal + byte_count[n] }
    else sum + byte_count[item]
    end
  }
end
|
250
|
+
|
251
|
+
# Relates +count+ to the total number of bytes read so far; the result is
# always a Float.
def relative_byte_count(count)
  share = count.to_f
  share / byte_total
end
|
254
|
+
|
255
|
+
### Definition of guessing heuristics. Order matters!

# ASCII, if all bytes are within the lower 128 bytes
# (Unfortunately, we have to read the *whole* file to make that decision)
encoding ASCII do
  eof? && byte_total == byte_count_sum(0x0..0x7f)
end

# UTF-16, if lots of NULL bytes present; the first byte selects the flavour.
#
# NOTE(review): UTF_32 is yielded for a leading NULL byte, but it is not
# among the encodings declared here, so #guess will not accept it and moves
# on to the next heuristic -- confirm whether that is intended.
encodings UTF_16BE, UTF_16LE, UTF_16 do
  null_ratio = relative_byte_count(byte_count[0])

  if null_ratio > 0.25
    { 0x0 => UTF_32, 0xfe => UTF_16BE, 0xff => UTF_16LE }.fetch(first_byte, UTF_16)
  end
end

# UTF-8, if number of escape-bytes and following bytes
# is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
encoding UTF_8 do
  two_byte_leads   = byte_count_sum(0xc0..0xdf)  # => 110xxxxx 10xxxxxx
  three_byte_leads = byte_count_sum(0xe0..0xef)  # => 1110xxxx 10xxxxxx 10xxxxxx
  four_byte_leads  = byte_count_sum(0xf0..0xf7)  # => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

  # every lead byte announces a fixed number of following bytes
  expected_followers = two_byte_leads + three_byte_leads * 2 + four_byte_leads * 3
  actual_followers   = byte_count_sum(0x80..0xbf)  # => 10xxxxxx

  expected_followers > 0 && expected_followers == actual_followers
end

# Analyse statistical appearance of German umlauts and other accented
# letters (see TEST_CHARS)
encodings(*TEST_ENCODINGS) do
  approximate = {}

  # The first encoding whose test chars exceed the direct threshold wins
  # right away; the others are remembered by their ratio (first encoding
  # per ratio only).
  direct = TEST_ENCODINGS.find { |candidate|
    ratio = relative_byte_count(byte_count_sum(TEST_CHARS[candidate]))

    if ratio >= TEST_THRESHOLD_DIRECT
      true
    else
      approximate[ratio] ||= candidate
      false
    end
  }

  if direct
    direct
  else
    # no direct match: settle for the best ratio, if it's good enough
    best_ratio, best_candidate = approximate.sort.last
    best_candidate if best_ratio >= TEST_THRESHOLD_APPROX
  end
end
|
304
|
+
|
305
|
+
### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
#
# The checks are tried in the order they are declared. The UTF-32 checks
# come before the UTF-16 ones, because the UTF-32LE signature (FF FE 00 00)
# starts with the UTF-16LE signature (FF FE) and would otherwise never be
# reached.

bom_encoding UTF_8 do
  starts_with?(0xef, 0xbb, 0xbf)
end

bom_encoding UTF_32BE do
  starts_with?(0x00, 0x00, 0xfe, 0xff)
end

bom_encoding UTF_32LE do
  starts_with?(0xff, 0xfe, 0x00, 0x00)
end

bom_encoding UTF_16BE do
  starts_with?(0xfe, 0xff)
end

bom_encoding UTF_16LE do
  starts_with?(0xff, 0xfe)
end

bom_encoding SCSU do
  starts_with?(0x0e, 0xfe, 0xff)
end

# three fixed bytes, followed by one out of four possible bytes
bom_encoding UTF_7 do
  starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
end

bom_encoding UTF_EBCDIC do
  starts_with?(0xdd, 0x73, 0x66, 0x73)
end

bom_encoding BOCU_1 do
  starts_with?(0xfb, 0xee, 0x28)
end
|
342
|
+
|
343
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# Contributors: #
|
14
|
+
# John Vorhauer <john@vorhauer.de> (idea and original implementation #
|
15
|
+
# for automatic encoding detection) #
|
16
|
+
# #
|
17
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
18
|
+
# terms of the GNU General Public License as published by the Free Software #
|
19
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
20
|
+
# version. #
|
21
|
+
# #
|
22
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
23
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
24
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
25
|
+
# details. #
|
26
|
+
# #
|
27
|
+
# You should have received a copy of the GNU General Public License along #
|
28
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
29
|
+
# #
|
30
|
+
###############################################################################
|
31
|
+
#++
|
32
|
+
|
33
|
+
# Namespace for our encodings.
|
34
|
+
|
35
|
+
module CMess::GuessEncoding::Encoding

  extend self

  # All encodings known to the system's +iconv+ command. The list is
  # determined on first use and then kept in the ALL_ENCODINGS constant.
  def all_encodings
    if const_defined?(:ALL_ENCODINGS)
      ALL_ENCODINGS
    else
      const_set(:ALL_ENCODINGS, get_all_encodings)
    end
  end

  private

  # Asks <tt>iconv -l</tt> for its encodings and makes sure there's a
  # constant for each of them.
  #
  # The output is split on any whitespace, not just on newlines, since
  # some iconv implementations list all aliases of an encoding on a single
  # line, separated by spaces. Trailing slashes are stripped.
  def get_all_encodings
    names = %x{iconv -l}.split.map { |name| name.sub(/\/*\z/, '') }

    names.reject { |name| name.empty? }.map { |name|
      get_or_set_encoding_const(name)
    }
  end

  # Derives a constant name from an encoding name: dashes become
  # underscores, any other non-word character is dropped, a leading digit
  # gets prefixed with "ENC_", and the result is upcased.
  def const_name_for(encoding)
    name = encoding.tr('-', '_')
    name = name.gsub(/\W/, '')
    name = name.sub(/\A\d/, 'ENC_\&')
    name.upcase
  end

  # Defines constant +const+ holding the (frozen) +encoding+ name.
  def set_encoding_const(encoding, const = const_name_for(encoding))
    const_set(const, encoding.freeze)
  end

  # Returns the value of the constant for +encoding+, defining it first
  # if it doesn't exist yet.
  def get_or_set_encoding_const(encoding)
    const = const_name_for(encoding)

    if const_defined?(const)
      const_get(const)
    else
      set_encoding_const(encoding, const)
    end
  end

  # The encodings we know by name right from the start
  %w[
    UNKNOWN ASCII MACINTOSH
    ISO-8859-1 ISO-8859-2 ISO-8859-15
    CP1250 CP1251 CP1252 CP850 CP852 CP856
    UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
    UTF-7 UTF-EBCDIC SCSU BOCU-1
    ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
  ].each { |encoding| set_encoding_const(encoding) }

  # Whoever includes this module gets extended by it as well.
  def included(base)
    base.extend(self)
  end

end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# Contributors: #
|
14
|
+
# John Vorhauer <john@vorhauer.de> (idea and original implementation #
|
15
|
+
# for automatic encoding detection) #
|
16
|
+
# #
|
17
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
18
|
+
# terms of the GNU General Public License as published by the Free Software #
|
19
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
20
|
+
# version. #
|
21
|
+
# #
|
22
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
23
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
24
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
25
|
+
# details. #
|
26
|
+
# #
|
27
|
+
# You should have received a copy of the GNU General Public License along #
|
28
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
29
|
+
# #
|
30
|
+
###############################################################################
|
31
|
+
#++
|
32
|
+
|
33
|
+
require 'iconv'
|
34
|
+
|
35
|
+
# Outputs given string (or line), being encoded in target encoding, encoded in
|
36
|
+
# various test encodings, thus allowing to identify the (seemingly) correct
|
37
|
+
# encoding by visually comparing the input string with its desired appearance.
|
38
|
+
|
39
|
+
module CMess::GuessEncoding::Manual

  extend self

  include CMess::GuessEncoding::Encoding

  # default encodings to try
  ENCODINGS = [
    ISO_8859_1, ISO_8859_2, ISO_8859_15,
    CP1250, CP1251, CP1252,
    CP850, CP852, CP856,
    UTF_8
  ]

  # likely candidates to suggest to the user
  CANDIDATES = [
    ANSI_X34, EBCDIC_AT_DE, EBCDIC_US, EUC_JP, KOI_8,
    MACINTOSH, MS_ANSI, SHIFT_JIS,
    UTF_7, UTF_16, UTF_16BE, UTF_16LE,
    UTF_32, UTF_32BE, UTF_32LE
  ]

  # Prints +input+ once per encoding, each time converted with Iconv from
  # that encoding to +target_encoding+, one labelled line per encoding.
  #
  # encodings::            encodings to try (default: ENCODINGS); if the
  #                        combined list contains '__ALL__', every encoding
  #                        from all_encodings is tried instead
  # additional_encodings:: encodings to try on top of those
  #
  # The target encoding is always tried first. Input that can't be
  # converted and encodings Iconv doesn't know are reported in the
  # respective line; an invalid target encoding raises an ArgumentError.
  def display(input, target_encoding, encodings = nil, additional_encodings = [])
    target = target_encoding

    requested = (encodings || ENCODINGS) + additional_encodings

    candidates = if requested.include?('__ALL__')
      all_encodings
    else
      # uniq with additional encodings staying at the end
      requested.reverse.uniq.reverse
    end

    # move target encoding to front
    candidates = [target] + (candidates - [target])

    # pad labels to the longest encoding name
    width = candidates.map { |name| name.length }.max

    candidates.each do |name|
      converted = begin
        Iconv.conv(target, name, input)
      rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
        "ILLEGAL INPUT SEQUENCE: #{err}"
      rescue Iconv::InvalidEncoding
        raise ArgumentError, "invalid encoding: #{name}" if name == target
        "INVALID ENCODING!"
      end

      puts "%-#{width}s : %s" % [name, converted]
    end
  end

end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# Contributors: #
|
14
|
+
# John Vorhauer <john@vorhauer.de> (idea and original implementation #
|
15
|
+
# for automatic encoding detection) #
|
16
|
+
# #
|
17
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
18
|
+
# terms of the GNU General Public License as published by the Free Software #
|
19
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
20
|
+
# version. #
|
21
|
+
# #
|
22
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
23
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
24
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
25
|
+
# details. #
|
26
|
+
# #
|
27
|
+
# You should have received a copy of the GNU General Public License along #
|
28
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
29
|
+
# #
|
30
|
+
###############################################################################
|
31
|
+
#++
|
32
|
+
|
33
|
+
require 'cmess'
|
34
|
+
|
35
|
+
# Allows to guess an input's encoding either manually or automatically.
|
36
|
+
# Actually works pretty well -- for the supported encodings. See Manual
|
37
|
+
# and Automatic for details.
|
38
|
+
|
39
|
+
module CMess::GuessEncoding

  # our version ;-)
  VERSION = '0.0.8'

  # Shortcut for Manual.display; all arguments are passed on unchanged.
  def self.manual(*args)
    Manual.display(*args)
  end

  # Shortcut for Automatic.guess; all arguments are passed on unchanged.
  def self.automatic(*args)
    Automatic.guess(*args)
  end

end

# Load the components, in this order.
%w[encoding manual automatic].each do |name|
  require "cmess/guess_encoding/#{name}"
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007-2009 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
module CMess::Version

  MAJOR = 0
  MINOR = 2
  TINY  = 0

  # Returns array representation.
  def self.to_a
    [MAJOR, MINOR, TINY]
  end

  # Short-cut for version string.
  def self.to_s
    to_a * '.'
  end

  # The version string, published on the top-level namespace.
  CMess::VERSION = to_s

end
|
data/lib/cmess.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# cmess -- Assist with handling messed up encodings #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
# Bundles several tools that aim at dealing with various problems occurring in
|
30
|
+
# the context of character sets and encodings. Currently, there are:
|
31
|
+
#
|
32
|
+
# guess_encoding:: Simple helper to identify the encoding of a given string.
|
33
|
+
# Includes the ability to automatically detect the encoding
|
34
|
+
# of an input. (see GuessEncoding)
|
35
|
+
# cinderella:: When characters are "double encoded", you can't easily
|
36
|
+
# convert them back -- this is where cinderella comes in,
|
37
|
+
# sorting the good ones into the pot and the (potentially)
|
38
|
+
# bad ones into the crop... (see Cinderella)
|
39
|
+
# bconv:: Convert between bibliographic (and other) encodings.
|
40
|
+
# (see BConv)
|
41
|
+
# decode_entities:: Decode HTML entities in a string. (see DecodeEntities)
|
42
|
+
|
43
|
+
module CMess

  # Absolute path of the "data" directory located next to the directory
  # this file lives in.
  DATA_DIR = File.expand_path('../data', File.dirname(__FILE__))

end
|
48
|
+
|
49
|
+
require 'cmess/version'
|