cmess 0.0.4.136

Sign up to get free protection for your applications and to get access to all the features.
data/lib/cmess.rb ADDED
@@ -0,0 +1,44 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # cmess -- Assist with handling messed up encodings #
5
+ # #
6
+ # Copyright (C) 2007 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ # Bundles several tools that aim at dealing with various problems occurring in
30
+ # the context of character sets and encodings. Currently, there are:
31
+ #
32
+ # guess_encoding:: Simple helper to identify the encoding of a given string.
33
+ # Includes the ability to automatically detect the encoding
34
+ # of an input. (see GuessEncoding)
35
+ # cinderella:: When characters are "double encoded", you can't easily
36
+ # convert them back -- this is where cinderella comes in,
37
+ # sorting the good ones into the pot and the (potentially)
38
+ # bad ones into the crop... (see Cinderella)
39
+ # decode_entities:: Decode HTML entities in a string. (see DecodeEntities)
40
+
41
# Top-level namespace; the individual tools live in nested modules that
# are defined in their own files.
module CMess; end
43
+
44
+ require 'cmess/version'
@@ -0,0 +1,63 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'iconv'
30
+
31
+ # Find (and possibly repair) doubly encoded characters. Here's how it's done:
32
+ #
33
+ # Treats characters encoded in target encoding as if they were encoded in
34
+ # source encoding, converts them to target encoding and "grep"s for lines
35
+ # containing those doubly encoded characters; if asked to repair doubly
36
+ # encoded characters, substitutes them with their original character.
37
+
38
module CMess::Cinderella

  extend self

  # our version ;-)
  VERSION = '0.0.3'

  # Sorts the lines of +input+ into two outputs: lines containing at least
  # one doubly encoded character go to +crop+, all others go to +pot+.
  # Either output may be nil/false, in which case the respective lines are
  # silently dropped.
  #
  # input::           anything responding to +each+ and yielding lines
  # pot::             receiver of +puts+ for "good" lines
  # crop::            receiver of +puts+ for "bad" lines
  # source_encoding:: encoding the characters are (wrongly) treated as
  # target_encoding:: encoding the characters are actually in
  # chars::           the characters (in target encoding) to look for
  # repair::          if true, doubly encoded characters are substituted
  #                   (in place) by their original before being written
  #
  # Returns whatever <tt>input.each</tt> returns.
  def pick(input, pot, crop, source_encoding, target_encoding, chars, repair = false)
    converter = Iconv.new(target_encoding, source_encoding)

    # doubly encoded form => original character
    originals = {}
    chars.each { |char| originals[converter.iconv(char)] = char }

    pattern = Regexp.union(*originals.keys)

    input.each { |line|
      destination = line =~ pattern ? crop : pot
      next unless destination

      line.gsub!(pattern) { |match| originals[match] } if repair

      destination.puts(line)
    }
  end

end
data/lib/cmess/cli.rb ADDED
@@ -0,0 +1,79 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ module CMess::CLI
30
+
31
# Terminates the program (via +abort+) with an error message unless
# +file+ is readable; returns nil otherwise.
def ensure_readable(file)
  return if File.readable?(file)

  abort "Can't find input file: #{file}"
end
34
+
35
# Terminates the program (via +abort+) with an error message unless
# +dir+ is an existing directory; returns nil otherwise.
def ensure_directory(dir)
  return if File.directory?(dir)

  abort "Directory not found: #{dir}"
end
38
+
39
# Prepares +file+ for in-place modification: reads all of its lines into
# memory first, then re-opens the file for writing (which truncates it).
#
# Returns a two-element array: the lines read and the open File handle.
# Closing the handle is left to the caller.
def open_file_in_place(file)
  ensure_readable(file)

  lines  = File.readlines(file)
  handle = File.open(file, 'w')

  [lines, handle]
end
43
+
44
# Opens +file+ in the given +mode+, treating the special name '-' as one
# of the standard streams:
#
# 'r':: STDIN
# 'w':: STDOUT
# 'a':: STDERR
#
# Any other mode combined with '-' raises an ArgumentError. For regular
# files, readability is ensured first unless the file is opened in plain
# write mode ('w').
#
# Returns the IO object.
def open_file_or_std(file, mode = 'r')
  if file == '-'
    # NOTE: "when x then y" instead of the colon form, which is no longer
    # valid syntax as of Ruby 1.9.
    case mode
    when 'r' then STDIN
    when 'w' then STDOUT
    when 'a' then STDERR
    else raise ArgumentError, "don't know how to handle mode '#{mode}'"
    end
  else
    ensure_readable(file) unless mode == 'w'
    File.open(file, mode)
  end
end
57
+
58
# Determines the system's encoding, trying in order:
#
# 1. the SYSTEM_ENCODING environment variable,
# 2. the codeset part of the LANG environment variable
#    (<tt>language_TERRITORY.codeset@modifier</tt>),
# 3. the result of +system_encoding_not_found+.
def determine_system_encoding
  # LANG may be unset; a trailing "@modifier" (as in "de_DE.ISO-8859-15@euro")
  # is not part of the codeset and must not end up in the encoding name.
  ENV['SYSTEM_ENCODING'] ||
  (ENV['LANG'] || '')[/\.([^@]+)/, 1] ||
  system_encoding_not_found
end
63
+
64
# Builds the placeholder used when the system's encoding could not be
# determined: a lambda that aborts with an explanatory message when
# called, and whose +to_s+ yields 'NOT FOUND'.
def system_encoding_not_found
  placeholder = lambda {
    abort <<-EOT
Your system's encoding couldn't be determined automatically -- please specify it
explicitly via the SYSTEM_ENCODING environment variable or via the '-t' option.
    EOT
  }

  # only this one object gets the custom string representation
  class << placeholder
    def to_s
      'NOT FOUND'
    end
  end

  placeholder
end
78
+
79
+ end
@@ -0,0 +1,68 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'iconv'
30
+
31
+ require 'rubygems'
32
+ require 'htmlentities/string'
33
+
34
module CMess::DecodeEntities

  extend self

  # our version ;-)
  VERSION = '0.0.2'

  # HTMLEntities requires UTF-8
  INTERMEDIATE_ENCODING = 'utf-8'

  # Pass-through stand-in for an Iconv instance, used whenever no actual
  # conversion is necessary: its +iconv+ returns the string unchanged.
  ICONV_DUMMY = Object.new

  def ICONV_DUMMY.iconv(string)
    string
  end

  # Decodes the HTML entities in each line of +input+ and writes the
  # result to +output+ (via +puts+).
  #
  # Lines are converted from +source_encoding+ to the intermediate
  # encoding, decoded, and then converted to +target_encoding+ (which
  # defaults to +source_encoding+). A conversion step is skipped when the
  # respective encoding equals INTERMEDIATE_ENCODING.
  #
  # Returns whatever <tt>input.each</tt> returns.
  def decode(input, output, source_encoding, target_encoding = nil)
    target_encoding ||= source_encoding

    to_intermediate =
      if source_encoding == INTERMEDIATE_ENCODING
        ICONV_DUMMY
      else
        Iconv.new(INTERMEDIATE_ENCODING, source_encoding)
      end

    from_intermediate =
      if target_encoding == INTERMEDIATE_ENCODING
        ICONV_DUMMY
      else
        Iconv.new(target_encoding, INTERMEDIATE_ENCODING)
      end

    input.each { |line|
      intermediate = to_intermediate.iconv(line)
      decoded      = intermediate.decode_entities

      output.puts from_intermediate.iconv(decoded)
    }
  end

end
@@ -0,0 +1,372 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # Contributors: #
14
+ # John Vorhauer <john@vorhauer.de> (idea and original implementation #
15
+ # for automatic encoding detection) #
16
+ # #
17
+ # cmess is free software; you can redistribute it and/or modify it under the #
18
+ # terms of the GNU General Public License as published by the Free Software #
19
+ # Foundation; either version 3 of the License, or (at your option) any later #
20
+ # version. #
21
+ # #
22
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
23
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
24
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
25
+ # details. #
26
+ # #
27
+ # You should have received a copy of the GNU General Public License along #
28
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
29
+ # #
30
+ ###############################################################################
31
+ #++
32
+
33
+ require 'iconv'
34
+
35
+ # Outputs given string (or line), being encoded in target encoding, encoded in
36
+ # various test encodings, thus allowing to identify the (seemingly) correct
37
+ # encoding by visually comparing the input string with its desired appearance.
38
+ #
39
+ # In addition to that manual procedure, may be used to detect the encoding
40
+ # automatically. Actually works pretty well -- for the supported encodings
41
+ # (see Automatic for details).
42
+
43
+ module CMess::GuessEncoding
44
+
45
+ # our version ;-)
46
+ VERSION = '0.0.5'
47
+
48
# Namespace for our encodings. Each encoding name is available as a
# (frozen) string constant; the constant's name is derived from the
# encoding's name by turning dashes into underscores and dropping any
# remaining non-word characters (e.g. 'ANSI_X3.4' => ANSI_X34).
module Encoding

  names = %w[
    UNKNOWN ASCII MACINTOSH
    ISO-8859-1 ISO-8859-2 ISO-8859-15
    CP1250 CP1251 CP1252 CP850 CP852 CP856
    UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
    UTF-7 UTF-EBCDIC SCSU BOCU-1
    ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
  ]

  names.each do |name|
    const_set(name.tr('-', '_').gsub(/\W/, ''), name.freeze)
  end

end
64
+
65
# Manual guessing: shows the input as it would look if it were encoded
# in each of a number of encodings, leaving the decision to the user.
module Manual

  extend self

  include Encoding

  # default encodings to try
  ENCODINGS = [
    ISO_8859_1, ISO_8859_2, ISO_8859_15,
    CP1250, CP1251, CP1252,
    CP850, CP852, CP856,
    UTF_8
  ]

  # likely candidates to suggest to the user
  CANDIDATES = [
    ANSI_X34, EBCDIC_AT_DE, EBCDIC_US, EUC_JP, KOI_8,
    MACINTOSH, MS_ANSI, SHIFT_JIS,
    UTF_7,
    UTF_16, UTF_16BE, UTF_16LE,
    UTF_32, UTF_32BE, UTF_32LE
  ]

  # Prints one line per encoding to try, showing +input+ converted from
  # that encoding to +target_encoding+.
  #
  # input::                the string to convert
  # target_encoding::      the encoding to convert to; always tried first
  # encodings::            the encodings to try (default: ENCODINGS)
  # additional_encodings:: further encodings to try in addition
  #
  # Conversion failures are reported in place of the converted string;
  # an invalid target encoding aborts the program.
  def display(input, target_encoding, encodings = nil, additional_encodings = [])
    target = target_encoding

    names = (encodings || ENCODINGS) + additional_encodings

    # drop duplicates, keeping the *last* occurrence of each, so that
    # additional encodings stay at the end
    names = names.reverse.uniq.reverse

    # the target encoding always goes first
    names = [target] + (names - [target])

    width    = names.map { |name| name.length }.max
    template = "%-#{width}s : %s"

    names.each { |name|
      result =
        begin
          Iconv.conv(target, name, input)
        rescue Iconv::IllegalSequence => err
          "ILLEGAL INPUT SEQUENCE: #{err}"
        rescue Iconv::InvalidEncoding
          # nothing can be converted to an invalid target
          abort "Invalid encoding: #{name}" if name == target

          "INVALID ENCODING!"
        end

      puts template % [name, result]
    }
  end

end
132
+
133
# Tries to detect the encoding of a given input by applying several
# heuristics to determine the <b>most likely</b> candidate. If no heuristic
# catches on, resorts to Encoding::UNKNOWN.
#
# If a BOM is found, it may determine the encoding directly.
#
# Heuristics ("guessers") are registered at class level through the
# private class methods +encoding+, +encodings+ and +bom_encoding+; they
# are evaluated in the context of an instance, in registration order.
class Automatic

  # Forwardable is not loaded by default; requiring it here keeps the
  # class independent of whatever else happens to be loaded.
  require 'forwardable'

  extend Forwardable

  def_delegators :@klass, :encoding_guessers, :supported_encoding?,
                          :bom_guessers,      :supported_bom?

  include Encoding

  @supported_encodings = []
  @encoding_guessers   = []
  @supported_boms      = []
  @bom_guessers        = []

  class << self

    attr_reader :supported_encodings, :encoding_guessers,
                :supported_boms,      :bom_guessers

    # Shortcut for <tt>new(input, chunk_size).guess(ignore_bom)</tt>.
    def guess(input, chunk_size = nil, ignore_bom = false)
      new(input, chunk_size).guess(ignore_bom)
    end

    # Has a guesser been registered for +encoding+?
    #
    # (Public, since instances delegate to it.)
    def supported_encoding?(encoding)
      supported_encodings.include?(encoding)
    end

    # Has a BOM guesser been registered for +encoding+?
    #
    # (Public, since instances delegate to it.)
    def supported_bom?(encoding)
      supported_boms.include?(encoding)
    end

    private

    # Registers a guesser for a single +encoding+: if +condition_block+
    # (evaluated in the context of an instance) holds, the guess is
    # +encoding+.
    def encoding(encoding, &condition_block)
      # NOTE: must be a plain proc, not a lambda -- the guesser is run via
      # instance_eval, which passes the receiver as block argument.
      encoding_block = Proc.new {
        encoding if instance_eval(&condition_block)
      }

      encodings(encoding, &encoding_block)
    end

    # Registers +encoding_block+ as guesser for each of +encodings+. The
    # block is evaluated in the context of an instance and is expected to
    # return the guessed encoding or nil/false.
    def encodings(*encodings, &encoding_block)
      encodings.each { |encoding|
        @supported_encodings << encoding
        @encoding_guessers   << encoding_block
      }
    end

    # Registers a BOM guesser for +encoding+: if +condition_block+
    # (evaluated in the context of an instance) holds, the input starts
    # with the BOM of +encoding+.
    def bom_encoding(encoding, &condition_block)
      # plain proc for the same reason as in +encoding+
      encoding_block = Proc.new {
        encoding if instance_eval(&condition_block)
      }

      @supported_boms << encoding
      @bom_guessers   << encoding_block
    end

  end

  attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte

  # input::      the IO to examine; needs to support +read+, +eof?+ and
  #              +rewind+
  # chunk_size:: number of bytes to read per round; nil reads everything
  #              at once
  def initialize(input, chunk_size = nil)
    @input      = input
    @chunk_size = chunk_size

    @klass = self.class
  end

  # Returns the guessed encoding. A BOM, if present, decides unless
  # +ignore_bom+ is set; otherwise the input is read chunk by chunk and
  # after each chunk all guessers are consulted until one of them comes
  # up with a supported encoding. Returns UNKNOWN if none does.
  def guess(ignore_bom = false)
    return bom if bom && !ignore_bom

    while read
      encoding_guessers.each { |block|
        encoding = instance_eval(&block)
        return encoding if encoding && supported_encoding?(encoding)
      }
    end

    # nothing suitable found :-(
    UNKNOWN
  end

  # The encoding indicated by the input's BOM, if any. A positive result
  # is cached.
  def bom
    @bom ||= check_bom
  end

  private

  def eof?
    input.eof?
  end

  # Consults the BOM guessers in registration order and returns the first
  # supported encoding found, or nil. After each miss the input is
  # rewound; after a hit the input stays positioned behind the BOM.
  def check_bom
    return if eof?

    bom_guessers.each { |block|
      encoding = instance_eval(&block)
      return encoding if encoding && supported_bom?(encoding)

      # read bytes don't build a BOM, so rewind...
      input.rewind
    }

    # nothing suitable found :-(
    nil
  end

  # Reads the next byte from the input and returns its integer value, or
  # nil at end of input (so that inputs shorter than a BOM simply don't
  # match instead of raising).
  def next_byte
    byte = input.read(1)
    byte && byte.unpack('C').first
  end

  # Do the next bytes of the input equal +bytes+? Stops reading at the
  # first mismatch.
  def starts_with?(*bytes)
    bytes.all? { |byte|
      next_byte == byte
    }
  end

  # Is the next byte of the input one of +bytes+?
  def next_one_of?(*bytes)
    bytes.include?(next_byte)
  end

  # Reads the next chunk of the input and updates the byte statistics
  # (+byte_count+, +byte_total+, +first_byte+). Returns nil at end of
  # input, otherwise whether any bytes have been read.
  #
  # NOTE: the default is spelled <tt>self.chunk_size</tt> because a bare
  # +chunk_size+ would refer to the parameter itself (and hence be nil)
  # on current Ruby versions.
  def read(chunk_size = self.chunk_size)
    # => initialize counters
    @byte_count ||= Hash.new(0)
    @byte_total ||= 0

    return if eof?

    bytes_before = @byte_total

    input.read(chunk_size).each_byte { |byte|
      @byte_count[byte] += 1
      @byte_total       += 1

      @first_byte ||= byte
    }

    @byte_total > bytes_before
  end

  # Sums up the number of occurrences of the given byte values, which
  # may be passed as list, array(s) and/or range(s).
  def byte_count_sum(*bytes)
    # treat arrays/ranges and lists alike
    values = bytes.map { |item| Range === item ? item.to_a : item }.flatten

    values.inject(0) { |sum, n| sum + byte_count[n] }
  end

  # +count+ in relation to the total number of bytes read so far.
  def relative_byte_count(count)
    count.to_f / byte_total
  end

  ### Definition of guessing heuristics. Order matters!

  # ASCII, if all bytes are within the lower 128 bytes
  # (Unfortunately, we have to read the *whole* file to make that decision)
  encoding ASCII do
    eof? && byte_count_sum(0x0..0x7f) == byte_total
  end

  # UTF-16, if lots of NULL bytes present
  #
  # (UTF_32 is not registered as supported here, so that answer is
  # skipped by +guess+ and the remaining guessers get their turn.)
  encodings UTF_16BE, UTF_16LE, UTF_16 do
    if relative_byte_count(byte_count[0]) > 0.25
      case first_byte
      when 0   then UTF_32
      when 254 then UTF_16BE
      when 255 then UTF_16LE
      else          UTF_16
      end
    end
  end

  # UTF-8, if number of escape-bytes and following bytes
  # is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
  encoding UTF_8 do
    # NOTE: operators trail their line on purpose -- a backslash
    # continuation followed by a comment line would end the statement
    # right there and silently drop the remaining terms.
    esc_bytes = byte_count_sum(0xc0..0xdf)     +
                # => 110xxxxx 10xxxxxx
                byte_count_sum(0xe0..0xef) * 2 +
                # => 1110xxxx 10xxxxxx 10xxxxxx
                byte_count_sum(0xf0..0xf7) * 3
                # => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    fol_bytes = byte_count_sum(0x80..0xbf)
                # => 10xxxxxx

    esc_bytes > 0 && esc_bytes == fol_bytes
  end

  # Analyse statistical appearance of German umlauts (=> ÄäÖöÜüß)
  #
  # (An array of pairs rather than a hash, so that the candidates are
  # checked in this very order on every Ruby version.)
  encodings MACINTOSH, ISO_8859_1 do
    candidate = [
      [MACINTOSH,  [0x80, 0x8a, 0x85, 0x9a, 0x86, 0x9f, 0xa7]],
      [ISO_8859_1, [0xc4, 0xe4, 0xd6, 0xf6, 0xdc, 0xfc, 0xdf]]
    ].find { |pair|
      relative_byte_count(byte_count_sum(pair.last)) > 0.001
    }

    candidate && candidate.first
  end

  ### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
  ###
  ### Order matters here, too: the UTF-32LE BOM starts with the UTF-16LE
  ### BOM, so UTF-32 has to be checked before UTF-16.

  bom_encoding UTF_8 do
    starts_with?(0xef, 0xbb, 0xbf)
  end

  bom_encoding UTF_32BE do
    starts_with?(0x00, 0x00, 0xfe, 0xff)
  end

  bom_encoding UTF_32LE do
    starts_with?(0xff, 0xfe, 0x00, 0x00)
  end

  bom_encoding UTF_16BE do
    starts_with?(0xfe, 0xff)
  end

  bom_encoding UTF_16LE do
    starts_with?(0xff, 0xfe)
  end

  bom_encoding SCSU do
    starts_with?(0x0e, 0xfe, 0xff)
  end

  bom_encoding UTF_7 do
    starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
  end

  bom_encoding UTF_EBCDIC do
    starts_with?(0xdd, 0x73, 0x66, 0x73)
  end

  bom_encoding BOCU_1 do
    starts_with?(0xfb, 0xee, 0x28)
  end

end
371
+
372
+ end