cmess 0.0.4.136

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/cmess.rb ADDED
@@ -0,0 +1,44 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # cmess -- Assist with handling messed up encodings #
5
+ # #
6
+ # Copyright (C) 2007 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ # Bundles several tools that aim at dealing with various problems occurring in
30
+ # the context of character sets and encodings. Currently, there are:
31
+ #
32
+ # guess_encoding:: Simple helper to identify the encoding of a given string.
33
+ # Includes the ability to automatically detect the encoding
34
+ # of an input. (see GuessEncoding)
35
+ # cinderella:: When characters are "double encoded", you can't easily
36
+ # convert them back -- this is where cinderella comes in,
37
+ # sorting the good ones into the pot and the (potentially)
38
+ # bad ones into the crop... (see Cinderella)
39
+ # decode_entities:: Decode HTML entities in a string. (see DecodeEntities)
40
+
41
# Top-level namespace shared by all cmess components.
module CMess; end
43
+
44
+ require 'cmess/version'
@@ -0,0 +1,63 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'iconv'
30
+
31
+ # Find (and possibly repair) doubly encoded characters. Here's how it's done:
32
+ #
33
+ # Treats characters encoded in target encoding as if they were encoded in
34
+ # source encoding, converts them to target encoding and "grep"s for lines
35
+ # containing those doubly encoded characters; if asked to repair doubly
36
+ # encoded characters, substitutes them with their original character.
37
+
38
module CMess::Cinderella

  extend self

  # our version ;-)
  VERSION = '0.0.3'

  # Sorts the lines of +input+ into two outputs: lines containing a doubly
  # encoded form of one of +chars+ are written to +crop+, all other lines
  # to +pot+. A nil output silently drops the lines destined for it.
  #
  # The doubly encoded form of each character is obtained by converting it
  # from +source_encoding+ to +target_encoding+. With +repair+ set, every
  # doubly encoded sequence in a line is replaced (in place) by its original
  # character before the line is written.
  def pick(input, pot, crop, source_encoding, target_encoding, chars, repair = false)
    converter = Iconv.new(target_encoding, source_encoding)

    # doubly encoded byte sequence => original character
    originals = {}
    chars.each { |char| originals[converter.iconv(char)] = char }

    pattern = Regexp.union(*originals.keys)

    input.each do |line|
      sink = line =~ pattern ? crop : pot
      next unless sink

      line.gsub!(pattern) { |match| originals[match] } if repair
      sink.puts(line)
    end
  end

end
data/lib/cmess/cli.rb ADDED
@@ -0,0 +1,79 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
# Helper methods for the cmess command line tools: sanity checks for files
# and directories, opening of input/output streams and detection of the
# system's encoding.
#
# The namespace is opened in nested form so that this file can be loaded
# even if the top-level CMess module has not been defined yet.
module CMess
  module CLI

    # Aborts the program with an error message unless +file+ is readable.
    def ensure_readable(file)
      abort "Can't find input file: #{file}" unless File.readable?(file)
    end

    # Aborts the program with an error message unless +dir+ is a directory.
    def ensure_directory(dir)
      abort "Directory not found: #{dir}" unless File.directory?(dir)
    end

    # Prepares +file+ for being rewritten in place.
    #
    # Returns a pair: the file's current lines (read completely up front) and
    # a File handle opened for writing, which truncates the file. Closing the
    # handle is left to the caller.
    def open_file_in_place(file)
      ensure_readable(file)
      [File.readlines(file), File.open(file, 'w')]
    end

    # Opens +file+ with the given +mode+. The special name '-' selects a
    # standard stream instead: STDIN for 'r', STDOUT for 'w', STDERR for 'a'.
    #
    # Raises ArgumentError if '-' is combined with any other mode. Aborts if
    # a regular file is not readable, unless it is opened with mode 'w'.
    def open_file_or_std(file, mode = 'r')
      if file == '-'
        # 'then' instead of the colon form of 'when', which only Ruby 1.8
        # accepts.
        case mode
          when 'r' then STDIN
          when 'w' then STDOUT
          when 'a' then STDERR
          else raise ArgumentError, "don't know how to handle mode '#{mode}'"
        end
      else
        ensure_readable(file) unless mode == 'w'
        File.open(file, mode)
      end
    end

    # Returns the system's encoding: the SYSTEM_ENCODING environment variable
    # if set, otherwise the codeset part of LANG ("de_DE.UTF-8" => "UTF-8").
    # If neither yields a value, returns the placeholder built by
    # system_encoding_not_found.
    def determine_system_encoding
      # LANG may be unset altogether; a locale modifier ("@euro") is not part
      # of the codeset and an empty codeset counts as missing.
      ENV['SYSTEM_ENCODING'] ||
      (ENV['LANG'] || '')[/\.([^@]+)/, 1] ||
      system_encoding_not_found
    end

    # Returns a lambda that aborts the program with an explanatory message
    # when called, and that renders as 'NOT FOUND' when converted to a string
    # (so it can be displayed in place of an encoding name).
    def system_encoding_not_found
      not_found = lambda {
        abort <<-EOT
Your system's encoding couldn't be determined automatically -- please specify it
explicitly via the SYSTEM_ENCODING environment variable or via the '-t' option.
        EOT
      }

      def not_found.to_s
        'NOT FOUND'
      end

      not_found
    end

  end
end
@@ -0,0 +1,68 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'iconv'
30
+
31
+ require 'rubygems'
32
+ require 'htmlentities/string'
33
+
34
module CMess::DecodeEntities

  extend self

  # our version ;-)
  VERSION = '0.0.2'

  # HTMLEntities requires UTF-8
  INTERMEDIATE_ENCODING = 'utf-8'

  # Stand-in for an Iconv instance that hands strings back unchanged; used
  # whenever a side already is in the intermediate encoding.
  ICONV_DUMMY = Object.new

  class << ICONV_DUMMY
    def iconv(string)
      string
    end
  end

  # Decodes the HTML entities in every line of +input+ and writes the result
  # to +output+.
  #
  # Lines are converted from +source_encoding+ to the intermediate encoding,
  # decoded, and then converted to +target_encoding+ (which defaults to
  # +source_encoding+).
  def decode(input, output, source_encoding, target_encoding = nil)
    target_encoding ||= source_encoding

    to_intermediate = if source_encoding == INTERMEDIATE_ENCODING
      ICONV_DUMMY
    else
      Iconv.new(INTERMEDIATE_ENCODING, source_encoding)
    end

    from_intermediate = if target_encoding == INTERMEDIATE_ENCODING
      ICONV_DUMMY
    else
      Iconv.new(target_encoding, INTERMEDIATE_ENCODING)
    end

    input.each do |line|
      decoded = to_intermediate.iconv(line).decode_entities
      output.puts from_intermediate.iconv(decoded)
    end
  end

end
@@ -0,0 +1,372 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # Contributors: #
14
+ # John Vorhauer <john@vorhauer.de> (idea and original implementation #
15
+ # for automatic encoding detection) #
16
+ # #
17
+ # cmess is free software; you can redistribute it and/or modify it under the #
18
+ # terms of the GNU General Public License as published by the Free Software #
19
+ # Foundation; either version 3 of the License, or (at your option) any later #
20
+ # version. #
21
+ # #
22
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
23
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
24
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
25
+ # details. #
26
+ # #
27
+ # You should have received a copy of the GNU General Public License along #
28
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
29
+ # #
30
+ ###############################################################################
31
+ #++
32
+
33
require 'forwardable'
require 'iconv'
34
+
35
+ # Outputs given string (or line), being encoded in target encoding, encoded in
36
+ # various test encodings, thus allowing to identify the (seemingly) correct
37
+ # encoding by visually comparing the input string with its desired appearance.
38
+ #
39
+ # In addition to that manual procedure, may be used to detect the encoding
40
+ # automatically. Works actually pretty good -- for the supported encodings
41
+ # (see Automatic for details).
42
+
43
# The namespace is opened in nested form so that this file can be loaded
# even if the top-level CMess module has not been defined yet.
module CMess
  module GuessEncoding

    # our version ;-)
    VERSION = '0.0.5'

    # Namespace for our encodings.
    #
    # Defines one frozen string constant per encoding name. The constant's
    # name is the encoding name with dashes turned into underscores and any
    # other non-word character removed (e.g. ANSI_X34 for 'ANSI_X3.4').
    module Encoding

      %w[
        UNKNOWN ASCII MACINTOSH
        ISO-8859-1 ISO-8859-2 ISO-8859-15
        CP1250 CP1251 CP1252 CP850 CP852 CP856
        UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
        UTF-7 UTF-EBCDIC SCSU BOCU-1
        ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
      ].each { |encoding|
        const = encoding.tr('-', '_').gsub(/\W/, '')
        const_set(const, encoding.freeze)
      }

    end

    # Manual guessing: prints the input converted from a number of test
    # encodings so the correct one can be picked by eye.
    module Manual

      extend self

      include Encoding

      # default encodings to try
      ENCODINGS = [
        ISO_8859_1,
        ISO_8859_2,
        ISO_8859_15,
        CP1250,
        CP1251,
        CP1252,
        CP850,
        CP852,
        CP856,
        UTF_8
      ]

      # likely candidates to suggest to the user
      CANDIDATES = [
        ANSI_X34,
        EBCDIC_AT_DE,
        EBCDIC_US,
        EUC_JP,
        KOI_8,
        MACINTOSH,
        MS_ANSI,
        SHIFT_JIS,
        UTF_7,
        UTF_16,
        UTF_16BE,
        UTF_16LE,
        UTF_32,
        UTF_32BE,
        UTF_32LE
      ]

      # Prints one line per encoding, showing +input+ converted from that
      # encoding to +target_encoding+.
      #
      # +encodings+ defaults to ENCODINGS; +additional_encodings+ are
      # appended. The target encoding itself is always listed first.
      # Conversion problems are reported in place of the converted text;
      # an invalid target encoding aborts the program.
      def display(input, target_encoding, encodings = nil, additional_encodings = [])
        target = target_encoding

        encodings = (encodings || ENCODINGS) + additional_encodings
        encodings = encodings.reverse.uniq.reverse     # uniq with additional encodings
                                                       # staying at the end
        encodings = [target] + (encodings - [target])  # move target encoding to front

        max_length = encodings.map { |encoding| encoding.length }.max

        encodings.each { |encoding|
          converted = begin
            Iconv.conv(target, encoding, input)
          rescue Iconv::IllegalSequence => err
            "ILLEGAL INPUT SEQUENCE: #{err}"
          rescue Iconv::InvalidEncoding
            if encoding == target
              abort "Invalid encoding: #{encoding}"
            else
              "INVALID ENCODING!"
            end
          end

          puts "%-#{max_length}s : %s" % [encoding, converted]
        }
      end

    end

    # Tries to detect the encoding of a given input by applying several
    # heuristics to determine the <b>most likely</b> candidate. If no heuristic
    # catches on, resorts to Encoding::UNKNOWN.
    #
    # If a BOM is found, it may determine the encoding directly.
    #
    # The input must respond to +read+, +eof?+ and +rewind+.
    class Automatic

      extend Forwardable

      # Instances consult the guesser registries kept on their class.
      def_delegators :@klass, :encoding_guessers, :supported_encoding?,
                              :bom_guessers,      :supported_bom?

      include Encoding

      @supported_encodings = []
      @encoding_guessers   = []
      @supported_boms      = []
      @bom_guessers        = []

      class << self

        attr_reader :supported_encodings, :encoding_guessers,
                    :supported_boms,      :bom_guessers

        # Convenience wrapper: guesses the encoding of +input+, reading
        # +chunk_size+ bytes at a time (everything at once if nil).
        def guess(input, chunk_size = nil, ignore_bom = false)
          new(input, chunk_size).guess(ignore_bom)
        end

        # The two predicates are public because instances delegate to them.

        def supported_encoding?(encoding)
          supported_encodings.include?(encoding)
        end

        def supported_bom?(encoding)
          supported_boms.include?(encoding)
        end

        private

        # Registers a heuristic for a single +encoding+: the encoding is
        # reported if the condition block, evaluated in the context of an
        # instance, returns true.
        def encoding(encoding, &condition_block)
          # A plain Proc, not a lambda: the block is run through
          # instance_eval, which passes the receiver as an argument, and a
          # zero-arity lambda would reject that.
          encoding_block = Proc.new {
            encoding if instance_eval(&condition_block)
          }

          encodings(encoding, &encoding_block)
        end

        # Registers a heuristic that may report any of the given +encodings+;
        # the block, evaluated in the context of an instance, returns the
        # detected encoding or a false value. The block is stored once, no
        # matter how many encodings it covers.
        def encodings(*encodings, &encoding_block)
          return if encodings.empty?

          @supported_encodings.concat(encodings)
          @encoding_guessers << encoding_block
        end

        # Registers a BOM check for +encoding+; see #encoding.
        def bom_encoding(encoding, &condition_block)
          encoding_block = Proc.new {
            encoding if instance_eval(&condition_block)
          }

          @supported_boms << encoding
          @bom_guessers   << encoding_block
        end

      end

      attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte

      def initialize(input, chunk_size = nil)
        @input      = input
        @chunk_size = chunk_size

        @klass = self.class
      end

      # Returns the encoding indicated by a BOM (unless +ignore_bom+ is set),
      # otherwise the first encoding one of the heuristics settles on after
      # reading another chunk of input, or UNKNOWN if the input is exhausted
      # without a result.
      def guess(ignore_bom = false)
        return bom if bom && !ignore_bom

        while read
          encoding_guessers.each { |block|
            encoding = instance_eval(&block)
            return encoding if encoding && supported_encoding?(encoding)
          }
        end

        # nothing suitable found :-(
        UNKNOWN
      end

      # The encoding indicated by the input's BOM, or nil. Checked only once,
      # since checking reads from (and possibly rewinds) the input.
      def bom
        @bom = check_bom unless defined?(@bom)
        @bom
      end

      private

      def eof?
        input.eof?
      end

      # Runs the BOM checks in registration order and returns the first
      # match, leaving the input positioned behind the bytes read.
      def check_bom
        return if eof?

        bom_guessers.each { |block|
          encoding = instance_eval(&block)
          return encoding if encoding && supported_bom?(encoding)

          # read bytes don't build a BOM, so rewind...
          input.rewind
        }

        # nothing suitable found :-(
        nil
      end

      # The next byte of the input as an integer, or nil at end of input.
      def next_byte
        byte = input.read(1)
        byte && byte.unpack('C').first
      end

      # Whether the next bytes of the input equal +bytes+. Stops reading at
      # the first mismatch.
      def starts_with?(*bytes)
        bytes.all? { |byte|
          next_byte == byte
        }
      end

      def next_one_of?(*bytes)
        bytes.include?(next_byte)
      end

      # Reads the next chunk of input and updates the byte statistics.
      # Returns true if any bytes were read, nil at end of input.
      def read(chunk_size = @chunk_size)
        # => initialize counters
        @byte_count ||= Hash.new(0)
        @byte_total ||= 0

        return if eof?

        bytes_before = @byte_total

        input.read(chunk_size).each_byte { |byte|
          @byte_count[byte] += 1
          @byte_total += 1

          @first_byte ||= byte
        }

        @byte_total > bytes_before
      end

      # Number of bytes read so far that equal one of +bytes+, which may be
      # given as a list of integers, an array or a range.
      def byte_count_sum(*bytes)
        # treat arrays/ranges and lists alike
        bytes = bytes.map { |b| b.is_a?(Range) ? b.to_a : b }.flatten
        bytes.inject(0) { |sum, n| sum + byte_count[n] }
      end

      def relative_byte_count(count)
        count.to_f / byte_total
      end

      ### Definition of guessing heuristics. Order matters!

      # ASCII, if all bytes are within the lower 128 bytes
      # (Unfortunately, we have to read the *whole* file to make that decision)
      encoding ASCII do
        eof? && byte_count_sum(0x0..0x7f) == byte_total
      end

      # UTF-16, if lots of NULL bytes present
      #
      # NOTE(review): UTF_32 is not among the encodings registered for this
      # heuristic, so that result is discarded by #guess -- confirm whether
      # this is intended.
      encodings UTF_16BE, UTF_16LE, UTF_16 do
        if relative_byte_count(byte_count[0]) > 0.25
          case first_byte
            when 0   then UTF_32
            when 254 then UTF_16BE
            when 255 then UTF_16LE
            else          UTF_16
          end
        end
      end

      # UTF-8, if number of escape-bytes and following bytes
      # is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
      encoding UTF_8 do
        # The operators end each line so that the trailing comments cannot
        # terminate the expression early.
        esc_bytes = byte_count_sum(0xc0..0xdf)     +  # => 110xxxxx 10xxxxxx
                    byte_count_sum(0xe0..0xef) * 2 +  # => 1110xxxx 10xxxxxx 10xxxxxx
                    byte_count_sum(0xf0..0xf7) * 3    # => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        fol_bytes = byte_count_sum(0x80..0xbf)        # => 10xxxxxx

        esc_bytes > 0 && esc_bytes == fol_bytes
      end

      # Analyse statistical appearance of German umlauts (=> ÄäÖöÜüß)
      encodings MACINTOSH, ISO_8859_1 do
        # An array of pairs keeps the order of the checks fixed.
        candidate = [
          [MACINTOSH,  [0x80, 0x8a, 0x85, 0x9a, 0x86, 0x9f, 0xa7]],
          [ISO_8859_1, [0xc4, 0xe4, 0xd6, 0xf6, 0xdc, 0xfc, 0xdf]]
        ].find { |_, umlauts|
          relative_byte_count(byte_count_sum(umlauts)) > 0.001
        }

        candidate && candidate.first
      end

      ### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
      ### Order matters here as well: the UTF-32LE BOM begins with the
      ### UTF-16LE BOM, so UTF-32 has to be checked before UTF-16.

      bom_encoding UTF_8 do
        starts_with?(0xef, 0xbb, 0xbf)
      end

      bom_encoding UTF_32BE do
        starts_with?(0x00, 0x00, 0xfe, 0xff)
      end

      bom_encoding UTF_32LE do
        starts_with?(0xff, 0xfe, 0x00, 0x00)
      end

      bom_encoding UTF_16BE do
        starts_with?(0xfe, 0xff)
      end

      bom_encoding UTF_16LE do
        starts_with?(0xff, 0xfe)
      end

      bom_encoding SCSU do
        starts_with?(0x0e, 0xfe, 0xff)
      end

      bom_encoding UTF_7 do
        starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
      end

      bom_encoding UTF_EBCDIC do
        starts_with?(0xdd, 0x73, 0x66, 0x73)
      end

      bom_encoding BOCU_1 do
        starts_with?(0xfb, 0xee, 0x28)
      end

    end

  end
end