blackwinter-cmess 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. data/COPYING +676 -0
  2. data/ChangeLog +54 -0
  3. data/README +63 -0
  4. data/Rakefile +51 -0
  5. data/bin/bconv +130 -0
  6. data/bin/cinderella +190 -0
  7. data/bin/decode_entities +106 -0
  8. data/bin/guess_encoding +223 -0
  9. data/data/chartab.yaml +26724 -0
  10. data/data/csets/iso_8859-1.yaml +195 -0
  11. data/data/csets/iso_8859-15.yaml +204 -0
  12. data/data/csets/latin1.yaml +195 -0
  13. data/data/csets/unicode/basic_latin.yaml +97 -0
  14. data/data/csets/unicode/cyrillic-supplement.yaml +17 -0
  15. data/data/csets/unicode/cyrillic.yaml +256 -0
  16. data/data/csets/unicode/greek.yaml +129 -0
  17. data/data/csets/unicode/ipa_extensions.yaml +97 -0
  18. data/data/csets/unicode/latin-extended-c.yaml +18 -0
  19. data/data/csets/unicode/latin-extended-d.yaml +3 -0
  20. data/data/csets/unicode/latin_1_supplement.yaml +128 -0
  21. data/data/csets/unicode/latin_extended_a.yaml +129 -0
  22. data/data/csets/unicode/latin_extended_additional.yaml +247 -0
  23. data/data/csets/unicode/latin_extended_b.yaml +209 -0
  24. data/data/csets/unicode/letterlike_symbols.yaml +80 -0
  25. data/data/csets/unicode/spacing_modifier_letters.yaml +81 -0
  26. data/data/csets/utf-8.yaml +1504 -0
  27. data/data/csets/utf8.yaml +1504 -0
  28. data/data/test_chars.yaml +14 -0
  29. data/example/cinderella/crop +127 -0
  30. data/example/cinderella/crop_repaired +127 -0
  31. data/example/cinderella/empty6-slash.txt +1495 -0
  32. data/example/cinderella/empty6-slash_repaired.txt +1495 -0
  33. data/example/cinderella/pot +1368 -0
  34. data/example/guess_encoding/check_results +60 -0
  35. data/example/guess_encoding/de.utf-8.txt +10030 -0
  36. data/example/guess_encoding/en.utf-8.txt +10030 -0
  37. data/example/guess_encoding/fr.utf-8.txt +10030 -0
  38. data/example/guess_encoding/it.utf-8.txt +10030 -0
  39. data/lib/cmess/bconv.rb +169 -0
  40. data/lib/cmess/cinderella.rb +66 -0
  41. data/lib/cmess/cli.rb +120 -0
  42. data/lib/cmess/decode_entities.rb +69 -0
  43. data/lib/cmess/guess_encoding/automatic.rb +343 -0
  44. data/lib/cmess/guess_encoding/encoding.rb +78 -0
  45. data/lib/cmess/guess_encoding/manual.rb +108 -0
  46. data/lib/cmess/guess_encoding.rb +61 -0
  47. data/lib/cmess/version.rb +51 -0
  48. data/lib/cmess.rb +49 -0
  49. metadata +136 -0
@@ -0,0 +1,343 @@
1
+ # encoding: utf-8
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # A component of cmess, the encoding tool-box. #
7
+ # #
8
+ # Copyright (C) 2007-2009 University of Cologne, #
9
+ # Albertus-Magnus-Platz, #
10
+ # 50932 Cologne, Germany #
11
+ # #
12
+ # Authors: #
13
+ # Jens Wille <jens.wille@uni-koeln.de> #
14
+ # #
15
+ # Contributors: #
16
+ # John Vorhauer <john@vorhauer.de> (idea and original implementation #
17
+ # for automatic encoding detection) #
18
+ # #
19
+ # cmess is free software; you can redistribute it and/or modify it under the #
20
+ # terms of the GNU General Public License as published by the Free Software #
21
+ # Foundation; either version 3 of the License, or (at your option) any later #
22
+ # version. #
23
+ # #
24
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
25
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
26
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
27
+ # details. #
28
+ # #
29
+ # You should have received a copy of the GNU General Public License along #
30
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
31
+ # #
32
+ ###############################################################################
33
+ #++
34
+
35
+ $KCODE = 'u' unless RUBY_VERSION >= '1.9'
36
+
37
+ require 'yaml'
38
+ require 'iconv'
39
+ require 'stringio'
40
+ require 'forwardable'
41
+
42
+ # Tries to detect the encoding of a given input by applying several
43
+ # heuristics to determine the <b>most likely</b> candidate. If no heuristic
44
+ # catches on, resorts to Encoding::UNKNOWN.
45
+ #
46
+ # If a BOM is found, it may determine the encoding directly.
47
+
48
+ class CMess::GuessEncoding::Automatic
49
+
50
# Instance-level calls to the guesser registries and their lookup
# predicates are forwarded to the class, which maintains them.
extend Forwardable
def_delegators(self, :encoding_guessers, :supported_encoding?, :bom_guessers, :supported_bom?)

include CMess::GuessEncoding::Encoding

# Lazily filled cache of converters, keyed by target encoding; each one
# converts from UTF-8 into that encoding.
ICONV_FOR = Hash.new { |cache, target| cache[target] = Iconv.new(target, UTF_8) }

# Single-byte encodings to test statistically by TEST_CHARS
# (left unfrozen: TEST_CHARS appends encodings it is asked about).
TEST_ENCODINGS = [MACINTOSH, ISO_8859_1, ISO_8859_15, CP1252, CP850, MS_ANSI]
69
+
70
# Certain (non-ASCII) chars to test for in TEST_ENCODINGS: the printable
# characters Windows-1252 places in 0x80..0x9F, followed by every character
# of the Latin-1 range U+00A1..U+00FF.
#
# The Latin-1 run is generated from its code points instead of being spelled
# out, so that no character can get lost or mangled by an editor or a
# transcoding step (the literal used to contain a stray Hebrew letter where
# the multiplication sign and O-with-stroke belong).
CHARS_TO_TEST = (
  '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ' +
  (0xa1..0xff).to_a.pack('U*')
).split(//)
75
+
76
# Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST.
#
# The table is pre-filled from data/test_chars.yaml. For any encoding not
# listed there, the default block computes the byte values on demand by
# converting each test char with iconv, registers the encoding in
# TEST_ENCODINGS and caches the result.
TEST_CHARS = Hash.new { |hash, encoding|
  # Reference the module by its full name: a bare +Encoding+ is not
  # resolved to our namespace from inside this class. The helper is
  # private there, hence +send+.
  encoding = CMess::GuessEncoding::Encoding.send(:get_or_set_encoding_const, encoding)

  encchars = CHARS_TO_TEST.map { |char|
    begin
      # first byte of the converted char (+first+ instead of a splat, which
      # yields an Array rather than the byte itself on Ruby >= 1.9)
      ICONV_FOR[encoding].iconv(char).unpack('C').first
    rescue Iconv::IllegalSequence
      nil # char not representable in this encoding; dropped by compact
    end
  }.compact

  TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
  hash[encoding] = encchars
}.update(YAML.load_file(
  File.join(File.dirname(__FILE__), *%w[.. .. .. data test_chars.yaml])
))
91
+
92
# Share of TEST_CHARS bytes (relative to all bytes read) an encoding has to
# reach in order to yield a direct match
TEST_THRESHOLD_DIRECT = 0.1

# Share of TEST_CHARS bytes (relative to all bytes read) an encoding has to
# reach in order to yield an approximate match
TEST_THRESHOLD_APPROX = 0.0004

# Class-level registries, filled by the guesser definitions
@supported_encodings, @encoding_guessers = [], []
@supported_boms, @bom_guessers = [], []
104
+
105
class << self

  attr_reader :supported_encodings, :encoding_guessers,
              :supported_boms, :bom_guessers

  # Convenience entry point: wraps +input+ in a new instance and returns
  # the result of its #guess (passing +ignore_bom+ through).
  def guess(input, chunk_size = nil, ignore_bom = false)
    new(input, chunk_size).guess(ignore_bom)
  end

  private

  # Registers a guesser for a single +encoding+: it yields +encoding+ if
  # +condition_block+, evaluated in the context of the instance, is true.
  def encoding(encoding, &condition_block)
    # A plain Proc, not a lambda: the guesser is later run through
    # instance_eval, which passes the receiver as a block argument -- a
    # zero-arity lambda raises ArgumentError on that (Ruby >= 1.9).
    encoding_block = Proc.new {
      encoding if instance_eval(&condition_block)
    }

    encodings(encoding, &encoding_block)
  end

  # Registers +encoding_block+ as guesser for all given +encodings+. The
  # block is expected to return the detected encoding (or nil); it is
  # stored only once, no matter how many encodings it covers.
  def encodings(*encodings, &encoding_block)
    encodings.each { |encoding|
      @supported_encodings << encoding
      @encoding_guessers << encoding_block \
        unless @encoding_guessers.include?(encoding_block)
    }
  end

  # Has +encoding+ been registered through #encoding/#encodings?
  def supported_encoding?(encoding)
    supported_encodings.include?(encoding)
  end

  # Registers a BOM guesser: it yields +encoding+ if +condition_block+,
  # evaluated in the context of the instance, is true.
  def bom_encoding(encoding, &condition_block)
    # Proc instead of lambda for the same reason as in #encoding.
    encoding_block = Proc.new {
      encoding if instance_eval(&condition_block)
    }

    @supported_boms << encoding
    @bom_guessers << encoding_block \
      unless @bom_guessers.include?(encoding_block)
  end

  # Has +encoding+ been registered through #bom_encoding?
  def supported_bom?(encoding)
    supported_boms.include?(encoding)
  end

end
151
+
152
# Readers for the instance's state: input and chunk_size are assigned in
# #initialize; byte_count, byte_total and first_byte are maintained by #read.
+ attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte
153
+
154
+ def initialize(input, chunk_size = nil)
155
+ @input = case input
156
+ when IO # that's what we want
157
+ input
158
+ when String # convert it to an IO
159
+ StringIO.new(input)
160
+ else # um, what's that...?
161
+ raise ArgumentError, "don't know how to handle input of type #{input.class}"
162
+ end
163
+
164
+ @chunk_size = chunk_size
165
+ end
166
+
167
# Determines the encoding of the input.
#
# A detected BOM settles the matter, unless +ignore_bom+ is set (the BOM
# check is performed either way). Otherwise the input is read chunk by
# chunk and, after each chunk, the registered guessers are tried in order;
# the first supported encoding any of them yields is returned. If the input
# is exhausted without a result, UNKNOWN is returned.
def guess(ignore_bom = false)
  detected_bom = bom
  return detected_bom if detected_bom && !ignore_bom

  while read
    encoding_guessers.each do |guesser|
      candidate = instance_eval(&guesser)
      next unless candidate

      return candidate if supported_encoding?(candidate)
    end
  end

  # nothing suitable found :-(
  UNKNOWN
end
180
+
181
# The encoding indicated by the input's BOM, if any. A positive result is
# remembered; as long as none has been found, each call checks again.
def bom
  @bom = check_bom unless @bom
  @bom
end
184
+
185
+ private
186
+
187
# Tells whether the input has been consumed completely (simply asks the
# input's own eof?).
+ def eof?
188
+ input.eof?
189
+ end
190
+
191
# Tries the registered BOM guessers, in order, against the beginning of
# the input and returns the first supported encoding one of them yields;
# nil if there is none, if the input is already at its end, or if the
# input cannot be repositioned.
#
# After every failed attempt the input is rewound; after a successful one
# it is left wherever the guesser stopped reading.
def check_bom
  return if eof?

  # prevent "Illegal seek" error inside a pipe
  begin
    input.pos
  rescue Errno::ESPIPE
    return
  end

  bom_guessers.each do |guesser|
    candidate = instance_eval(&guesser)
    return candidate if candidate && supported_bom?(candidate)

    # read bytes don't build a BOM, so rewind...
    input.rewind
  end

  # nothing suitable found :-(
  nil
end
212
+
213
# Reads the next byte from the input and returns it as an Integer, or nil
# if the input is exhausted (so that probing for a BOM on an input shorter
# than the signature simply fails to match instead of raising).
def next_byte
  chunk = input.read(1)
  chunk.unpack('C').first if chunk
end
216
+
217
# Do the next bytes of the input equal +bytes+, one after the other?
# Reading stops at the first mismatch.
def starts_with?(*bytes)
  bytes.each { |expected| return false unless next_byte == expected }
  true
end
222
+
223
# Is the next byte of the input (which gets consumed) one of +bytes+?
def next_one_of?(*bytes)
  candidate = next_byte
  bytes.any? { |byte| byte == candidate }
end
226
+
227
# Reads the next chunk of the input and updates the byte statistics:
# +byte_count+ (occurrences per byte value), +byte_total+ (bytes read so
# far) and +first_byte+ (very first byte read).
#
# chunk_size:: bytes to read; defaults to the instance's chunk size
#              (+nil+ reads the remaining input in one go)
#
# Returns nil at the end of the input; otherwise whether any bytes have
# been read.
#
# The default is written as an explicit method call: a bare +chunk_size+
# would refer to the (still unset) parameter itself on newer Rubies and
# thereby always be nil.
def read(chunk_size = chunk_size())
  # => initialize counters
  @byte_count ||= Hash.new(0)
  @byte_total ||= 0

  return if eof?

  bytes_before = @byte_total

  input.read(chunk_size).each_byte { |byte|
    @byte_count[byte] += 1
    @byte_total += 1

    @first_byte ||= byte
  }

  @byte_total > bytes_before
end
245
+
246
# Sums up the occurrences (see +byte_count+) of the given byte values.
# Accepts plain byte values as well as ranges or arrays of them, in any
# combination.
def byte_count_sum(*bytes)
  bytes.inject(0) { |sum, item|
    # Array() treats single values, arrays and ranges alike -- explicitly,
    # rather than relying on splat assignment, whose unwrapping of a
    # single element differs between Ruby versions
    Array(item).inject(sum) { |subtotal, byte| subtotal + byte_count[byte] }
  }
end
250
+
251
# Relates +count+ to the total number of bytes read so far (as a Float).
def relative_byte_count(count)
  total = byte_total
  count.to_f / total
end
254
+
255
### Definition of guessing heuristics. Order matters!

# ASCII, if all bytes are within the lower 128 bytes
# (Unfortunately, we have to read the *whole* file to make that decision)
encoding(ASCII) {
  eof? && byte_total == byte_count_sum(0x0..0x7f)
}

# UTF-16, if lots of NULL bytes present (more than a quarter of all bytes);
# the first byte read then decides on the flavour.
# NOTE(review): a leading NULL byte yields UTF_32, which is not among the
# encodings registered here -- confirm whether that is intended.
encodings(UTF_16BE, UTF_16LE, UTF_16) {
  null_ratio = relative_byte_count(byte_count[0])

  if null_ratio > 0.25
    case first_byte
      when 0x0  then UTF_32
      when 0xfe then UTF_16BE
      when 0xff then UTF_16LE
      else           UTF_16
    end
  end
}

# UTF-8, if number of escape-bytes and following bytes
# is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
encoding(UTF_8) {
  # continuation bytes announced by the lead bytes:
  #   110xxxxx => 1, 1110xxxx => 2, 11110xxx => 3
  announced = byte_count_sum(0xc0..0xdf) +
              byte_count_sum(0xe0..0xef) * 2 +
              byte_count_sum(0xf0..0xf7) * 3

  # continuation bytes actually present: 10xxxxxx
  present = byte_count_sum(0x80..0xbf)

  announced > 0 && announced == present
}

# Analyse statistical appearance of German umlauts and other accented
# letters (see TEST_CHARS): the first encoding whose test chars reach the
# direct threshold wins; failing that, the encoding with the highest ratio
# is taken, provided it reaches the approximate threshold.
encodings(*TEST_ENCODINGS) do
  first_for_ratio = {}

  # fallback, consulted only if no encoding matched directly
  best_approximation = lambda {
    top_ratio, top_encoding = first_for_ratio.sort.last
    top_encoding if top_ratio >= TEST_THRESHOLD_APPROX
  }

  TEST_ENCODINGS.find(best_approximation) do |candidate|
    ratio = relative_byte_count(byte_count_sum(TEST_CHARS[candidate]))

    if ratio >= TEST_THRESHOLD_DIRECT
      true
    else
      # remember the first encoding seen for each ratio
      first_for_ratio[ratio] ||= candidate
      false
    end
  end
end
304
+
305
### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
#
# The guessers are tried in the order of their definition and the first
# match wins. Hence the UTF-32 signatures have to come before the UTF-16
# ones: the UTF-32LE BOM (ff fe 00 00) starts with the UTF-16LE BOM (ff fe)
# and would otherwise never be recognized.

bom_encoding UTF_8 do
  starts_with?(0xef, 0xbb, 0xbf)
end

bom_encoding UTF_32BE do
  starts_with?(0x00, 0x00, 0xfe, 0xff)
end

bom_encoding UTF_32LE do
  starts_with?(0xff, 0xfe, 0x00, 0x00)
end

bom_encoding UTF_16BE do
  starts_with?(0xfe, 0xff)
end

bom_encoding UTF_16LE do
  starts_with?(0xff, 0xfe)
end

bom_encoding SCSU do
  starts_with?(0x0e, 0xfe, 0xff)
end

# three fixed bytes, followed by one out of four possible bytes
bom_encoding UTF_7 do
  starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
end

bom_encoding UTF_EBCDIC do
  starts_with?(0xdd, 0x73, 0x66, 0x73)
end

bom_encoding BOCU_1 do
  starts_with?(0xfb, 0xee, 0x28)
end
342
+
343
+ end
@@ -0,0 +1,78 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007-2008 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # Contributors: #
14
+ # John Vorhauer <john@vorhauer.de> (idea and original implementation #
15
+ # for automatic encoding detection) #
16
+ # #
17
+ # cmess is free software; you can redistribute it and/or modify it under the #
18
+ # terms of the GNU General Public License as published by the Free Software #
19
+ # Foundation; either version 3 of the License, or (at your option) any later #
20
+ # version. #
21
+ # #
22
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
23
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
24
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
25
+ # details. #
26
+ # #
27
+ # You should have received a copy of the GNU General Public License along #
28
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
29
+ # #
30
+ ###############################################################################
31
+ #++
32
+
33
+ # Namespace for our encodings.
34
+
35
module CMess::GuessEncoding::Encoding

  # all methods are available on the module itself, too
  extend self

  # All encodings reported by <tt>iconv -l</tt>. The list is determined on
  # first use and kept in the constant ALL_ENCODINGS from then on.
  def all_encodings
    return ALL_ENCODINGS if const_defined?(:ALL_ENCODINGS)

    const_set(:ALL_ENCODINGS, get_all_encodings)
  end

  private

  # Asks the +iconv+ executable for its encodings (one per output line,
  # trailing slashes removed) and makes sure each has its constant.
  def get_all_encodings
    listing = %x{iconv -l}.split("\n")

    listing.map { |line|
      get_or_set_encoding_const(line.sub(/\/*\z/, ''))
    }
  end

  # Derives a constant name from an encoding name: dashes become
  # underscores, any other non-word characters are dropped, a leading digit
  # gets prefixed with <tt>ENC_</tt>, and the whole is upcased.
  def const_name_for(encoding)
    name = encoding.tr('-', '_')
    name = name.gsub(/\W/, '')
    name = name.sub(/\A\d/, 'ENC_\&')
    name.upcase
  end

  # Defines constant +const+ holding the (frozen) +encoding+ name.
  def set_encoding_const(encoding, const = const_name_for(encoding))
    const_set(const, encoding.freeze)
  end

  # Returns the value of the constant for +encoding+, defining it first if
  # need be.
  def get_or_set_encoding_const(encoding)
    const = const_name_for(encoding)
    return const_get(const) if const_defined?(const)

    set_encoding_const(encoding, const)
  end

  # the encodings known from the start
  %w[
    UNKNOWN ASCII MACINTOSH
    ISO-8859-1 ISO-8859-2 ISO-8859-15
    CP1250 CP1251 CP1252 CP850 CP852 CP856
    UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
    UTF-7 UTF-EBCDIC SCSU BOCU-1
    ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
  ].each { |name| set_encoding_const(name) }

  # Whoever includes this module gets extended by it as well.
  def included(base)
    base.extend self
  end

end
@@ -0,0 +1,108 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007-2008 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # Contributors: #
14
+ # John Vorhauer <john@vorhauer.de> (idea and original implementation #
15
+ # for automatic encoding detection) #
16
+ # #
17
+ # cmess is free software; you can redistribute it and/or modify it under the #
18
+ # terms of the GNU General Public License as published by the Free Software #
19
+ # Foundation; either version 3 of the License, or (at your option) any later #
20
+ # version. #
21
+ # #
22
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
23
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
24
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
25
+ # details. #
26
+ # #
27
+ # You should have received a copy of the GNU General Public License along #
28
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
29
+ # #
30
+ ###############################################################################
31
+ #++
32
+
33
+ require 'iconv'
34
+
35
+ # Prints the given string (or line) converted to the target encoding from each
36
+ # of various test encodings, thus making it possible to identify the (seemingly)
37
+ # correct encoding by visually comparing the output with its desired appearance.
38
+
39
module CMess::GuessEncoding::Manual

  extend self

  include CMess::GuessEncoding::Encoding

  # default encodings to try
  ENCODINGS = [
    ISO_8859_1, ISO_8859_2, ISO_8859_15,
    CP1250, CP1251, CP1252,
    CP850, CP852, CP856,
    UTF_8
  ]

  # likely candidates to suggest to the user
  CANDIDATES = [
    ANSI_X34, EBCDIC_AT_DE, EBCDIC_US, EUC_JP, KOI_8,
    MACINTOSH, MS_ANSI, SHIFT_JIS,
    UTF_7, UTF_16, UTF_16BE, UTF_16LE, UTF_32, UTF_32BE, UTF_32LE
  ]

  # Prints one line per encoding, showing +input+ converted from that
  # encoding into +target_encoding+.
  #
  # encodings::            encodings to try (default: ENCODINGS); if
  #                        <tt>'__ALL__'</tt> is among them, all encodings
  #                        known to iconv are tried
  # additional_encodings:: further encodings to try, appended to the list
  #
  # The target encoding always comes first. Input that cannot be converted
  # and unknown encodings are reported in place of the converted text;
  # an invalid target encoding raises ArgumentError.
  def display(input, target_encoding, encodings = nil, additional_encodings = [])
    candidates = (encodings || ENCODINGS) + additional_encodings

    candidates = if candidates.include?('__ALL__')
      all_encodings
    else
      # uniq with additional encodings staying at the end
      candidates.reverse.uniq.reverse
    end

    # move target encoding to front
    ordered = [target_encoding] + (candidates - [target_encoding])

    width = ordered.map { |name| name.length }.max
    row   = "%-#{width}s : %s"

    ordered.each do |source|
      converted = begin
        Iconv.conv(target_encoding, source, input)
      rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
        "ILLEGAL INPUT SEQUENCE: #{err}"
      rescue Iconv::InvalidEncoding
        raise ArgumentError, "invalid encoding: #{source}" if source == target_encoding
        "INVALID ENCODING!"
      end

      puts row % [source, converted]
    end
  end

end
@@ -0,0 +1,61 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007-2008 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # Contributors: #
14
+ # John Vorhauer <john@vorhauer.de> (idea and original implementation #
15
+ # for automatic encoding detection) #
16
+ # #
17
+ # cmess is free software; you can redistribute it and/or modify it under the #
18
+ # terms of the GNU General Public License as published by the Free Software #
19
+ # Foundation; either version 3 of the License, or (at your option) any later #
20
+ # version. #
21
+ # #
22
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
23
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
24
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
25
+ # details. #
26
+ # #
27
+ # You should have received a copy of the GNU General Public License along #
28
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
29
+ # #
30
+ ###############################################################################
31
+ #++
32
+
33
+ require 'cmess'
34
+
35
+ # Lets you guess an input's encoding either manually or automatically.
36
+ # Actually works pretty well -- for the supported encodings. See Manual
37
+ # and Automatic for details.
38
+
39
module CMess::GuessEncoding

  # our version ;-)
  VERSION = '0.0.8'

  # Manual guessing; all arguments are handed on to Manual.display.
  def self.manual(*args)
    Manual.display(*args)
  end

  # Automatic guessing; all arguments are handed on to Automatic.guess.
  def self.automatic(*args)
    Automatic.guess(*args)
  end

end
57
+
58
# load the components, in this order
%w[encoding manual automatic].each do |name|
  require "cmess/guess_encoding/#{name}"
end
@@ -0,0 +1,51 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007-2009 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
module CMess::Version

  MAJOR = 0
  MINOR = 2
  TINY  = 0

  # Returns array representation.
  def self.to_a
    [MAJOR, MINOR, TINY]
  end

  # Short-cut for version string.
  def self.to_s
    to_a.join('.')
  end

  # the version string, for convenient access
  CMess::VERSION = to_s

end
data/lib/cmess.rb ADDED
@@ -0,0 +1,49 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # cmess -- Assist with handling messed up encodings #
5
+ # #
6
+ # Copyright (C) 2007 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ # Bundles several tools that aim at dealing with various problems occurring in
30
+ # the context of character sets and encodings. Currently, there are:
31
+ #
32
+ # guess_encoding:: Simple helper to identify the encoding of a given string.
33
+ # Includes the ability to automatically detect the encoding
34
+ # of an input. (see GuessEncoding)
35
+ # cinderella:: When characters are "double encoded", you can't easily
36
+ # convert them back -- this is where cinderella comes in,
37
+ # sorting the good ones into the pot and the (potentially)
38
+ # bad ones into the crop... (see Cinderella)
39
+ # bconv:: Convert between bibliographic (and other) encodings.
40
+ # (see BConv)
41
+ # decode_entities:: Decode HTML entities in a string. (see DecodeEntities)
42
+
43
module CMess

  # Absolute path of the +data+ directory, a sibling of the directory this
  # file resides in.
  DATA_DIR = File.expand_path('../data', File.dirname(__FILE__))

end
48
+
49
+ require 'cmess/version'