blackwinter-cmess 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. data/COPYING +676 -0
  2. data/ChangeLog +54 -0
  3. data/README +63 -0
  4. data/Rakefile +51 -0
  5. data/bin/bconv +130 -0
  6. data/bin/cinderella +190 -0
  7. data/bin/decode_entities +106 -0
  8. data/bin/guess_encoding +223 -0
  9. data/data/chartab.yaml +26724 -0
  10. data/data/csets/iso_8859-1.yaml +195 -0
  11. data/data/csets/iso_8859-15.yaml +204 -0
  12. data/data/csets/latin1.yaml +195 -0
  13. data/data/csets/unicode/basic_latin.yaml +97 -0
  14. data/data/csets/unicode/cyrillic-supplement.yaml +17 -0
  15. data/data/csets/unicode/cyrillic.yaml +256 -0
  16. data/data/csets/unicode/greek.yaml +129 -0
  17. data/data/csets/unicode/ipa_extensions.yaml +97 -0
  18. data/data/csets/unicode/latin-extended-c.yaml +18 -0
  19. data/data/csets/unicode/latin-extended-d.yaml +3 -0
  20. data/data/csets/unicode/latin_1_supplement.yaml +128 -0
  21. data/data/csets/unicode/latin_extended_a.yaml +129 -0
  22. data/data/csets/unicode/latin_extended_additional.yaml +247 -0
  23. data/data/csets/unicode/latin_extended_b.yaml +209 -0
  24. data/data/csets/unicode/letterlike_symbols.yaml +80 -0
  25. data/data/csets/unicode/spacing_modifier_letters.yaml +81 -0
  26. data/data/csets/utf-8.yaml +1504 -0
  27. data/data/csets/utf8.yaml +1504 -0
  28. data/data/test_chars.yaml +14 -0
  29. data/example/cinderella/crop +127 -0
  30. data/example/cinderella/crop_repaired +127 -0
  31. data/example/cinderella/empty6-slash.txt +1495 -0
  32. data/example/cinderella/empty6-slash_repaired.txt +1495 -0
  33. data/example/cinderella/pot +1368 -0
  34. data/example/guess_encoding/check_results +60 -0
  35. data/example/guess_encoding/de.utf-8.txt +10030 -0
  36. data/example/guess_encoding/en.utf-8.txt +10030 -0
  37. data/example/guess_encoding/fr.utf-8.txt +10030 -0
  38. data/example/guess_encoding/it.utf-8.txt +10030 -0
  39. data/lib/cmess/bconv.rb +169 -0
  40. data/lib/cmess/cinderella.rb +66 -0
  41. data/lib/cmess/cli.rb +120 -0
  42. data/lib/cmess/decode_entities.rb +69 -0
  43. data/lib/cmess/guess_encoding/automatic.rb +343 -0
  44. data/lib/cmess/guess_encoding/encoding.rb +78 -0
  45. data/lib/cmess/guess_encoding/manual.rb +108 -0
  46. data/lib/cmess/guess_encoding.rb +61 -0
  47. data/lib/cmess/version.rb +51 -0
  48. data/lib/cmess.rb +49 -0
  49. metadata +136 -0
@@ -0,0 +1,169 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2008 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'yaml'
30
+ require 'iconv'
31
+ require 'cmess'
32
+
33
+ # Convert between bibliographic (and other) encodings.
34
+
35
class CMess::BConv

  # our version ;-)
  VERSION = '0.0.2'

  # Pivot encoding used whenever only one side of the conversion is
  # covered by the chartab and the other side has to go through Iconv.
  INTERMEDIATE_ENCODING = 'utf-8'

  DEFAULT_CHARTAB_FILE = File.join(CMess::DATA_DIR, 'chartab.yaml')

  class << self

    # Returns the sorted, upper-cased names of the encodings listed in the
    # first entry of +chartab+ (a Hash or the path of a YAML file). Names
    # starting with "__" are skipped. An empty chartab yields an empty list.
    def encodings(chartab = DEFAULT_CHARTAB_FILE)
      chartab = load_chartab(chartab)

      # Nothing to inspect; without this guard the lookup below would call
      # +keys+ on nil.
      return [] if chartab.empty?

      chartab[chartab.keys.first].keys.map { |encoding|
        encoding.upcase unless encoding =~ /\A__/
      }.compact.sort
    end

    # Shortcut for <tt>new(*args).convert</tt>.
    def convert(*args)
      new(*args).convert
    end

    # Resolves +chartab+ to a Hash: a Hash is returned as is, a String is
    # taken to be the path of a YAML file and loaded. Raises a RuntimeError
    # if that file is not readable and an ArgumentError for any other type.
    def load_chartab(chartab)
      case chartab
        when Hash
          chartab
        when String
          raise "chartab file not found: #{chartab}" unless File.readable?(chartab)
          YAML.load_file(chartab)
        else
          raise ArgumentError, "invalid chartab of type #{chartab.class}"
      end
    end

  end

  attr_reader :input, :output, :source_encoding, :target_encoding, :chartab, :encodings

  # +input+ is read via +each_byte+/+getc+/+each+ and +output+ is written
  # via +print+/+puts+ (see #convert). Both encoding names are upper-cased;
  # +chartab+ is resolved through BConv.load_chartab.
  def initialize(input, output, source_encoding, target_encoding, chartab = DEFAULT_CHARTAB_FILE)
    @input, @output = input, output

    @source_encoding = source_encoding.upcase
    @target_encoding = target_encoding.upcase

    @chartab   = self.class.load_chartab(chartab)
    @encodings = self.class.encodings(@chartab)
  end

  # Is +encoding+ one of the encodings covered by the chartab?
  def encoding?(encoding)
    encodings.include?(encoding)
  end

  # Converts +input+ from the source to the target encoding and writes the
  # result to +output+. Four cases, depending on which of the two encodings
  # the chartab covers:
  #
  # * both:        translate byte sequences directly through the chartab
  # * source only: chartab to INTERMEDIATE_ENCODING, then Iconv to target
  # * target only: Iconv to INTERMEDIATE_ENCODING, then chartab to target
  # * neither:     plain Iconv, line by line
  def convert
    if encoding?(source_encoding)
      if encoding?(target_encoding)
        # source byte sequence => packed target sequence
        @charmap = chartab.inject({}) { |hash, (code, map)|
          hash.update(map[source_encoding] => map[target_encoding].pack('U*'))
        }

        input.each_byte { |char|
          output.print map(char)
        }
      else
        iconv = iconv_to

        # source byte sequence => character for the (hexadecimal) chartab
        # key, packed as INTERMEDIATE_ENCODING
        @charmap = chartab.inject({}) { |hash, (code, map)|
          hash.update(map[source_encoding] => [code.to_i(16)].pack('U*'))
        }

        input.each_byte { |char|
          output.print iconv.iconv(map(char))
        }
      end
    else
      if encoding?(target_encoding)
        iconv = iconv_from

        # code point => packed target sequence
        charmap = chartab.inject({}) { |hash, (code, map)|
          hash.update(code.to_i(16) => map[target_encoding].pack('U*'))
        }

        input.each { |line|
          iconv.iconv(line).unpack('U*').each { |char|
            # Code points missing from the chartab contribute nothing, just
            # like in #map. Passing the nil on to +print+ would make Ruby 1.8
            # write the literal text "nil".
            output.print charmap[char] || ''
          }
        }
      else
        iconv = iconv_from_to

        input.each { |line|
          output.puts iconv.iconv(line)
        }
      end
    end
  end

  private

  # Builds an Iconv converter from +from+ to +to+ whose +iconv+ method warns
  # about illegal input and returns an empty string instead of raising.
  # Raises an ArgumentError if Iconv rejects one of the encodings.
  def iconv_from_to(from = source_encoding, to = target_encoding)
    iconv = begin
      Iconv.new(to, from)
    rescue Iconv::InvalidEncoding
      raise ArgumentError, "invalid encoding: source encoding = #{from}, target encoding = #{to}"
    end

    # Override on this one instance only; +super+ is the regular Iconv#iconv.
    def iconv.iconv(*args)
      super
    rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
      warn "ILLEGAL INPUT SEQUENCE: #{err}"; ''
    end

    iconv
  end

  # Converter from +from+ to INTERMEDIATE_ENCODING.
  def iconv_from(from = source_encoding)
    iconv_from_to(from, INTERMEDIATE_ENCODING)
  end

  # Converter from INTERMEDIATE_ENCODING to +to+.
  def iconv_to(to = target_encoding)
    iconv_from_to(INTERMEDIATE_ENCODING, to)
  end

  # Looks up the replacement for byte +char+ in +charmap+. If the single
  # byte is unknown, the next input byte is consumed and the two-byte
  # sequence is tried; if that is unknown as well, the extra byte is pushed
  # back and an empty string is returned.
  #
  # NOTE(review): the two-byte lookup relies on +input.getc+ returning the
  # same kind of value +each_byte+ yields (an integer, as on Ruby 1.8);
  # confirm before running on a Ruby where +getc+ returns a String.
  def map(char, charmap = @charmap)
    unless map = charmap[[char]]
      unless map = charmap[[char, c = input.getc]]
        input.ungetc(c) if c
        map = ''
      end
    end

    map
  end

end
@@ -0,0 +1,66 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'iconv'
30
+ require 'cmess'
31
+
32
+ # Find (and possibly repair) doubly encoded characters. Here's how it's done:
33
+ #
34
+ # Treats characters encoded in target encoding as if they were encoded in
35
+ # source encoding, converts them to target encoding and "grep"s for lines
36
+ # containing those doubly encoded characters; if asked to repair doubly
37
+ # encoded characters, substitutes them with their original character.
38
+
39
module CMess::Cinderella

  extend self

  # our version ;-)
  VERSION = '0.0.3'

  DEFAULT_CSETS_DIR = File.join(CMess::DATA_DIR, 'csets')

  # Sorts the lines of +input+ into two outputs: lines containing a doubly
  # encoded form of any of +chars+ go to +crop+, all others go to +pot+.
  # Either output may be nil/false, in which case its lines are dropped.
  # With +repair+ set, doubly encoded characters are replaced by their
  # original character before the line is written.
  def pick(input, pot, crop, source_encoding, target_encoding, chars, repair = false)
    converter = Iconv.new(target_encoding, source_encoding)

    # doubly encoded form => original character
    originals = {}
    chars.each { |char| originals[converter.iconv(char)] = char }

    pattern = Regexp.union(*originals.keys)

    input.each { |line|
      sink = line =~ pattern ? crop : pot
      next unless sink

      line.gsub!(pattern) { |match| originals[match] } if repair
      sink.puts(line)
    }
  end

end
data/lib/cmess/cli.rb ADDED
@@ -0,0 +1,120 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007-2009 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'tempfile'
30
+
31
+ require 'rubygems'
32
+ require 'nuggets/env/user_encoding'
33
+
34
# Helper methods shared by the command line tools: argument checks, opening
# input/output files (or the standard streams), and error reporting.
module CMess::CLI

  # how to split list of arguments
  SPLIT_ARG_LIST_RE = /\s*[,\s]\s*/o

  # Aborts the program unless +file+ is readable.
  def ensure_readable(file)
    abort "Can't find input file: #{file}" unless File.readable?(file)
  end

  # Aborts the program unless +dir+ is a directory.
  def ensure_directory(dir)
    abort "Directory not found: #{dir}" unless File.directory?(dir)
  end

  # Returns an [input, output] pair for rewriting +file+ in place: the input
  # is a temporary copy of the current content, the output is +file+ itself,
  # opened for writing. The copy is taken first, before +file+ is truncated.
  def open_file_in_place(file)
    [open_temporary_input(file), File.open(file, 'w')]
  end

  # Opens +file+ in +mode+. The special name '-' selects a standard stream
  # instead: STDIN for 'r', STDOUT for 'w' and STDERR for 'a'; any other
  # mode raises an ArgumentError.
  def open_file_or_std(file, mode = 'r')
    if file == '-'
      case mode
        when 'r' then STDIN
        when 'w' then STDOUT
        when 'a' then STDERR
        else raise ArgumentError, "don't know how to handle mode '#{mode}'"
      end
    else
      # Only read modes need an existing, readable file. Write and append
      # modes ('w', 'w+', 'wb', 'a', ...) may legitimately create it.
      ensure_readable(file) if mode.to_s =~ /\Ar/
      File.open(file, mode)
    end
  end

  # Concatenates all +files+ ('-' meaning STDIN) into a temporary file and
  # returns that file, reopened for reading from the start.
  def open_temporary_input(*files)
    temp = Tempfile.new('cmess_cli')

    files.each { |file|
      if file == '-'
        STDIN.each { |line| temp << line }
      else
        ensure_readable(file)
        File.open(file) { |f| f.each { |line| temp << line } }
      end
    }

    # return File, instead of Tempfile
    temp.close
    temp.open
  end

  # Uses the remaining command line arguments as input, unless there are
  # none or <tt>options[:input_set]</tt> is set: a single argument is
  # opened directly, several are concatenated into a temporary file.
  def trailing_args_as_input(options)
    unless ARGV.empty? || options[:input_set]
      options[:input] = if ARGV.size == 1
        open_file_or_std(ARGV.first)
      else
        open_temporary_input(*ARGV)
      end
    end
  end

  # Returns <tt>ENV.user_encoding</tt> if that is set. Otherwise returns a
  # placeholder that reads 'NOT FOUND' when converted to a string and
  # aborts with an explanatory message when called.
  def determine_system_encoding
    ENV.user_encoding || begin
      dummy = lambda {
        abort <<-EOT
Your system's encoding couldn't be determined automatically -- please specify
it explicitly via the ENCODING environment variable or via the '-t' option.
        EOT
      }

      def dummy.to_s; 'NOT FOUND' end

      dummy
    end
  end

  # Runs the given block and turns any StandardError into a program abort:
  # with the full backtrace if $VERBOSE is set, with a one-line message
  # otherwise.
  def cli
    yield
  rescue => err
    if $VERBOSE
      backtrace = err.backtrace

      # Join explicitly: interpolating the Array itself only yields the
      # concatenated lines on Ruby 1.8, later versions render it via inspect.
      fromtrace = backtrace[1..-1].map { |i| "\n from #{i}" }.join

      abort "#{backtrace.first} #{err} (#{err.class})#{fromtrace}"
    else
      abort "#{err.to_s.capitalize} [#{err.backtrace.first}]"
    end
  end

end
@@ -0,0 +1,69 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'iconv'
30
+ require 'cmess'
31
+
32
+ require 'rubygems'
33
+ require 'htmlentities/string'
34
+
35
module CMess::DecodeEntities

  extend self

  # our version ;-)
  VERSION = '0.0.2'

  # HTMLEntities requires UTF-8
  INTERMEDIATE_ENCODING = 'utf-8'

  # Stand-in for an Iconv converter when no conversion is needed.
  ICONV_DUMMY = Object.new

  # Hands the string back unchanged.
  def ICONV_DUMMY.iconv(string)
    string
  end

  # Decodes the entities in each line of +input+ and writes the result to
  # +output+. Lines are converted from +source_encoding+ to
  # INTERMEDIATE_ENCODING for decoding and then on to +target_encoding+,
  # which defaults to +source_encoding+. A conversion step is skipped when
  # the encoding in question already equals INTERMEDIATE_ENCODING.
  def decode(input, output, source_encoding, target_encoding = nil)
    target_encoding ||= source_encoding

    iconv_in = if source_encoding == INTERMEDIATE_ENCODING
      ICONV_DUMMY
    else
      Iconv.new(INTERMEDIATE_ENCODING, source_encoding)
    end

    iconv_out = if target_encoding == INTERMEDIATE_ENCODING
      ICONV_DUMMY
    else
      Iconv.new(target_encoding, INTERMEDIATE_ENCODING)
    end

    input.each { |line|
      decoded = iconv_in.iconv(line).decode_entities
      output.puts iconv_out.iconv(decoded)
    }
  end

end
+ end