blackwinter-cmess 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. data/COPYING +676 -0
  2. data/ChangeLog +54 -0
  3. data/README +63 -0
  4. data/Rakefile +51 -0
  5. data/bin/bconv +130 -0
  6. data/bin/cinderella +190 -0
  7. data/bin/decode_entities +106 -0
  8. data/bin/guess_encoding +223 -0
  9. data/data/chartab.yaml +26724 -0
  10. data/data/csets/iso_8859-1.yaml +195 -0
  11. data/data/csets/iso_8859-15.yaml +204 -0
  12. data/data/csets/latin1.yaml +195 -0
  13. data/data/csets/unicode/basic_latin.yaml +97 -0
  14. data/data/csets/unicode/cyrillic-supplement.yaml +17 -0
  15. data/data/csets/unicode/cyrillic.yaml +256 -0
  16. data/data/csets/unicode/greek.yaml +129 -0
  17. data/data/csets/unicode/ipa_extensions.yaml +97 -0
  18. data/data/csets/unicode/latin-extended-c.yaml +18 -0
  19. data/data/csets/unicode/latin-extended-d.yaml +3 -0
  20. data/data/csets/unicode/latin_1_supplement.yaml +128 -0
  21. data/data/csets/unicode/latin_extended_a.yaml +129 -0
  22. data/data/csets/unicode/latin_extended_additional.yaml +247 -0
  23. data/data/csets/unicode/latin_extended_b.yaml +209 -0
  24. data/data/csets/unicode/letterlike_symbols.yaml +80 -0
  25. data/data/csets/unicode/spacing_modifier_letters.yaml +81 -0
  26. data/data/csets/utf-8.yaml +1504 -0
  27. data/data/csets/utf8.yaml +1504 -0
  28. data/data/test_chars.yaml +14 -0
  29. data/example/cinderella/crop +127 -0
  30. data/example/cinderella/crop_repaired +127 -0
  31. data/example/cinderella/empty6-slash.txt +1495 -0
  32. data/example/cinderella/empty6-slash_repaired.txt +1495 -0
  33. data/example/cinderella/pot +1368 -0
  34. data/example/guess_encoding/check_results +60 -0
  35. data/example/guess_encoding/de.utf-8.txt +10030 -0
  36. data/example/guess_encoding/en.utf-8.txt +10030 -0
  37. data/example/guess_encoding/fr.utf-8.txt +10030 -0
  38. data/example/guess_encoding/it.utf-8.txt +10030 -0
  39. data/lib/cmess/bconv.rb +169 -0
  40. data/lib/cmess/cinderella.rb +66 -0
  41. data/lib/cmess/cli.rb +120 -0
  42. data/lib/cmess/decode_entities.rb +69 -0
  43. data/lib/cmess/guess_encoding/automatic.rb +343 -0
  44. data/lib/cmess/guess_encoding/encoding.rb +78 -0
  45. data/lib/cmess/guess_encoding/manual.rb +108 -0
  46. data/lib/cmess/guess_encoding.rb +61 -0
  47. data/lib/cmess/version.rb +51 -0
  48. data/lib/cmess.rb +49 -0
  49. metadata +136 -0
@@ -0,0 +1,169 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2008 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'yaml'
30
+ require 'iconv'
31
+ require 'cmess'
32
+
33
+ # Convert between bibliographic (and other) encodings.
34
+
35
class CMess::BConv

  # our version ;-)
  VERSION = '0.0.2'

  # Pivot encoding used whenever one side of the conversion is handled by
  # iconv instead of the chartab.
  INTERMEDIATE_ENCODING = 'utf-8'

  DEFAULT_CHARTAB_FILE = File.join(CMess::DATA_DIR, 'chartab.yaml')

  class << self

    # Returns the sorted, upcased names of the encodings defined by +chartab+
    # (a Hash or the path of a YAML file). The names are taken from the first
    # chartab entry; keys starting with "__" are skipped. An empty chartab
    # defines no encodings and yields an empty Array.
    def encodings(chartab = DEFAULT_CHARTAB_FILE)
      chartab = load_chartab(chartab)

      # Nothing to inspect -- avoid calling +keys+ on nil.
      return [] if chartab.empty?

      chartab[chartab.keys.first].keys.map { |encoding|
        encoding.upcase unless encoding =~ /\A__/
      }.compact.sort
    end

    # Convenience wrapper: builds a converter from +args+ (see ::new) and
    # runs it.
    def convert(*args)
      new(*args).convert
    end

    # Returns +chartab+ as a Hash. A Hash is passed through unchanged, a
    # String is taken as the path of a YAML file to load.
    #
    # Raises a RuntimeError if the file isn't readable and an ArgumentError
    # if +chartab+ is neither a Hash nor a String.
    def load_chartab(chartab)
      case chartab
        when Hash
          chartab
        when String
          raise "chartab file not found: #{chartab}" unless File.readable?(chartab)

          # An empty YAML document doesn't load as a Hash; treat it as an
          # empty chartab rather than failing later on.
          YAML.load_file(chartab) || {}
        else
          raise ArgumentError, "invalid chartab of type #{chartab.class}"
      end
    end

  end

  attr_reader :input, :output, :source_encoding, :target_encoding, :chartab, :encodings

  # +input+ is read via +each_byte+/+each+, +output+ is written via
  # +print+/+puts+. The encoding names are upcased before being stored.
  # +chartab+ is a Hash or the path of a YAML file (see ::load_chartab).
  def initialize(input, output, source_encoding, target_encoding, chartab = DEFAULT_CHARTAB_FILE)
    @input, @output = input, output

    @source_encoding = source_encoding.upcase
    @target_encoding = target_encoding.upcase

    @chartab   = self.class.load_chartab(chartab)
    @encodings = self.class.encodings(@chartab)
  end

  # Is +encoding+ one of the encodings defined by the chartab?
  def encoding?(encoding)
    encodings.include?(encoding)
  end

  # Converts +input+ from +source_encoding+ to +target_encoding+ and writes
  # the result to +output+. Encodings defined by the chartab are mapped via
  # the chartab, all others are delegated to iconv, with
  # INTERMEDIATE_ENCODING as the pivot when only one side is a chartab
  # encoding.
  #
  # Each chartab entry is read as <tt>code => map</tt>, where +code+ is a
  # hexadecimal code point string and +map+ holds, per encoding name, an
  # Array of integers (presumably the byte values in that encoding -- confirm
  # against the chartab data).
  def convert
    if encoding?(source_encoding)
      if encoding?(target_encoding)
        # chartab => chartab
        @charmap = chartab.inject({}) { |hash, (code, map)|
          hash.update(map[source_encoding] => map[target_encoding].pack('U*'))
        }

        input.each_byte { |char|
          output.print map(char)
        }
      else
        # chartab => intermediate encoding => iconv
        iconv = iconv_to

        @charmap = chartab.inject({}) { |hash, (code, map)|
          hash.update(map[source_encoding] => [code.to_i(16)].pack('U*'))
        }

        input.each_byte { |char|
          output.print iconv.iconv(map(char))
        }
      end
    else
      if encoding?(target_encoding)
        # iconv => intermediate encoding => chartab
        iconv = iconv_from

        charmap = chartab.inject({}) { |hash, (code, map)|
          hash.update(code.to_i(16) => map[target_encoding].pack('U*'))
        }

        input.each { |line|
          iconv.iconv(line).unpack('U*').each { |char|
            # Code points without a chartab entry are dropped, just like
            # unmapped bytes are in #map.
            output.print charmap[char] || ''
          }
        }
      else
        # neither encoding is in the chartab: plain iconv
        iconv = iconv_from_to

        input.each { |line|
          output.puts iconv.iconv(line)
        }
      end
    end
  end

  private

  # Returns an Iconv converter from +from+ to +to+ whose +iconv+ method
  # warns and returns an empty string on illegal input instead of raising.
  #
  # Raises an ArgumentError if iconv doesn't know one of the encodings.
  def iconv_from_to(from = source_encoding, to = target_encoding)
    iconv = begin
      Iconv.new(to, from)
    rescue Iconv::InvalidEncoding
      raise ArgumentError, "invalid encoding: source encoding = #{from}, target encoding = #{to}"
    end

    # Singleton override: +super+ is the regular Iconv#iconv.
    def iconv.iconv(*args)
      super
    rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
      warn "ILLEGAL INPUT SEQUENCE: #{err}"; ''
    end

    iconv
  end

  # Converter from +from+ to the intermediate encoding.
  def iconv_from(from = source_encoding)
    iconv_from_to(from, INTERMEDIATE_ENCODING)
  end

  # Converter from the intermediate encoding to +to+.
  def iconv_to(to = target_encoding)
    iconv_from_to(INTERMEDIATE_ENCODING, to)
  end

  # Maps the byte +char+ through +charmap+. If the single byte has no entry,
  # the next input byte is consumed and the two-byte sequence is looked up;
  # if that fails as well, the extra byte is pushed back and an empty string
  # is returned.
  def map(char, charmap = @charmap)
    mapped = charmap[[char]]
    return mapped if mapped

    following = read_byte

    mapped = charmap[[char, following]]
    return mapped if mapped

    unread_byte(following) if following
    ''
  end

  # Reads the next input byte as an Integer (nil at end of input). On
  # Ruby >= 1.9 +getc+ returns a String, which can never match the integer
  # keys of the charmap, so +getbyte+ is preferred where available.
  def read_byte
    input.respond_to?(:getbyte) ? input.getbyte : input.getc
  end

  # Pushes +byte+ back onto the input; counterpart of #read_byte.
  def unread_byte(byte)
    input.respond_to?(:ungetbyte) ? input.ungetbyte(byte) : input.ungetc(byte)
  end

end
@@ -0,0 +1,66 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'iconv'
30
+ require 'cmess'
31
+
32
+ # Find (and possibly repair) doubly encoded characters. Here's how it's done:
33
+ #
34
+ # Treats characters encoded in target encoding as if they were encoded in
35
+ # source encoding, converts them to target encoding and "grep"s for lines
36
+ # containing those doubly encoded characters; if asked to repair doubly
37
+ # encoded characters, substitutes them with their original character.
38
+
39
module CMess::Cinderella

  extend self

  # our version ;-)
  VERSION = '0.0.3'

  DEFAULT_CSETS_DIR = File.join(CMess::DATA_DIR, 'csets')

  # Sorts the lines of +input+ into two outputs: lines containing any of
  # +chars+ in doubly encoded form go to +crop+, all other lines go to +pot+.
  # Either output may be nil (or false), in which case the respective lines
  # are discarded.
  #
  # The doubly encoded form of a character is obtained by converting it from
  # +source_encoding+ to +target_encoding+. With +repair+ set, every doubly
  # encoded character in a line is replaced by its original before the line
  # is written.
  def pick(input, pot, crop, source_encoding, target_encoding, chars, repair = false)
    converter = Iconv.new(target_encoding, source_encoding)

    # doubly encoded form => original character
    originals = {}
    chars.each { |char| originals[converter.iconv(char)] = char }

    pattern = Regexp.union(*originals.keys)

    input.each { |line|
      destination = line =~ pattern ? crop : pot
      next unless destination

      line.gsub!(pattern) { |match| originals[match] } if repair

      destination.puts(line)
    }
  end

end
data/lib/cmess/cli.rb ADDED
@@ -0,0 +1,120 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007-2009 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'tempfile'
30
+
31
+ require 'rubygems'
32
+ require 'nuggets/env/user_encoding'
33
+
34
module CMess::CLI

  # how to split list of arguments
  SPLIT_ARG_LIST_RE = /\s*[,\s]\s*/o

  # Aborts the program unless +file+ is readable.
  def ensure_readable(file)
    abort "Can't find input file: #{file}" unless File.readable?(file)
  end

  # Aborts the program unless +dir+ is a directory.
  def ensure_directory(dir)
    abort "Directory not found: #{dir}" unless File.directory?(dir)
  end

  # Returns an <tt>[input, output]</tt> pair for rewriting +file+ in place:
  # the input is a temporary copy of the file's content, the output is the
  # file itself, opened for writing. The copy is taken first, since opening
  # the file with mode 'w' truncates it.
  def open_file_in_place(file)
    [open_temporary_input(file), File.open(file, 'w')]
  end

  # Opens +file+ with +mode+. The special name '-' selects a standard stream
  # instead: STDIN for 'r', STDOUT for 'w' and STDERR for 'a'.
  #
  # Aborts if a file to be opened with a mode other than 'w' isn't readable;
  # raises an ArgumentError for '-' with any other mode.
  def open_file_or_std(file, mode = 'r')
    if file == '-'
      case mode
        when 'r' then STDIN
        when 'w' then STDOUT
        when 'a' then STDERR
        else raise ArgumentError, "don't know how to handle mode '#{mode}'"
      end
    else
      ensure_readable(file) unless mode == 'w'
      File.open(file, mode)
    end
  end

  # Concatenates the content of +files+ ('-' meaning STDIN) into a temporary
  # file and returns that file, reopened for reading.
  def open_temporary_input(*files)
    temp = Tempfile.new('cmess_cli')

    files.each { |file|
      if file == '-'
        STDIN.each { |line| temp << line }
      else
        ensure_readable(file)
        File.open(file) { |f| f.each { |line| temp << line } }
      end
    }

    # return File, instead of Tempfile
    temp.close
    temp.open
  end

  # Uses the remaining command line arguments as input, unless there are
  # none or <tt>options[:input_set]</tt> is set: a single argument is opened
  # directly, several are concatenated into a temporary file. The result is
  # stored in <tt>options[:input]</tt>.
  def trailing_args_as_input(options)
    unless ARGV.empty? || options[:input_set]
      options[:input] = if ARGV.size == 1
        open_file_or_std(ARGV.first)
      else
        open_temporary_input(*ARGV)
      end
    end
  end

  # Returns the user's encoding as reported by <tt>ENV.user_encoding</tt>.
  # If that is unavailable, returns a placeholder instead: a lambda that
  # aborts with an explanatory message when called and whose +to_s+ is
  # 'NOT FOUND'.
  def determine_system_encoding
    ENV.user_encoding || begin
      dummy = lambda {
        abort <<-EOT
Your system's encoding couldn't be determined automatically -- please specify
it explicitly via the ENCODING environment variable or via the '-t' option.
        EOT
      }

      def dummy.to_s; 'NOT FOUND' end

      dummy
    end
  end

  # Runs the given block and turns any StandardError into a program abort:
  # with <tt>$VERBOSE</tt> set, the message is followed by the full
  # backtrace, one "from" line per frame; otherwise only the capitalized
  # message and the first backtrace entry are printed.
  def cli
    yield
  rescue => err
    # Guard against exceptions that carry no backtrace.
    backtrace = err.backtrace || []

    if $VERBOSE
      # Join explicitly: interpolating the Array itself would print its
      # +inspect+ form on Ruby >= 1.9.
      fromtrace = (backtrace[1..-1] || []).map { |i| "\n from #{i}" }.join

      abort "#{backtrace.first} #{err} (#{err.class})#{fromtrace}"
    else
      abort "#{err.to_s.capitalize} [#{backtrace.first}]"
    end
  end

end
@@ -0,0 +1,69 @@
1
+ #--
2
+ ###############################################################################
3
+ # #
4
+ # A component of cmess, the encoding tool-box. #
5
+ # #
6
+ # Copyright (C) 2007 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50932 Cologne, Germany #
9
+ # #
10
+ # Authors: #
11
+ # Jens Wille <jens.wille@uni-koeln.de> #
12
+ # #
13
+ # cmess is free software; you can redistribute it and/or modify it under the #
14
+ # terms of the GNU General Public License as published by the Free Software #
15
+ # Foundation; either version 3 of the License, or (at your option) any later #
16
+ # version. #
17
+ # #
18
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
+ # details. #
22
+ # #
23
+ # You should have received a copy of the GNU General Public License along #
24
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
+ # #
26
+ ###############################################################################
27
+ #++
28
+
29
+ require 'iconv'
30
+ require 'cmess'
31
+
32
+ require 'rubygems'
33
+ require 'htmlentities/string'
34
+
35
module CMess::DecodeEntities

  extend self

  # our version ;-)
  VERSION = '0.0.2'

  # HTMLEntities requires UTF-8
  INTERMEDIATE_ENCODING = 'utf-8'

  # Stand-in for an Iconv converter that hands its argument back unchanged;
  # used where no conversion to/from the intermediate encoding is needed.
  ICONV_DUMMY = Object.new

  def ICONV_DUMMY.iconv(string)
    string
  end

  # Reads +input+ line by line, decodes the entities in each line (via
  # String#decode_entities) and writes the result to +output+.
  #
  # Lines are converted from +source_encoding+ to the intermediate encoding
  # before decoding and to +target_encoding+ afterwards. +target_encoding+
  # defaults to +source_encoding+. A conversion step is skipped when the
  # encoding name equals INTERMEDIATE_ENCODING exactly.
  def decode(input, output, source_encoding, target_encoding = nil)
    target_encoding ||= source_encoding

    iconv_in = if source_encoding == INTERMEDIATE_ENCODING
      ICONV_DUMMY
    else
      Iconv.new(INTERMEDIATE_ENCODING, source_encoding)
    end

    iconv_out = if target_encoding == INTERMEDIATE_ENCODING
      ICONV_DUMMY
    else
      Iconv.new(target_encoding, INTERMEDIATE_ENCODING)
    end

    input.each { |line|
      intermediate = iconv_in.iconv(line)
      output.puts iconv_out.iconv(intermediate.decode_entities)
    }
  end

end