blackwinter-cmess 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +676 -0
- data/ChangeLog +54 -0
- data/README +63 -0
- data/Rakefile +51 -0
- data/bin/bconv +130 -0
- data/bin/cinderella +190 -0
- data/bin/decode_entities +106 -0
- data/bin/guess_encoding +223 -0
- data/data/chartab.yaml +26724 -0
- data/data/csets/iso_8859-1.yaml +195 -0
- data/data/csets/iso_8859-15.yaml +204 -0
- data/data/csets/latin1.yaml +195 -0
- data/data/csets/unicode/basic_latin.yaml +97 -0
- data/data/csets/unicode/cyrillic-supplement.yaml +17 -0
- data/data/csets/unicode/cyrillic.yaml +256 -0
- data/data/csets/unicode/greek.yaml +129 -0
- data/data/csets/unicode/ipa_extensions.yaml +97 -0
- data/data/csets/unicode/latin-extended-c.yaml +18 -0
- data/data/csets/unicode/latin-extended-d.yaml +3 -0
- data/data/csets/unicode/latin_1_supplement.yaml +128 -0
- data/data/csets/unicode/latin_extended_a.yaml +129 -0
- data/data/csets/unicode/latin_extended_additional.yaml +247 -0
- data/data/csets/unicode/latin_extended_b.yaml +209 -0
- data/data/csets/unicode/letterlike_symbols.yaml +80 -0
- data/data/csets/unicode/spacing_modifier_letters.yaml +81 -0
- data/data/csets/utf-8.yaml +1504 -0
- data/data/csets/utf8.yaml +1504 -0
- data/data/test_chars.yaml +14 -0
- data/example/cinderella/crop +127 -0
- data/example/cinderella/crop_repaired +127 -0
- data/example/cinderella/empty6-slash.txt +1495 -0
- data/example/cinderella/empty6-slash_repaired.txt +1495 -0
- data/example/cinderella/pot +1368 -0
- data/example/guess_encoding/check_results +60 -0
- data/example/guess_encoding/de.utf-8.txt +10030 -0
- data/example/guess_encoding/en.utf-8.txt +10030 -0
- data/example/guess_encoding/fr.utf-8.txt +10030 -0
- data/example/guess_encoding/it.utf-8.txt +10030 -0
- data/lib/cmess/bconv.rb +169 -0
- data/lib/cmess/cinderella.rb +66 -0
- data/lib/cmess/cli.rb +120 -0
- data/lib/cmess/decode_entities.rb +69 -0
- data/lib/cmess/guess_encoding/automatic.rb +343 -0
- data/lib/cmess/guess_encoding/encoding.rb +78 -0
- data/lib/cmess/guess_encoding/manual.rb +108 -0
- data/lib/cmess/guess_encoding.rb +61 -0
- data/lib/cmess/version.rb +51 -0
- data/lib/cmess.rb +49 -0
- metadata +136 -0
data/lib/cmess/bconv.rb
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'yaml'
|
30
|
+
require 'iconv'
|
31
|
+
require 'cmess'
|
32
|
+
|
33
|
+
# Convert between bibliographic (and other) encodings.
#
# Encodings that are listed in the character table ("chartab") are translated
# through that table; every other encoding is delegated to Iconv. When only
# one side of the conversion is a chartab encoding, INTERMEDIATE_ENCODING is
# used as the pivot between the table and Iconv.
#
# The chartab is expected to be a Hash whose keys are hexadecimal code strings
# (they are parsed with <tt>to_i(16)</tt>) and whose values map encoding names
# to arrays of integers (they are looked up byte-wise and packed with 'U*').

class CMess::BConv

  # our version ;-)
  VERSION = '0.0.2'

  # Pivot encoding between chartab encodings and Iconv encodings.
  INTERMEDIATE_ENCODING = 'utf-8'

  # Character table used when none is given explicitly.
  DEFAULT_CHARTAB_FILE = File.join(CMess::DATA_DIR, 'chartab.yaml')

  class << self

    # Returns the sorted, upper-cased names of the encodings found in the
    # first entry of +chartab+ (a Hash or the path of a YAML file). Names
    # starting with two underscores are skipped. An empty chartab yields an
    # empty list instead of failing on the missing first entry.
    def encodings(chartab = DEFAULT_CHARTAB_FILE)
      chartab = load_chartab(chartab)
      return [] if chartab.empty?

      names = chartab[chartab.keys.first].keys
      names.reject { |name| name =~ /\A__/ }.map { |name| name.upcase }.sort
    end

    # Shortcut for <tt>new(*args).convert</tt>.
    def convert(*args)
      new(*args).convert
    end

    # Returns +chartab+ itself if it is a Hash, or the content of the YAML
    # file it names if it is a String.
    #
    # Raises a RuntimeError if the file is not readable and an ArgumentError
    # if +chartab+ is neither a Hash nor a String.
    def load_chartab(chartab)
      case chartab
      when Hash
        chartab
      when String
        raise "chartab file not found: #{chartab}" unless File.readable?(chartab)
        YAML.load_file(chartab)
      else
        raise ArgumentError, "invalid chartab of type #{chartab.class}"
      end
    end

  end

  attr_reader :input, :output, :source_encoding, :target_encoding, :chartab, :encodings

  # +input+ is read with +each_byte+/+each+ (and byte-wise look-ahead),
  # +output+ is written with +print+/+puts+. Both encoding names are
  # upper-cased; +chartab+ is a Hash or the path of a YAML file.
  def initialize(input, output, source_encoding, target_encoding, chartab = DEFAULT_CHARTAB_FILE)
    @input, @output = input, output

    @source_encoding = source_encoding.upcase
    @target_encoding = target_encoding.upcase

    @chartab   = self.class.load_chartab(chartab)
    @encodings = self.class.encodings(@chartab)
  end

  # Is +encoding+ one of the encodings handled by the chartab?
  def encoding?(encoding)
    encodings.include?(encoding)
  end

  # Converts +input+ from source encoding to target encoding and writes the
  # result to +output+, choosing the strategy by which of the two encodings
  # the chartab knows about.
  def convert
    if encoding?(source_encoding)
      if encoding?(target_encoding)
        convert_chartab_to_chartab
      else
        convert_chartab_to_iconv
      end
    else
      if encoding?(target_encoding)
        convert_iconv_to_chartab
      else
        convert_iconv_to_iconv
      end
    end
  end

  private

  # Both encodings are in the chartab: translate byte sequences directly.
  def convert_chartab_to_chartab
    @charmap = chartab.inject({}) { |hash, (_code, row)|
      hash.update(row[source_encoding] => row[target_encoding].pack('U*'))
    }

    input.each_byte { |char|
      output.print map(char)
    }
  end

  # Only the source encoding is in the chartab: translate byte sequences to
  # the pivot encoding, then let Iconv produce the target encoding.
  def convert_chartab_to_iconv
    # build the converter first so that an invalid encoding fails early
    iconv = iconv_to

    @charmap = chartab.inject({}) { |hash, (code, row)|
      hash.update(row[source_encoding] => [code.to_i(16)].pack('U*'))
    }

    input.each_byte { |char|
      output.print iconv.iconv(map(char))
    }
  end

  # Only the target encoding is in the chartab: let Iconv produce the pivot
  # encoding, then translate each code point through the chartab.
  def convert_iconv_to_chartab
    iconv = iconv_from

    charmap = chartab.inject({}) { |hash, (code, row)|
      hash.update(code.to_i(16) => row[target_encoding].pack('U*'))
    }

    input.each { |line|
      iconv.iconv(line).unpack('U*').each { |char|
        output.print charmap[char]
      }
    }
  end

  # Neither encoding is in the chartab: plain line-wise Iconv conversion.
  def convert_iconv_to_iconv
    iconv = iconv_from_to

    input.each { |line|
      output.puts iconv.iconv(line)
    }
  end

  # Returns an Iconv converter from +from+ to +to+ whose +iconv+ method warns
  # about illegal input and returns an empty string instead of raising.
  #
  # Raises an ArgumentError if Iconv rejects one of the encodings.
  def iconv_from_to(from = source_encoding, to = target_encoding)
    iconv = begin
      Iconv.new(to, from)
    rescue Iconv::InvalidEncoding
      raise ArgumentError, "invalid encoding: source encoding = #{from}, target encoding = #{to}"
    end

    # singleton override: degrade conversion errors to a warning
    def iconv.iconv(*args)
      super
    rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
      warn "ILLEGAL INPUT SEQUENCE: #{err}"; ''
    end

    iconv
  end

  # Converter from +from+ to the pivot encoding.
  def iconv_from(from = source_encoding)
    iconv_from_to(from, INTERMEDIATE_ENCODING)
  end

  # Converter from the pivot encoding to +to+.
  def iconv_to(to = target_encoding)
    iconv_from_to(INTERMEDIATE_ENCODING, to)
  end

  # Looks up the replacement for byte +char+ in +charmap+. If the single byte
  # has no entry, the next input byte is consumed and the two-byte sequence
  # is tried; if that fails as well, the look-ahead byte is pushed back and
  # an empty string is returned.
  def map(char, charmap = @charmap)
    single = charmap[[char]]
    return single if single

    following = read_byte

    double = charmap[[char, following]]
    return double if double

    unread_byte(following) if following
    ''
  end

  # Reads the look-ahead as an Integer byte. +each_byte+ yields Integers, but
  # +getc+ returns a String on Ruby >= 1.9, which would never match a chartab
  # key; +getbyte+ is therefore preferred where it exists.
  def read_byte
    input.respond_to?(:getbyte) ? input.getbyte : input.getc
  end

  # Pushes +byte+ back onto the input (counterpart of +read_byte+).
  def unread_byte(byte)
    input.respond_to?(:ungetbyte) ? input.ungetbyte(byte) : input.ungetc(byte)
  end

end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'iconv'
|
30
|
+
require 'cmess'
|
31
|
+
|
32
|
+
# Find (and possibly repair) doubly encoded characters. Here's how it's done:
#
# Each candidate character (already in target encoding) is run through a
# source-to-target conversion once more, which yields its doubly encoded
# form. Input lines are then "grep"ed for those forms; when asked to repair,
# every doubly encoded form is replaced by the character it came from.

module CMess::Cinderella

  extend self

  # our version ;-)
  VERSION = '0.0.3'

  # Directory holding the character set definitions shipped with cmess.
  DEFAULT_CSETS_DIR = File.join(CMess::DATA_DIR, 'csets')

  # Sorts the lines of +input+: lines containing a doubly encoded form of
  # one of +chars+ are written to +crop+, all others to +pot+. Either sink
  # may be nil/false, in which case the corresponding lines are dropped.
  # If +repair+ is set, doubly encoded forms are replaced (in place) by
  # their original character before the line is written.
  def pick(input, pot, crop, source_encoding, target_encoding, chars, repair = false)
    converter = Iconv.new(target_encoding, source_encoding)

    # doubly encoded form => original character
    originals = {}
    chars.each { |char| originals[converter.iconv(char)] = char }

    pattern = Regexp.union(*originals.keys)

    input.each { |line|
      sink = line =~ pattern ? crop : pot
      next unless sink

      line.gsub!(pattern) { |match| originals[match] } if repair

      sink.puts(line)
    }
  end

end
|
data/lib/cmess/cli.rb
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007-2009 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'tempfile'
|
30
|
+
|
31
|
+
require 'rubygems'
|
32
|
+
require 'nuggets/env/user_encoding'
|
33
|
+
|
34
|
+
# Helpers shared by the cmess command line tools: input/output handling,
# system encoding detection and top-level error reporting.

module CMess::CLI

  # how to split list of arguments
  SPLIT_ARG_LIST_RE = /\s*[,\s]\s*/o

  # Aborts the program unless +file+ is readable.
  def ensure_readable(file)
    abort "Can't find input file: #{file}" unless File.readable?(file)
  end

  # Aborts the program unless +dir+ is an existing directory.
  def ensure_directory(dir)
    abort "Directory not found: #{dir}" unless File.directory?(dir)
  end

  # Returns an [input, output] pair for rewriting +file+ in place: the input
  # is a temporary copy of the file, the output is the file itself opened
  # for writing. The copy is made first, because opening with 'w' truncates.
  def open_file_in_place(file)
    [open_temporary_input(file), File.open(file, 'w')]
  end

  # Opens +file+ with +mode+. The special name '-' selects a standard
  # stream instead: STDIN for 'r', STDOUT for 'w', STDERR for 'a'.
  #
  # Raises an ArgumentError for '-' combined with any other mode; aborts if
  # a regular file is not readable (unless it is opened with mode 'w').
  def open_file_or_std(file, mode = 'r')
    if file == '-'
      case mode
      when 'r' then STDIN
      when 'w' then STDOUT
      when 'a' then STDERR
      else raise ArgumentError, "don't know how to handle mode '#{mode}'"
      end
    else
      ensure_readable(file) unless mode == 'w'
      File.open(file, mode)
    end
  end

  # Concatenates +files+ ('-' meaning STDIN) into a temporary file and
  # returns that file re-opened, positioned at its start.
  def open_temporary_input(*files)
    temp = Tempfile.new('cmess_cli')

    files.each { |file|
      if file == '-'
        STDIN.each { |line| temp << line }
      else
        ensure_readable(file)
        File.open(file) { |f| f.each { |line| temp << line } }
      end
    }

    # return File, instead of Tempfile
    temp.close
    temp.open
  end

  # Uses the remaining command line arguments as input, unless an input has
  # been set explicitly (<tt>options[:input_set]</tt>): a single argument is
  # opened directly, several arguments are concatenated into a temporary
  # file.
  def trailing_args_as_input(options)
    unless ARGV.empty? || options[:input_set]
      options[:input] = if ARGV.size == 1
        open_file_or_std(ARGV.first)
      else
        open_temporary_input(*ARGV)
      end
    end
  end

  # Returns <tt>ENV.user_encoding</tt> if it is set. Otherwise returns a
  # placeholder that prints as 'NOT FOUND' and aborts with an explanatory
  # message once it is called.
  def determine_system_encoding
    ENV.user_encoding || begin
      dummy = lambda {
        abort <<-EOT
Your system's encoding couldn't be determined automatically -- please specify
it explicitly via the ENCODING environment variable or via the '-t' option.
        EOT
      }

      def dummy.to_s; 'NOT FOUND' end

      dummy
    end
  end

  # Runs the given block and turns any StandardError into a program abort:
  # with the full backtrace if $VERBOSE is set, with a one-line message
  # otherwise.
  def cli
    yield
  rescue => err
    if $VERBOSE
      backtrace = err.backtrace

      # Join explicitly: interpolating the Array itself prints its inspect
      # form on Ruby >= 1.9. The slice is nil for an empty backtrace.
      fromtrace = (backtrace[1..-1] || []).map { |i| "\n from #{i}" }.join

      abort "#{backtrace.first} #{err} (#{err.class})#{fromtrace}"
    else
      abort "#{err.to_s.capitalize} [#{err.backtrace.first}]"
    end
  end

end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'iconv'
|
30
|
+
require 'cmess'
|
31
|
+
|
32
|
+
require 'rubygems'
|
33
|
+
require 'htmlentities/string'
|
34
|
+
|
35
|
+
# Decode HTML entities in a stream, converting to and from the encoding the
# entity decoder works with where necessary.

module CMess::DecodeEntities

  extend self

  # our version ;-)
  VERSION = '0.0.2'

  # HTMLEntities requires UTF-8
  INTERMEDIATE_ENCODING = 'utf-8'

  # Stand-in converter for when no conversion is needed: hands the string
  # back untouched.
  ICONV_DUMMY = Object.new

  def ICONV_DUMMY.iconv(string)
    string
  end

  # Reads +input+ line by line, decodes the entities in each line and writes
  # the result to +output+. Lines are converted from +source_encoding+ to
  # INTERMEDIATE_ENCODING before decoding and to +target_encoding+ (defaults
  # to +source_encoding+) afterwards; either conversion is skipped when the
  # encoding already equals INTERMEDIATE_ENCODING.
  def decode(input, output, source_encoding, target_encoding = nil)
    target_encoding ||= source_encoding

    iconv_in = if source_encoding == INTERMEDIATE_ENCODING
      ICONV_DUMMY
    else
      Iconv.new(INTERMEDIATE_ENCODING, source_encoding)
    end

    iconv_out = if target_encoding == INTERMEDIATE_ENCODING
      ICONV_DUMMY
    else
      Iconv.new(target_encoding, INTERMEDIATE_ENCODING)
    end

    input.each { |line|
      decoded = iconv_in.iconv(line).decode_entities
      output.puts iconv_out.iconv(decoded)
    }
  end

end
|