blackwinter-cmess 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +676 -0
- data/ChangeLog +54 -0
- data/README +63 -0
- data/Rakefile +51 -0
- data/bin/bconv +130 -0
- data/bin/cinderella +190 -0
- data/bin/decode_entities +106 -0
- data/bin/guess_encoding +223 -0
- data/data/chartab.yaml +26724 -0
- data/data/csets/iso_8859-1.yaml +195 -0
- data/data/csets/iso_8859-15.yaml +204 -0
- data/data/csets/latin1.yaml +195 -0
- data/data/csets/unicode/basic_latin.yaml +97 -0
- data/data/csets/unicode/cyrillic-supplement.yaml +17 -0
- data/data/csets/unicode/cyrillic.yaml +256 -0
- data/data/csets/unicode/greek.yaml +129 -0
- data/data/csets/unicode/ipa_extensions.yaml +97 -0
- data/data/csets/unicode/latin-extended-c.yaml +18 -0
- data/data/csets/unicode/latin-extended-d.yaml +3 -0
- data/data/csets/unicode/latin_1_supplement.yaml +128 -0
- data/data/csets/unicode/latin_extended_a.yaml +129 -0
- data/data/csets/unicode/latin_extended_additional.yaml +247 -0
- data/data/csets/unicode/latin_extended_b.yaml +209 -0
- data/data/csets/unicode/letterlike_symbols.yaml +80 -0
- data/data/csets/unicode/spacing_modifier_letters.yaml +81 -0
- data/data/csets/utf-8.yaml +1504 -0
- data/data/csets/utf8.yaml +1504 -0
- data/data/test_chars.yaml +14 -0
- data/example/cinderella/crop +127 -0
- data/example/cinderella/crop_repaired +127 -0
- data/example/cinderella/empty6-slash.txt +1495 -0
- data/example/cinderella/empty6-slash_repaired.txt +1495 -0
- data/example/cinderella/pot +1368 -0
- data/example/guess_encoding/check_results +60 -0
- data/example/guess_encoding/de.utf-8.txt +10030 -0
- data/example/guess_encoding/en.utf-8.txt +10030 -0
- data/example/guess_encoding/fr.utf-8.txt +10030 -0
- data/example/guess_encoding/it.utf-8.txt +10030 -0
- data/lib/cmess/bconv.rb +169 -0
- data/lib/cmess/cinderella.rb +66 -0
- data/lib/cmess/cli.rb +120 -0
- data/lib/cmess/decode_entities.rb +69 -0
- data/lib/cmess/guess_encoding/automatic.rb +343 -0
- data/lib/cmess/guess_encoding/encoding.rb +78 -0
- data/lib/cmess/guess_encoding/manual.rb +108 -0
- data/lib/cmess/guess_encoding.rb +61 -0
- data/lib/cmess/version.rb +51 -0
- data/lib/cmess.rb +49 -0
- metadata +136 -0
data/lib/cmess/bconv.rb
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2008 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'yaml'
|
30
|
+
require 'iconv'
|
31
|
+
require 'cmess'
|
32
|
+
|
33
|
+
# Convert between bibliographic (and other) encodings.
#
# Encodings listed in the character table (chartab) are converted by table
# lookup; any other encoding is handed to Iconv, with INTERMEDIATE_ENCODING
# as the pivot whenever only one side of the conversion is in the chartab.

class CMess::BConv

  # our version ;-)
  VERSION = '0.0.2'

  # Pivot encoding between chartab lookup and Iconv.
  INTERMEDIATE_ENCODING = 'utf-8'

  # Default character table, located in CMess::DATA_DIR.
  DEFAULT_CHARTAB_FILE = File.join(CMess::DATA_DIR, 'chartab.yaml')

  class << self

    # Returns the sorted, upcased encoding names found in the first entry of
    # +chartab+ (a Hash, or the path of a YAML file). Names starting with two
    # underscores are left out.
    def encodings(chartab = DEFAULT_CHARTAB_FILE)
      chartab = load_chartab(chartab)

      chartab[chartab.keys.first].keys.map { |encoding|
        encoding.upcase unless encoding =~ /\A__/
      }.compact.sort
    end

    # Shortcut for <tt>new(*args).convert</tt>; see #initialize for +args+.
    def convert(*args)
      new(*args).convert
    end

    # Returns +chartab+ itself if it is a Hash, or the parsed content of the
    # YAML file it names if it is a String.
    #
    # Raises RuntimeError if the file is not readable and ArgumentError for
    # any other argument type.
    def load_chartab(chartab)
      case chartab
        when Hash
          chartab
        when String
          raise "chartab file not found: #{chartab}" unless File.readable?(chartab)
          # NOTE: YAML.load_file deserializes whatever the file contains; only
          # point this at chartab files you trust.
          YAML.load_file(chartab)
        else
          raise ArgumentError, "invalid chartab of type #{chartab.class}"
      end
    end

  end

  attr_reader :input, :output, :source_encoding, :target_encoding, :chartab, :encodings

  # +input+::           object to read from (must support +each_byte+, +each+
  #                     and single-byte look-ahead, depending on the branch
  #                     taken by #convert)
  # +output+::          object to write to (+print+ / +puts+)
  # +source_encoding+:: name of the input encoding (stored upcased)
  # +target_encoding+:: name of the output encoding (stored upcased)
  # +chartab+::         Hash or YAML file path, see BConv.load_chartab
  def initialize(input, output, source_encoding, target_encoding, chartab = DEFAULT_CHARTAB_FILE)
    @input, @output = input, output

    @source_encoding = source_encoding.upcase
    @target_encoding = target_encoding.upcase

    @chartab   = self.class.load_chartab(chartab)
    @encodings = self.class.encodings(@chartab)
  end

  # Tells whether +encoding+ is covered by the chartab.
  def encoding?(encoding)
    encodings.include?(encoding)
  end

  # Reads everything from +input+ and writes the converted text to +output+.
  #
  # Four cases, depending on which encodings the chartab knows:
  # * both:        byte sequences are mapped straight through the chartab;
  # * source only: chartab maps to the intermediate encoding, Iconv finishes;
  # * target only: Iconv produces the intermediate encoding, chartab finishes;
  # * neither:     plain Iconv conversion, line by line.
  #
  # Raises ArgumentError if Iconv does not know one of the encodings.
  def convert
    if encoding?(source_encoding)
      if encoding?(target_encoding)
        @charmap = chartab.inject({}) { |hash, (code, map)|
          hash.update(map[source_encoding] => map[target_encoding].pack('U*'))
        }

        input.each_byte { |char|
          output.print map(char)
        }
      else
        iconv = iconv_to

        # +code+ is the hexadecimal code point keying the chartab entry.
        @charmap = chartab.inject({}) { |hash, (code, map)|
          hash.update(map[source_encoding] => [code.to_i(16)].pack('U*'))
        }

        input.each_byte { |char|
          output.print iconv.iconv(map(char))
        }
      end
    else
      if encoding?(target_encoding)
        iconv = iconv_from

        charmap = chartab.inject({}) { |hash, (code, map)|
          hash.update(code.to_i(16) => map[target_encoding].pack('U*'))
        }

        input.each { |line|
          iconv.iconv(line).unpack('U*').each { |char|
            # Code points missing from the chartab are dropped, just like
            # #map does; printing the bare nil would emit the text "nil" on
            # Ruby 1.8.
            output.print(charmap[char] || '')
          }
        }
      else
        iconv = iconv_from_to

        input.each { |line|
          output.puts iconv.iconv(line)
        }
      end
    end
  end

  private

  # Returns an Iconv converter from +from+ to +to+ whose +iconv+ method
  # warns and returns an empty string on illegal input instead of raising.
  #
  # Raises ArgumentError if Iconv rejects one of the encodings.
  def iconv_from_to(from = source_encoding, to = target_encoding)
    iconv = begin
      Iconv.new(to, from)
    rescue Iconv::InvalidEncoding
      raise ArgumentError, "invalid encoding: source encoding = #{from}, target encoding = #{to}"
    end

    # Singleton override: keep converting after a bad sequence.
    def iconv.iconv(*args)
      super
    rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
      warn "ILLEGAL INPUT SEQUENCE: #{err}"; ''
    end

    iconv
  end

  # Converter from +from+ to the intermediate encoding.
  def iconv_from(from = source_encoding)
    iconv_from_to(from, INTERMEDIATE_ENCODING)
  end

  # Converter from the intermediate encoding to +to+.
  def iconv_to(to = target_encoding)
    iconv_from_to(INTERMEDIATE_ENCODING, to)
  end

  # Looks up the byte +char+ in +charmap+. If the single byte is unknown, the
  # next input byte is consumed and the two-byte sequence is tried; if that
  # fails as well, the extra byte is pushed back and '' is returned.
  def map(char, charmap = @charmap)
    mapped = charmap[[char]]
    return mapped if mapped

    following = read_byte
    mapped = charmap[[char, following]]
    return mapped if mapped

    unread_byte(following) if following
    ''
  end

  # Reads the look-ahead byte as an Integer (nil at end of input). The
  # charmap keys are arrays of Integer bytes, but +getc+ returns a String on
  # Ruby 1.9+, so +getbyte+ is preferred wherever the input offers it.
  def read_byte
    input.respond_to?(:getbyte) ? input.getbyte : input.getc
  end

  # Pushes +byte+ back onto the input, mirroring #read_byte.
  def unread_byte(byte)
    if input.respond_to?(:ungetbyte)
      input.ungetbyte(byte)
    else
      input.ungetc(byte)
    end
  end

end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'iconv'
|
30
|
+
require 'cmess'
|
31
|
+
|
32
|
+
# Find (and possibly repair) doubly encoded characters.
#
# Each character of interest, given in target encoding, is treated as if it
# were source-encoded and converted to target encoding once more, which
# yields its doubly encoded form. Input lines are then "grep"ped for those
# forms; on request, every occurrence is replaced by the original character.

module CMess::Cinderella

  extend self

  # our version ;-)
  VERSION = '0.0.3'

  # Directory holding the character set definitions.
  DEFAULT_CSETS_DIR = File.join(CMess::DATA_DIR, 'csets')

  # Sorts the lines of +input+: lines containing a doubly encoded form of one
  # of +chars+ are written to +crop+, all others to +pot+. A destination that
  # is +nil+ (or +false+) causes its lines to be skipped. With +repair+ set,
  # doubly encoded characters are replaced by their originals before the line
  # is written.
  def pick(input, pot, crop, source_encoding, target_encoding, chars, repair = false)
    converter = Iconv.new(target_encoding, source_encoding)

    # doubly encoded form => original character
    originals = {}
    chars.each { |char| originals[converter.iconv(char)] = char }

    pattern = Regexp.union(*originals.keys)

    input.each { |line|
      sink = line =~ pattern ? crop : pot
      next unless sink

      line.gsub!(pattern) { |match| originals[match] } if repair
      sink.puts(line)
    }
  end

end
|
data/lib/cmess/cli.rb
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007-2009 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'tempfile'
|
30
|
+
|
31
|
+
require 'rubygems'
|
32
|
+
require 'nuggets/env/user_encoding'
|
33
|
+
|
34
|
+
# Helpers shared by the cmess command-line tools: argument checks, opening
# of input/output streams and uniform error reporting.
module CMess::CLI

  # how to split list of arguments
  SPLIT_ARG_LIST_RE = /\s*[,\s]\s*/o

  # Aborts with an error message unless +file+ is readable.
  def ensure_readable(file)
    abort "Can't find input file: #{file}" unless File.readable?(file)
  end

  # Aborts with an error message unless +dir+ is a directory.
  def ensure_directory(dir)
    abort "Directory not found: #{dir}" unless File.directory?(dir)
  end

  # Returns an <tt>[input, output]</tt> pair for rewriting +file+ in place.
  # The temporary copy is taken first; only then is +file+ opened for
  # writing, which truncates it.
  def open_file_in_place(file)
    [open_temporary_input(file), File.open(file, 'w')]
  end

  # Opens +file+ with +mode+. The name '-' selects a standard stream
  # instead: STDIN for 'r', STDOUT for 'w', STDERR for 'a'.
  #
  # Raises ArgumentError for '-' combined with any other mode; aborts if a
  # real file is not readable (not checked for mode 'w').
  def open_file_or_std(file, mode = 'r')
    if file == '-'
      case mode
        when 'r' then STDIN
        when 'w' then STDOUT
        when 'a' then STDERR
        else raise ArgumentError, "don't know how to handle mode '#{mode}'"
      end
    else
      ensure_readable(file) unless mode == 'w'
      File.open(file, mode)
    end
  end

  # Concatenates the content of +files+ ('-' meaning STDIN) into a temporary
  # file and returns that file reopened for reading. Aborts if one of the
  # files is not readable.
  def open_temporary_input(*files)
    temp = Tempfile.new('cmess_cli')

    files.each { |file|
      if file == '-'
        STDIN.each { |line| temp << line }
      else
        ensure_readable(file)
        File.open(file) { |f| f.each { |line| temp << line } }
      end
    }

    # return File, instead of Tempfile
    temp.close
    temp.open
  end

  # Uses the remaining command-line arguments as input, unless there are
  # none or <tt>options[:input_set]</tt> is set: a single argument is opened
  # directly, several are concatenated via #open_temporary_input.
  def trailing_args_as_input(options)
    unless ARGV.empty? || options[:input_set]
      options[:input] = if ARGV.size == 1
        open_file_or_std(ARGV.first)
      else
        open_temporary_input(*ARGV)
      end
    end
  end

  # Returns <tt>ENV.user_encoding</tt> if it is set. Otherwise returns a
  # lambda that aborts with an explanatory message when called and that
  # prints as 'NOT FOUND'.
  def determine_system_encoding
    ENV.user_encoding || begin
      dummy = lambda {
        abort <<-EOT
Your system's encoding couldn't be determined automatically -- please specify
it explicitly via the ENCODING environment variable or via the '-t' option.
        EOT
      }

      def dummy.to_s; 'NOT FOUND' end

      dummy
    end
  end

  # Runs the given block and turns any StandardError into an +abort+ with a
  # readable message: the full backtrace when <tt>$VERBOSE</tt> is set, a
  # single line otherwise.
  def cli
    yield
  rescue => err
    if $VERBOSE
      backtrace = err.backtrace

      # Join explicitly: interpolating the Array itself renders its #inspect
      # form (brackets, quotes, escaped newlines) on Ruby 1.9 and later.
      fromtrace = (backtrace[1..-1] || []).map { |i| "\n from #{i}" }.join

      abort "#{backtrace.first} #{err} (#{err.class})#{fromtrace}"
    else
      abort "#{err.to_s.capitalize} [#{err.backtrace.first}]"
    end
  end

end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
#--
|
2
|
+
###############################################################################
|
3
|
+
# #
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
5
|
+
# #
|
6
|
+
# Copyright (C) 2007 University of Cologne, #
|
7
|
+
# Albertus-Magnus-Platz, #
|
8
|
+
# 50932 Cologne, Germany #
|
9
|
+
# #
|
10
|
+
# Authors: #
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
12
|
+
# #
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
16
|
+
# version. #
|
17
|
+
# #
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
21
|
+
# details. #
|
22
|
+
# #
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
25
|
+
# #
|
26
|
+
###############################################################################
|
27
|
+
#++
|
28
|
+
|
29
|
+
require 'iconv'
|
30
|
+
require 'cmess'
|
31
|
+
|
32
|
+
require 'rubygems'
|
33
|
+
require 'htmlentities/string'
|
34
|
+
|
35
|
+
# Decodes HTML entities in text of an arbitrary encoding by way of an
# intermediate encoding.
module CMess::DecodeEntities

  extend self

  # our version ;-)
  VERSION = '0.0.2'

  # HTMLEntities requires UTF-8
  INTERMEDIATE_ENCODING = 'utf-8'

  # Stand-in for an Iconv converter that returns its argument unchanged.
  ICONV_DUMMY = begin
    passthrough = Object.new

    class << passthrough
      def iconv(string)
        string
      end
    end

    passthrough
  end

  # Reads +input+ line by line, decodes the entities in each line and writes
  # the result to +output+. Lines are converted from +source_encoding+ to the
  # intermediate encoding before decoding and to +target_encoding+ (default:
  # +source_encoding+) afterwards; a conversion is skipped when the encoding
  # equals INTERMEDIATE_ENCODING.
  def decode(input, output, source_encoding, target_encoding = nil)
    target_encoding ||= source_encoding

    iconv_in = if source_encoding == INTERMEDIATE_ENCODING
      ICONV_DUMMY
    else
      Iconv.new(INTERMEDIATE_ENCODING, source_encoding)
    end

    iconv_out = if target_encoding == INTERMEDIATE_ENCODING
      ICONV_DUMMY
    else
      Iconv.new(target_encoding, INTERMEDIATE_ENCODING)
    end

    input.each { |line|
      intermediate = iconv_in.iconv(line)
      output.puts iconv_out.iconv(intermediate.decode_entities)
    }
  end

end
|