blackwinter-cmess 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +676 -0
- data/ChangeLog +54 -0
- data/README +63 -0
- data/Rakefile +51 -0
- data/bin/bconv +130 -0
- data/bin/cinderella +190 -0
- data/bin/decode_entities +106 -0
- data/bin/guess_encoding +223 -0
- data/data/chartab.yaml +26724 -0
- data/data/csets/iso_8859-1.yaml +195 -0
- data/data/csets/iso_8859-15.yaml +204 -0
- data/data/csets/latin1.yaml +195 -0
- data/data/csets/unicode/basic_latin.yaml +97 -0
- data/data/csets/unicode/cyrillic-supplement.yaml +17 -0
- data/data/csets/unicode/cyrillic.yaml +256 -0
- data/data/csets/unicode/greek.yaml +129 -0
- data/data/csets/unicode/ipa_extensions.yaml +97 -0
- data/data/csets/unicode/latin-extended-c.yaml +18 -0
- data/data/csets/unicode/latin-extended-d.yaml +3 -0
- data/data/csets/unicode/latin_1_supplement.yaml +128 -0
- data/data/csets/unicode/latin_extended_a.yaml +129 -0
- data/data/csets/unicode/latin_extended_additional.yaml +247 -0
- data/data/csets/unicode/latin_extended_b.yaml +209 -0
- data/data/csets/unicode/letterlike_symbols.yaml +80 -0
- data/data/csets/unicode/spacing_modifier_letters.yaml +81 -0
- data/data/csets/utf-8.yaml +1504 -0
- data/data/csets/utf8.yaml +1504 -0
- data/data/test_chars.yaml +14 -0
- data/example/cinderella/crop +127 -0
- data/example/cinderella/crop_repaired +127 -0
- data/example/cinderella/empty6-slash.txt +1495 -0
- data/example/cinderella/empty6-slash_repaired.txt +1495 -0
- data/example/cinderella/pot +1368 -0
- data/example/guess_encoding/check_results +60 -0
- data/example/guess_encoding/de.utf-8.txt +10030 -0
- data/example/guess_encoding/en.utf-8.txt +10030 -0
- data/example/guess_encoding/fr.utf-8.txt +10030 -0
- data/example/guess_encoding/it.utf-8.txt +10030 -0
- data/lib/cmess/bconv.rb +169 -0
- data/lib/cmess/cinderella.rb +66 -0
- data/lib/cmess/cli.rb +120 -0
- data/lib/cmess/decode_entities.rb +69 -0
- data/lib/cmess/guess_encoding/automatic.rb +343 -0
- data/lib/cmess/guess_encoding/encoding.rb +78 -0
- data/lib/cmess/guess_encoding/manual.rb +108 -0
- data/lib/cmess/guess_encoding.rb +61 -0
- data/lib/cmess/version.rb +51 -0
- data/lib/cmess.rb +49 -0
- metadata +136 -0
data/bin/guess_encoding
ADDED
@@ -0,0 +1,223 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# guess_encoding -- Assist with guessing the encoding of some input at hand #
|
7
|
+
# [A component of cmess, the encoding tool-box] #
|
8
|
+
# #
|
9
|
+
# Copyright (C) 2007-2008 University of Cologne, #
|
10
|
+
# Albertus-Magnus-Platz, #
|
11
|
+
# 50932 Cologne, Germany #
|
12
|
+
# #
|
13
|
+
# Authors: #
|
14
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
15
|
+
# #
|
16
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
17
|
+
# terms of the GNU General Public License as published by the Free Software #
|
18
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
19
|
+
# version. #
|
20
|
+
# #
|
21
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
22
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
23
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
24
|
+
# details. #
|
25
|
+
# #
|
26
|
+
# You should have received a copy of the GNU General Public License along #
|
27
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
28
|
+
# #
|
29
|
+
###############################################################################
|
30
|
+
#++
|
31
|
+
|
32
|
+
require 'optparse'
|
33
|
+
|
34
|
+
require 'rubygems'
|
35
|
+
require 'nuggets/string/word_wrap'
|
36
|
+
|
37
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
38
|
+
|
39
|
+
require 'cmess/guess_encoding'
|
40
|
+
require 'cmess/cli'
|
41
|
+
|
42
|
+
# Mix the command-line helper module into the top-level object.
# NOTE(review): presumably this is where the receiver-less helpers used further
# down (determine_system_encoding, open_file_or_std, trailing_args_as_input,
# cli, SPLIT_ARG_LIST_RE) come from -- confirm against cmess/cli.
include CMess::CLI

# Name this script was invoked as, without any directory part; shown by
# '--version'.
PROGNAME = File.basename($PROGRAM_NAME)

# Short alias for the encoding-guessing namespace.
CGE = CMess::GuessEncoding
|
48
|
+
|
49
|
+
# Default settings. The option handlers below overwrite individual entries as
# the corresponding flags are encountered on the command line.
options = {
  # IO object the input is read from ('-i').
  :input                => STDIN,

  # Line number of the input used for manual guessing ('-l').
  :line                 => 1,

  # Encodings to try instead of the defaults ('-e'); nil until '-e' is given.
  :encodings            => nil,

  # Encodings to try in addition to the defaults ('-a').
  :additional_encodings => [],

  # Target encoding of the system ('-t').
  # NOTE(review): the main block checks this value for respond_to?(:call), so
  # the helper may return a callable instead of a name -- confirm in cmess/cli.
  :target_encoding      => determine_system_encoding,

  # Switch from automatic to manual guessing ('-m').
  :manual               => false,

  # Chunk size handed to automatic guessing ('-c'); nil unless given.
  :chunk_size           => nil,

  # Ignore a detected BOM during automatic guessing ('-b').
  :ignore_bom           => false,

  # Character codes for manual guessing ('-C'); nil unless given.
  :charcodes            => nil,

  # Interpret charcodes as decimal ('-D') or octal ('-O') rather than
  # hexadecimal.
  :decimal              => false,
  :octal                => false
}
|
62
|
+
|
63
|
+
# Declare the command-line interface and parse ARGV in place; recognised
# switches are removed from ARGV and recorded in +options+. The help text is
# assembled in the order the separators and switches are declared here.
OptionParser.new(nil, 40) { |opts|
  # Appends an empty line, a title and the given names (comma-separated and
  # word-wrapped to 110 columns) to the help text.
  list_section = lambda { |title, names|
    opts.separator ''
    opts.separator title

    names.join(', ').word_wrap(110, true).each { |row|
      opts.separator row
    }
  }

  opts.banner = "Usage: #{$0} [options] [FILE...]"

  opts.separator ''
  opts.separator 'Options:'

  opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
    options[:input] = open_file_or_std(f)
    # remember that the input was chosen explicitly
    options[:input_set] = true
  }

  opts.separator ''
  opts.separator ' * Automatic guessing'
  opts.separator ''

  opts.on('-c', '--chunk-size SIZE', Integer, "Size of chunks input will be read in until a valid encoding", "has been found; by default the whole file will be read") { |s|
    options[:chunk_size] = s
  }

  opts.separator ''

  opts.on('-b', '--ignore-bom', "Ignore detected BOM (if any)", "(see below for a list of supported encodings)") {
    options[:ignore_bom] = true
  }

  opts.separator ''
  opts.separator ' * Manual guessing'
  opts.separator ''

  opts.on('-m', '--manual', "Present variously encoded input for manual encoding guessing") {
    options[:manual] = true
  }

  opts.separator ''

  opts.on('-l', '--line LINE', "Line number of input file to use for testing [Default: #{options[:line]}]") { |l|
    # String#to_i yields 0 for non-numeric input, which the check below rejects
    options[:line] = l.to_i

    unless options[:line] > 0
      options[:input].read  # prevent 'Broken pipe' error
      abort "Line number must be greater than 0!"
    end
  }

  opts.separator ''

  opts.on('-e', '--encodings ENCODINGS...', "List of encodings to try >instead of< default (see below)") { |e|
    # the option may be repeated; values accumulate
    options[:encodings] ||= []
    options[:encodings] += e.split(SPLIT_ARG_LIST_RE)
  }

  opts.on('-a', '--additional-encodings ENCODINGS...', "List of encodings to try >in addition to< default (see below)") { |e|
    options[:additional_encodings] += e.split(SPLIT_ARG_LIST_RE)
  }

  opts.separator ''

  opts.on('-t', '--target-encoding ENCODING', "Target encoding of your system [Default: #{options[:target_encoding]}]") { |e|
    options[:target_encoding] = e
  }

  opts.separator ''

  opts.on('--list-encodings', 'Print a list of all available encodings on your system and exit') {
    puts CGE::Encoding.all_encodings
    exit
  }

  opts.separator ''
  opts.separator ' * Charcodes'
  opts.separator ''

  opts.on('-C', '--charcodes CHARCODES', "Specify a list of character codes (in hexadecimal by default)", "for manual guessing. (Options '-e', '-a', and '-t' apply here", "as well; see under \"Manual guessing\" for details.)") { |c|
    options[:charcodes] = c.split(SPLIT_ARG_LIST_RE)
  }

  opts.separator ''

  opts.on('-D', '--decimal', "Charcodes are in decimal") {
    options[:decimal] = true
  }

  opts.on('-O', '--octal', "Charcodes are in octal") {
    options[:octal] = true
  }

  opts.separator ''
  opts.separator 'Generic options:'

  opts.on('-h', '--help', "Print this help message and exit") {
    puts opts
    exit
  }

  opts.on('--version', "Print program version and exit") {
    puts "#{PROGNAME} v#{CGE::VERSION} (part of cmess v#{CMess::VERSION})"
    exit
  }

  list_section.call(
    'Supported encodings for automatic guessing (will be tried in that order):',
    CGE::Automatic.supported_encodings
  )

  list_section.call(
    'Supported encodings for BOM detection (will be tried in that order):',
    CGE::Automatic.supported_boms
  )

  list_section.call(
    'Default encodings for manual guessing:',
    CGE::Manual::ENCODINGS
  )

  list_section.call(
    'Likely candidates for additional testing:',
    CGE::Manual::CANDIDATES
  )

  opts.separator ''
  opts.separator 'NOTE: To select all encodings available on your system, specify __ALL__.'

  opts.separator ''
  opts.separator "When FILE is -, STDIN is used."
}.parse!
|
192
|
+
|
193
|
+
# Main program, run inside the +cli+ wrapper: either display the input in
# several encodings for manual inspection, or print the automatically guessed
# encoding.
cli do
  trailing_args_as_input(options)

  charcodes = options[:charcodes]

  if options[:manual] || charcodes
    target_encoding = options[:target_encoding]

    # NOTE(review): the return value is discarded, so a callable target
    # encoding is invoked purely for its side effect -- presumably reporting
    # that the system encoding could not be determined; confirm in cmess/cli.
    target_encoding.call if target_encoding.respond_to?(:call)

    if charcodes
      # octal takes precedence over decimal; hexadecimal is the fallback
      base = if options[:octal]
        8
      elsif options[:decimal]
        10
      else
        16
      end

      # turn every code into its single character and glue them together
      input = charcodes.map { |code| code.to_i(base).chr }.join
    else
      # reset line counter
      $. = 0

      # Stop at the requested line. When the loop runs to completion instead,
      # +each+ returns its receiver, so +input+ ends up not being a String.
      input = options[:input].each do |line|
        break line if $. == options[:line]
      end

      abort "Input was empty!" if $..zero?
      abort "Line not found -- input has only #{$.} line#{'s' if $. != 1}" unless input.is_a?(String)
    end

    CGE::Manual.display(
      input,
      target_encoding,
      options[:encodings],
      options[:additional_encodings]
    )
  else
    # automatic
    puts CGE::Automatic.guess(options[:input], options[:chunk_size], options[:ignore_bom])
  end
end
|