cmess 0.0.4.136
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +676 -0
- data/ChangeLog +6 -0
- data/README +53 -0
- data/Rakefile +30 -0
- data/bin/cinderella +186 -0
- data/bin/decode_entities +101 -0
- data/bin/guess_encoding +183 -0
- data/data/csets/iso_8859-1.yaml +195 -0
- data/data/csets/iso_8859-15.yaml +204 -0
- data/data/csets/latin1.yaml +195 -0
- data/data/csets/unicode/basic_latin.yaml +97 -0
- data/data/csets/unicode/cyrillic-supplement.yaml +17 -0
- data/data/csets/unicode/cyrillic.yaml +256 -0
- data/data/csets/unicode/greek.yaml +129 -0
- data/data/csets/unicode/ipa_extensions.yaml +97 -0
- data/data/csets/unicode/latin-extended-c.yaml +18 -0
- data/data/csets/unicode/latin-extended-d.yaml +3 -0
- data/data/csets/unicode/latin_1_supplement.yaml +128 -0
- data/data/csets/unicode/latin_extended_a.yaml +129 -0
- data/data/csets/unicode/latin_extended_additional.yaml +247 -0
- data/data/csets/unicode/latin_extended_b.yaml +209 -0
- data/data/csets/unicode/letterlike_symbols.yaml +80 -0
- data/data/csets/unicode/spacing_modifier_letters.yaml +81 -0
- data/data/csets/utf-8.yaml +1504 -0
- data/data/csets/utf8.yaml +1504 -0
- data/example/crop +127 -0
- data/example/crop_repaired +127 -0
- data/example/empty6-slash.txt +1495 -0
- data/example/empty6-slash_repaired.txt +1495 -0
- data/example/pot +1368 -0
- data/lib/cmess.rb +44 -0
- data/lib/cmess/cinderella.rb +63 -0
- data/lib/cmess/cli.rb +79 -0
- data/lib/cmess/decode_entities.rb +68 -0
- data/lib/cmess/guess_encoding.rb +372 -0
- data/lib/cmess/version.rb +51 -0
- metadata +119 -0
data/ChangeLog
ADDED
data/README
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
= cmess - Assist with messed up encodings
|
2
|
+
|
3
|
+
== VERSION
|
4
|
+
|
5
|
+
This documentation refers to cmess version 0.0.4
|
6
|
+
|
7
|
+
|
8
|
+
== DESCRIPTION
|
9
|
+
|
10
|
+
CMess bundles several tools under its hood that aim at dealing with various
|
11
|
+
problems occurring in the context of character sets and encodings. Currently,
|
12
|
+
there are:
|
13
|
+
|
14
|
+
guess_encoding:: Simple helper to identify the encoding of a given string.
|
15
|
+
Includes the ability to automatically detect the encoding
|
16
|
+
of an input.
|
17
|
+
cinderella:: When characters are "double encoded", you can't easily
|
18
|
+
convert them back -- this is where cinderella comes in,
|
19
|
+
sorting the good ones into the pot and the (potentially)
|
20
|
+
bad ones into the crop...
|
21
|
+
decode_entities:: Decode HTML entities in a string.
|
22
|
+
|
23
|
+
TODO: well, more of the description... ;-)
|
24
|
+
|
25
|
+
|
26
|
+
== AUTHORS
|
27
|
+
|
28
|
+
* Jens Wille <mailto:jens.wille@uni-koeln.de>
|
29
|
+
|
30
|
+
|
31
|
+
== CREDITS
|
32
|
+
|
33
|
+
* John Vorhauer <mailto:john@vorhauer.de> for the idea and
|
34
|
+
original implementation of the automatic encoding guesser
|
35
|
+
(see CMess::GuessEncoding::Guesser).
|
36
|
+
|
37
|
+
|
38
|
+
== LICENSE AND COPYRIGHT
|
39
|
+
|
40
|
+
Copyright (C) 2007 University of Cologne,
|
41
|
+
Albertus-Magnus-Platz, 50932 Cologne, Germany
|
42
|
+
|
43
|
+
cmess is free software: you can redistribute it and/or modify it under the
|
44
|
+
terms of the GNU General Public License as published by the Free Software
|
45
|
+
Foundation, either version 3 of the License, or (at your option) any later
|
46
|
+
version.
|
47
|
+
|
48
|
+
cmess is distributed in the hope that it will be useful, but WITHOUT ANY
|
49
|
+
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
50
|
+
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
51
|
+
|
52
|
+
You should have received a copy of the GNU General Public License along with
|
53
|
+
cmess. If not, see <http://www.gnu.org/licenses/>.
|
data/Rakefile
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# Utilizes global rake-tasks: alias rake="rake -r rake -R /path/to/rakelibdir"
|
2
|
+
# (Base tasks at <http://prometheus.khi.uni-koeln.de/svn/scratch/rake-tasks/>)
|
3
|
+
|
4
|
+
$:.unshift('lib')
|
5
|
+
|
6
|
+
require 'cmess'
|
7
|
+
|
8
|
+
FILES = FileList['lib/**/*.rb'].to_a
|
9
|
+
EXECS = FileList['bin/*'].to_a
|
10
|
+
RDOCS = %w[README COPYING ChangeLog]
|
11
|
+
OTHER = FileList['[A-Z]*', 'example/**/*', 'data/**/*'].to_a
|
12
|
+
|
13
|
+
task(:doc_spec) {{
|
14
|
+
:title => 'cmess Application documentation',
|
15
|
+
:rdoc_files => RDOCS + FILES
|
16
|
+
}}
|
17
|
+
|
18
|
+
task(:gem_spec) {{
|
19
|
+
:name => 'cmess',
|
20
|
+
:version => CMess::VERSION,
|
21
|
+
:summary => "Assist with handling messed up encodings " <<
|
22
|
+
"(Currently includes the following tools: " <<
|
23
|
+
"#{EXECS.map { |e| File.basename(e) }.join(', ')})",
|
24
|
+
:files => FILES + EXECS + OTHER,
|
25
|
+
:require_path => 'lib',
|
26
|
+
:bindir => 'bin',
|
27
|
+
:executables => EXECS,
|
28
|
+
:extra_rdoc_files => RDOCS,
|
29
|
+
:dependencies => %w[ruby-nuggets htmlentities]
|
30
|
+
}}
|
data/bin/cinderella
ADDED
@@ -0,0 +1,186 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# cinderella -- Handle double encoded characters #
|
7
|
+
# [A component of cmess, the encoding tool-box] #
|
8
|
+
# #
|
9
|
+
# Copyright (C) 2007 University of Cologne, #
|
10
|
+
# Albertus-Magnus-Platz, #
|
11
|
+
# 50932 Cologne, Germany #
|
12
|
+
# #
|
13
|
+
# Authors: #
|
14
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
15
|
+
# #
|
16
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
17
|
+
# terms of the GNU General Public License as published by the Free Software #
|
18
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
19
|
+
# version. #
|
20
|
+
# #
|
21
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
22
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
23
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
24
|
+
# details. #
|
25
|
+
# #
|
26
|
+
# You should have received a copy of the GNU General Public License along #
|
27
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
28
|
+
# #
|
29
|
+
###############################################################################
|
30
|
+
#++
|
31
|
+
|
32
|
+
require 'optparse'
|
33
|
+
require 'yaml'
|
34
|
+
|
35
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
36
|
+
|
37
|
+
require 'cmess'
|
38
|
+
require 'cmess/cinderella'
|
39
|
+
require 'cmess/cli'
|
40
|
+
|
41
|
+
include CMess::CLI
|
42
|
+
|
43
|
+
PROGNAME = File.basename($0)
|
44
|
+
|
45
|
+
options = {
|
46
|
+
:input => STDIN,
|
47
|
+
:output => STDOUT,
|
48
|
+
:pot => nil,
|
49
|
+
:crop => nil,
|
50
|
+
:source_encoding => nil,
|
51
|
+
:target_encoding => determine_system_encoding,
|
52
|
+
:csets => [
|
53
|
+
File.expand_path(File.join(File.dirname(__FILE__), '..', 'data', 'csets'))
|
54
|
+
],
|
55
|
+
:repair => false
|
56
|
+
}
|
57
|
+
|
58
|
+
OptionParser.new(nil, 40) { |opts|
|
59
|
+
opts.banner = "Usage: #{$0} [options]"
|
60
|
+
|
61
|
+
opts.separator ''
|
62
|
+
opts.separator 'Options:'
|
63
|
+
|
64
|
+
opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
|
65
|
+
options[:input] = open_file_or_std(f)
|
66
|
+
}
|
67
|
+
|
68
|
+
opts.separator ''
|
69
|
+
|
70
|
+
opts.on('-p', '--pot FILE', "The good into the pot...") { |f|
|
71
|
+
options[:pot] = open_file_or_std(f, 'w')
|
72
|
+
}
|
73
|
+
|
74
|
+
opts.on('-c', '--crop FILE', "...the bad into the crop") { |f|
|
75
|
+
options[:crop] = open_file_or_std(f, 'w')
|
76
|
+
}
|
77
|
+
|
78
|
+
opts.separator ''
|
79
|
+
|
80
|
+
opts.on('-o', '--output [FILE]', "Write both good and bad lines to FILE or", "default [Default: STDOUT] (Particularly", "useful in combination with the '-r' option)") { |f|
|
81
|
+
options[:output] = open_file_or_std(f, 'w') if f
|
82
|
+
|
83
|
+
options[:pot] = options[:output]
|
84
|
+
options[:crop] = options[:output]
|
85
|
+
}
|
86
|
+
|
87
|
+
opts.separator ''
|
88
|
+
|
89
|
+
opts.on('-I', '--in-place FILE', "Modify file in-place; sets '-i' and '-o'", "(Only really useful in combination with", "the '-r' option)") { |f|
|
90
|
+
options[:input], options[:output] = open_file_in_place(f)
|
91
|
+
|
92
|
+
options[:pot] = options[:output]
|
93
|
+
options[:crop] = options[:output]
|
94
|
+
}
|
95
|
+
|
96
|
+
opts.separator ''
|
97
|
+
|
98
|
+
opts.on('-e', '--source-encoding ENCODING', "Source encoding (from) [REQUIRED]") { |e|
|
99
|
+
options[:source_encoding] = e
|
100
|
+
}
|
101
|
+
|
102
|
+
opts.on('-t', '--target-encoding ENCODING', "Target encoding (to); see '-l' for a list", "of available encodings [Default: #{options[:target_encoding]}]") { |e|
|
103
|
+
options[:target_encoding] = e
|
104
|
+
}
|
105
|
+
|
106
|
+
opts.separator ''
|
107
|
+
|
108
|
+
opts.on('-T', '--addtl-target-encodings DIRECTORY', "Directory providing additional char files", "for target encoding") { |d|
|
109
|
+
ensure_directory(d)
|
110
|
+
|
111
|
+
options[:csets] |= [File.expand_path(d)]
|
112
|
+
}
|
113
|
+
|
114
|
+
opts.separator ''
|
115
|
+
|
116
|
+
opts.on('-l', '--list-encodings', "Display a list of available target encodings", "and exit; see '-T' on how to add your own") {
|
117
|
+
csets = options[:csets].inject({}) { |hash, cset|
|
118
|
+
encodings = Dir[File.join(cset, '*.yaml')].sort.map { |yaml|
|
119
|
+
File.basename(yaml, '.yaml') unless File.symlink?(yaml)
|
120
|
+
}.compact
|
121
|
+
|
122
|
+
hash[cset] = encodings unless encodings.empty?
|
123
|
+
hash
|
124
|
+
}
|
125
|
+
|
126
|
+
if csets.empty?
|
127
|
+
puts "No target encodings available for #{PROGNAME}"
|
128
|
+
else
|
129
|
+
puts "Available target encodings for #{PROGNAME}:"
|
130
|
+
csets.each { |cset, encodings|
|
131
|
+
puts "[#{cset}]"
|
132
|
+
encodings.each { |encoding|
|
133
|
+
puts " - #{encoding}"
|
134
|
+
}
|
135
|
+
}
|
136
|
+
end
|
137
|
+
|
138
|
+
exit
|
139
|
+
}
|
140
|
+
|
141
|
+
opts.separator ''
|
142
|
+
|
143
|
+
opts.on('-r', '--repair', "Try to repair corrupted characters") {
|
144
|
+
options[:repair] = true
|
145
|
+
}
|
146
|
+
|
147
|
+
opts.separator ''
|
148
|
+
opts.separator 'Generic options:'
|
149
|
+
|
150
|
+
opts.on('-h', '--help', "Print this help message and exit") {
|
151
|
+
puts opts
|
152
|
+
exit
|
153
|
+
}
|
154
|
+
|
155
|
+
opts.on('--version', "Print program version and exit") {
|
156
|
+
puts "#{PROGNAME} v#{CMess::Cinderella::VERSION} (part of cmess v#{CMess::VERSION})"
|
157
|
+
exit
|
158
|
+
}
|
159
|
+
|
160
|
+
opts.separator ''
|
161
|
+
opts.separator "If '-p' or '-c' is omitted, and '-o' is not given either, that particular output"
|
162
|
+
opts.separator "is ignored. When FILE is -, either STDIN or STDOUT is used (as appropriate)."
|
163
|
+
}.parse!
|
164
|
+
|
165
|
+
options[:target_encoding].call if options[:target_encoding].respond_to?(:call)
|
166
|
+
|
167
|
+
abort "No source encoding given! (Use the '-e' switch to do so; see '--help' for more information)" \
|
168
|
+
unless options[:source_encoding]
|
169
|
+
|
170
|
+
yaml_file = "#{options[:target_encoding].downcase}.yaml"
|
171
|
+
char_file = options[:csets].inject(nil) { |path, cset|
|
172
|
+
path = File.join(cset, yaml_file)
|
173
|
+
break path if File.readable?(path)
|
174
|
+
}
|
175
|
+
abort "Char file not found for target encoding: #{options[:target_encoding]}" \
|
176
|
+
unless char_file
|
177
|
+
|
178
|
+
CMess::Cinderella.pick(
|
179
|
+
options[:input],
|
180
|
+
options[:pot],
|
181
|
+
options[:crop],
|
182
|
+
options[:source_encoding],
|
183
|
+
options[:target_encoding],
|
184
|
+
YAML.load_file(char_file),
|
185
|
+
options[:repair]
|
186
|
+
)
|
data/bin/decode_entities
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# decode_entities -- Decode HTML entities #
|
7
|
+
# [A component of cmess, the encoding tool-box] #
|
8
|
+
# #
|
9
|
+
# Copyright (C) 2007 University of Cologne, #
|
10
|
+
# Albertus-Magnus-Platz, #
|
11
|
+
# 50932 Cologne, Germany #
|
12
|
+
# #
|
13
|
+
# Authors: #
|
14
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
15
|
+
# #
|
16
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
17
|
+
# terms of the GNU General Public License as published by the Free Software #
|
18
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
19
|
+
# version. #
|
20
|
+
# #
|
21
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
22
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
23
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
24
|
+
# details. #
|
25
|
+
# #
|
26
|
+
# You should have received a copy of the GNU General Public License along #
|
27
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
28
|
+
# #
|
29
|
+
###############################################################################
|
30
|
+
#++
|
31
|
+
|
32
|
+
require 'optparse'
|
33
|
+
|
34
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
35
|
+
|
36
|
+
require 'cmess'
|
37
|
+
require 'cmess/decode_entities'
|
38
|
+
require 'cmess/cli'
|
39
|
+
|
40
|
+
include CMess::CLI
|
41
|
+
|
42
|
+
PROGNAME = File.basename($0)
|
43
|
+
|
44
|
+
options = {
|
45
|
+
:input => STDIN,
|
46
|
+
:output => STDOUT,
|
47
|
+
:source_encoding => CMess::DecodeEntities::INTERMEDIATE_ENCODING,
|
48
|
+
:target_encoding => nil
|
49
|
+
}
|
50
|
+
|
51
|
+
OptionParser.new { |opts|
|
52
|
+
opts.banner = "Usage: #{$0} [options]"
|
53
|
+
|
54
|
+
opts.separator ''
|
55
|
+
opts.separator 'Options:'
|
56
|
+
|
57
|
+
opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
|
58
|
+
options[:input] = open_file_or_std(f)
|
59
|
+
}
|
60
|
+
|
61
|
+
opts.on('-o', '--output FILE', "Output file to write to [Default: STDOUT]") { |f|
|
62
|
+
options[:output] = open_file_or_std(f, 'w')
|
63
|
+
}
|
64
|
+
|
65
|
+
opts.on('-I', '--in-place FILE', "Modify file in-place; sets '-i' and '-o'") { |f|
|
66
|
+
options[:input], options[:output] = open_file_in_place(f)
|
67
|
+
}
|
68
|
+
|
69
|
+
opts.separator ''
|
70
|
+
|
71
|
+
opts.on('-e', '--source-encoding ENCODING', "Encoding of input file [Default: #{options[:source_encoding].upcase}]") { |e|
|
72
|
+
options[:source_encoding] = e.downcase
|
73
|
+
}
|
74
|
+
|
75
|
+
opts.on('-t', '--target-encoding ENCODING', "Desired encoding for output file [Default: <source_encoding>]") { |e|
|
76
|
+
options[:target_encoding] = e.downcase
|
77
|
+
}
|
78
|
+
|
79
|
+
opts.separator ''
|
80
|
+
opts.separator 'Generic options:'
|
81
|
+
|
82
|
+
opts.on('-h', '--help', "Print this help message and exit") {
|
83
|
+
puts opts
|
84
|
+
exit
|
85
|
+
}
|
86
|
+
|
87
|
+
opts.on('--version', "Print program version and exit") {
|
88
|
+
puts "#{PROGNAME} v#{CMess::DecodeEntities::VERSION} (part of cmess v#{CMess::VERSION})"
|
89
|
+
exit
|
90
|
+
}
|
91
|
+
|
92
|
+
opts.separator ''
|
93
|
+
opts.separator "When FILE is -, either STDIN or STDOUT is used (as appropriate)."
|
94
|
+
}.parse!
|
95
|
+
|
96
|
+
CMess::DecodeEntities.decode(
|
97
|
+
options[:input],
|
98
|
+
options[:output],
|
99
|
+
options[:source_encoding],
|
100
|
+
options[:target_encoding]
|
101
|
+
)
|
data/bin/guess_encoding
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# guess_encoding -- Assist with guessing the encoding of some input at hand #
|
7
|
+
# [A component of cmess, the encoding tool-box] #
|
8
|
+
# #
|
9
|
+
# Copyright (C) 2007 University of Cologne, #
|
10
|
+
# Albertus-Magnus-Platz, #
|
11
|
+
# 50932 Cologne, Germany #
|
12
|
+
# #
|
13
|
+
# Authors: #
|
14
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
15
|
+
# #
|
16
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
17
|
+
# terms of the GNU General Public License as published by the Free Software #
|
18
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
19
|
+
# version. #
|
20
|
+
# #
|
21
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
22
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
23
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
24
|
+
# details. #
|
25
|
+
# #
|
26
|
+
# You should have received a copy of the GNU General Public License along #
|
27
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
28
|
+
# #
|
29
|
+
###############################################################################
|
30
|
+
#++
|
31
|
+
|
32
|
+
require 'optparse'
|
33
|
+
|
34
|
+
require 'rubygems'
|
35
|
+
require 'nuggets/string/word_wrap'
|
36
|
+
|
37
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
38
|
+
|
39
|
+
require 'cmess'
|
40
|
+
require 'cmess/guess_encoding'
|
41
|
+
require 'cmess/cli'
|
42
|
+
|
43
|
+
include CMess::CLI
|
44
|
+
|
45
|
+
PROGNAME = File.basename($0)
|
46
|
+
|
47
|
+
# short-cut
|
48
|
+
CGE = CMess::GuessEncoding
|
49
|
+
|
50
|
+
# how to split list of encodings
|
51
|
+
SPLIT_ENCODING_LIST_RE = /\s*[,\s]\s*/o
|
52
|
+
|
53
|
+
options = {
|
54
|
+
:input => STDIN,
|
55
|
+
:line => 1,
|
56
|
+
:encodings => nil,
|
57
|
+
:additional_encodings => [],
|
58
|
+
:target_encoding => determine_system_encoding,
|
59
|
+
:guess => false,
|
60
|
+
:chunk_size => nil,
|
61
|
+
:ignore_bom => false
|
62
|
+
}
|
63
|
+
|
64
|
+
OptionParser.new(nil, 40) { |opts|
|
65
|
+
opts.banner = "Usage: #{$0} [options]"
|
66
|
+
|
67
|
+
opts.separator ''
|
68
|
+
opts.separator 'Options:'
|
69
|
+
|
70
|
+
opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
|
71
|
+
options[:input] = open_file_or_std(f)
|
72
|
+
}
|
73
|
+
|
74
|
+
opts.separator ''
|
75
|
+
opts.separator ' * Manual guessing'
|
76
|
+
opts.separator ''
|
77
|
+
|
78
|
+
opts.on('-l', '--line LINE', "Line number of input file to use for testing [Default: #{options[:line]}]") { |l|
|
79
|
+
options[:line] = l.to_i
|
80
|
+
|
81
|
+
unless options[:line] > 0
|
82
|
+
options[:input].read # prevent 'Broken pipe' error
|
83
|
+
abort "Line number must be greater than 0!" # line numbers are 1-based (default is 1)
|
84
|
+
end
|
85
|
+
}
|
86
|
+
|
87
|
+
opts.separator ''
|
88
|
+
|
89
|
+
opts.on('-e', '--encodings ENCODINGS...', "List of encodings to try >instead of< default (see below)") { |e|
|
90
|
+
options[:encodings] ||= []
|
91
|
+
options[:encodings] += e.split(SPLIT_ENCODING_LIST_RE)
|
92
|
+
}
|
93
|
+
|
94
|
+
opts.on('-a', '--additional-encodings ENCODINGS...', "List of encodings to try >in addition to< default (see below)") { |e|
|
95
|
+
options[:additional_encodings] += e.split(SPLIT_ENCODING_LIST_RE)
|
96
|
+
}
|
97
|
+
|
98
|
+
opts.separator ''
|
99
|
+
|
100
|
+
opts.on('-t', '--target-encoding ENCODING', "Target encoding of your system [Default: #{options[:target_encoding]}]") { |e|
|
101
|
+
options[:target_encoding] = e
|
102
|
+
}
|
103
|
+
|
104
|
+
opts.separator ''
|
105
|
+
opts.separator ' * Automatic guessing'
|
106
|
+
opts.separator ''
|
107
|
+
|
108
|
+
opts.on('-g', '--guess', "Actually guess the encoding of the input, automatically!", "(see below for a list of supported encodings)") {
|
109
|
+
options[:guess] = true
|
110
|
+
}
|
111
|
+
|
112
|
+
opts.on('-c', '--chunk-size SIZE', Integer, "Size of chunks input will be read in until a valid encoding", "has been found; by default the whole file will be read") { |s|
|
113
|
+
options[:chunk_size] = s
|
114
|
+
}
|
115
|
+
|
116
|
+
opts.separator ''
|
117
|
+
|
118
|
+
opts.on('-b', '--ignore-bom', "Ignore detected BOM (if any)", "(see below for a list of supported encodings)") {
|
119
|
+
options[:ignore_bom] = true
|
120
|
+
}
|
121
|
+
|
122
|
+
opts.separator ''
|
123
|
+
opts.separator 'Generic options:'
|
124
|
+
|
125
|
+
opts.on('-h', '--help', "Print this help message and exit") {
|
126
|
+
puts opts
|
127
|
+
exit
|
128
|
+
}
|
129
|
+
|
130
|
+
opts.on('--version', "Print program version and exit") {
|
131
|
+
puts "#{PROGNAME} v#{CGE::VERSION} (part of cmess v#{CMess::VERSION})"
|
132
|
+
exit
|
133
|
+
}
|
134
|
+
|
135
|
+
opts.separator ''
|
136
|
+
opts.separator 'Default encodings for manual guessing:'
|
137
|
+
CGE::Manual::ENCODINGS.join(', ').word_wrap(110, true).each { |l|
|
138
|
+
opts.separator l
|
139
|
+
}
|
140
|
+
|
141
|
+
opts.separator ''
|
142
|
+
opts.separator 'Likely candidates for additional testing:'
|
143
|
+
CGE::Manual::CANDIDATES.join(', ').word_wrap(110, true).each { |l|
|
144
|
+
opts.separator l
|
145
|
+
}
|
146
|
+
|
147
|
+
opts.separator ''
|
148
|
+
opts.separator 'Supported encodings for automatic guessing (will be tried in that order):'
|
149
|
+
CGE::Automatic.supported_encodings.join(', ').word_wrap(110, true).each { |l|
|
150
|
+
opts.separator l
|
151
|
+
}
|
152
|
+
|
153
|
+
opts.separator ''
|
154
|
+
opts.separator 'Supported encodings for BOM detection (will be tried in that order):'
|
155
|
+
CGE::Automatic.supported_boms.join(', ').word_wrap(110, true).each { |l|
|
156
|
+
opts.separator l
|
157
|
+
}
|
158
|
+
|
159
|
+
opts.separator ''
|
160
|
+
opts.separator "When FILE is -, STDIN is used."
|
161
|
+
}.parse!
|
162
|
+
|
163
|
+
if options[:guess]
|
164
|
+
puts CGE::Automatic.guess(options[:input], options[:chunk_size], options[:ignore_bom])
|
165
|
+
else
|
166
|
+
options[:target_encoding].call if options[:target_encoding].respond_to?(:call)
|
167
|
+
|
168
|
+
# reset line counter
|
169
|
+
$. = 0
|
170
|
+
|
171
|
+
input = options[:input].each { |line|
|
172
|
+
break line if $. == options[:line]
|
173
|
+
}
|
174
|
+
abort "Input was empty!" if $..zero?
|
175
|
+
abort "Line not found -- input has only #{$.} line#{'s' if $. != 1}" unless input.is_a?(String)
|
176
|
+
|
177
|
+
CGE::Manual.display(
|
178
|
+
input,
|
179
|
+
options[:target_encoding],
|
180
|
+
options[:encodings],
|
181
|
+
options[:additional_encodings]
|
182
|
+
)
|
183
|
+
end
|