cmess 0.0.4.136
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +676 -0
- data/ChangeLog +6 -0
- data/README +53 -0
- data/Rakefile +30 -0
- data/bin/cinderella +186 -0
- data/bin/decode_entities +101 -0
- data/bin/guess_encoding +183 -0
- data/data/csets/iso_8859-1.yaml +195 -0
- data/data/csets/iso_8859-15.yaml +204 -0
- data/data/csets/latin1.yaml +195 -0
- data/data/csets/unicode/basic_latin.yaml +97 -0
- data/data/csets/unicode/cyrillic-supplement.yaml +17 -0
- data/data/csets/unicode/cyrillic.yaml +256 -0
- data/data/csets/unicode/greek.yaml +129 -0
- data/data/csets/unicode/ipa_extensions.yaml +97 -0
- data/data/csets/unicode/latin-extended-c.yaml +18 -0
- data/data/csets/unicode/latin-extended-d.yaml +3 -0
- data/data/csets/unicode/latin_1_supplement.yaml +128 -0
- data/data/csets/unicode/latin_extended_a.yaml +129 -0
- data/data/csets/unicode/latin_extended_additional.yaml +247 -0
- data/data/csets/unicode/latin_extended_b.yaml +209 -0
- data/data/csets/unicode/letterlike_symbols.yaml +80 -0
- data/data/csets/unicode/spacing_modifier_letters.yaml +81 -0
- data/data/csets/utf-8.yaml +1504 -0
- data/data/csets/utf8.yaml +1504 -0
- data/example/crop +127 -0
- data/example/crop_repaired +127 -0
- data/example/empty6-slash.txt +1495 -0
- data/example/empty6-slash_repaired.txt +1495 -0
- data/example/pot +1368 -0
- data/lib/cmess.rb +44 -0
- data/lib/cmess/cinderella.rb +63 -0
- data/lib/cmess/cli.rb +79 -0
- data/lib/cmess/decode_entities.rb +68 -0
- data/lib/cmess/guess_encoding.rb +372 -0
- data/lib/cmess/version.rb +51 -0
- metadata +119 -0
data/ChangeLog
ADDED
data/README
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
= cmess - Assist with messed up encodings
|
2
|
+
|
3
|
+
== VERSION
|
4
|
+
|
5
|
+
This documentation refers to cmess version 0.0.4
|
6
|
+
|
7
|
+
|
8
|
+
== DESCRIPTION
|
9
|
+
|
10
|
+
CMess bundles several tools under its hood that aim at dealing with various
|
11
|
+
problems occurring in the context of character sets and encodings. Currently,
|
12
|
+
there are:
|
13
|
+
|
14
|
+
guess_encoding:: Simple helper to identify the encoding of a given string.
|
15
|
+
Includes the ability to automatically detect the encoding
|
16
|
+
of an input.
|
17
|
+
cinderella:: When characters are "double encoded", you can't easily
|
18
|
+
convert them back -- this is where cinderella comes in,
|
19
|
+
sorting the good ones into the pot and the (potentially)
|
20
|
+
bad ones into the crop...
|
21
|
+
decode_entities:: Decode HTML entities in a string.
|
22
|
+
|
23
|
+
TODO: well, more of the description... ;-)
|
24
|
+
|
25
|
+
|
26
|
+
== AUTHORS
|
27
|
+
|
28
|
+
* Jens Wille <mailto:jens.wille@uni-koeln.de>
|
29
|
+
|
30
|
+
|
31
|
+
== CREDITS
|
32
|
+
|
33
|
+
* John Vorhauer <mailto:john@vorhauer.de> for the idea and
|
34
|
+
original implementation of the automatic encoding guesser
|
35
|
+
(see CMess::GuessEncoding::Guesser).
|
36
|
+
|
37
|
+
|
38
|
+
== LICENSE AND COPYRIGHT
|
39
|
+
|
40
|
+
Copyright (C) 2007 University of Cologne,
|
41
|
+
Albertus-Magnus-Platz, 50932 Cologne, Germany
|
42
|
+
|
43
|
+
cmess is free software: you can redistribute it and/or modify it under the
|
44
|
+
terms of the GNU General Public License as published by the Free Software
|
45
|
+
Foundation, either version 3 of the License, or (at your option) any later
|
46
|
+
version.
|
47
|
+
|
48
|
+
cmess is distributed in the hope that it will be useful, but WITHOUT ANY
|
49
|
+
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
50
|
+
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
51
|
+
|
52
|
+
You should have received a copy of the GNU General Public License along with
|
53
|
+
cmess. If not, see <http://www.gnu.org/licenses/>.
|
data/Rakefile
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# Utilizes global rake-tasks: alias rake="rake -r rake -R /path/to/rakelibdir"
|
2
|
+
# (Base tasks at <http://prometheus.khi.uni-koeln.de/svn/scratch/rake-tasks/>)
|
3
|
+
|
4
|
+
$:.unshift('lib')
|
5
|
+
|
6
|
+
require 'cmess'
|
7
|
+
|
8
|
+
FILES = FileList['lib/**/*.rb'].to_a
|
9
|
+
EXECS = FileList['bin/*'].to_a
|
10
|
+
RDOCS = %w[README COPYING ChangeLog]
|
11
|
+
OTHER = FileList['[A-Z]*', 'example/**/*', 'data/**/*'].to_a
|
12
|
+
|
13
|
+
task(:doc_spec) {{  # outer braces: the task block; inner braces: the Hash it evaluates to. NOTE(review): presumably read by the shared rake tasks named in the header comment -- confirm
|
14
|
+
:title => 'cmess Application documentation',
|
15
|
+
:rdoc_files => RDOCS + FILES
|
16
|
+
}}
|
17
|
+
|
18
|
+
task(:gem_spec) {{  # outer braces: the task block; inner braces: the Hash of gem settings it evaluates to
|
19
|
+
:name => 'cmess',
|
20
|
+
:version => CMess::VERSION,
|
21
|
+
:summary => "Assist with handling messed up encodings " <<
|
22
|
+
"(Currently includes the following tools: " <<
|
23
|
+
"#{EXECS.map { |e| File.basename(e) }.join(', ')})",
|
24
|
+
:files => FILES + EXECS + OTHER,
|
25
|
+
:require_path => 'lib',
|
26
|
+
:bindir => 'bin',
|
27
|
+
:executables => EXECS,  # NOTE(review): EXECS entries keep their 'bin/' prefix, while RubyGems expects names relative to bindir -- confirm the shared gem task strips it
|
28
|
+
:extra_rdoc_files => RDOCS,
|
29
|
+
:dependencies => %w[ruby-nuggets htmlentities]
|
30
|
+
}}
|
data/bin/cinderella
ADDED
@@ -0,0 +1,186 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# cinderella -- Handle double encoded characters #
|
7
|
+
# [A component of cmess, the encoding tool-box] #
|
8
|
+
# #
|
9
|
+
# Copyright (C) 2007 University of Cologne, #
|
10
|
+
# Albertus-Magnus-Platz, #
|
11
|
+
# 50932 Cologne, Germany #
|
12
|
+
# #
|
13
|
+
# Authors: #
|
14
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
15
|
+
# #
|
16
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
17
|
+
# terms of the GNU General Public License as published by the Free Software #
|
18
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
19
|
+
# version. #
|
20
|
+
# #
|
21
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
22
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
23
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
24
|
+
# details. #
|
25
|
+
# #
|
26
|
+
# You should have received a copy of the GNU General Public License along #
|
27
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
28
|
+
# #
|
29
|
+
###############################################################################
|
30
|
+
#++
|
31
|
+
|
32
|
+
require 'optparse'
|
33
|
+
require 'yaml'
|
34
|
+
|
35
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
36
|
+
|
37
|
+
require 'cmess'
|
38
|
+
require 'cmess/cinderella'
|
39
|
+
require 'cmess/cli'
|
40
|
+
|
41
|
+
include CMess::CLI
|
42
|
+
|
43
|
+
PROGNAME = File.basename($0)
|
44
|
+
|
45
|
+
options = {
|
46
|
+
:input => STDIN,
|
47
|
+
:output => STDOUT,
|
48
|
+
:pot => nil,
|
49
|
+
:crop => nil,
|
50
|
+
:source_encoding => nil,
|
51
|
+
:target_encoding => determine_system_encoding,
|
52
|
+
:csets => [
|
53
|
+
File.expand_path(File.join(File.dirname(__FILE__), '..', 'data', 'csets'))
|
54
|
+
],
|
55
|
+
:repair => false
|
56
|
+
}
|
57
|
+
|
58
|
+
OptionParser.new(nil, 40) { |opts|
|
59
|
+
opts.banner = "Usage: #{$0} [options]"
|
60
|
+
|
61
|
+
opts.separator ''
|
62
|
+
opts.separator 'Options:'
|
63
|
+
|
64
|
+
opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
|
65
|
+
options[:input] = open_file_or_std(f)
|
66
|
+
}
|
67
|
+
|
68
|
+
opts.separator ''
|
69
|
+
|
70
|
+
opts.on('-p', '--pot FILE', "The good into the pot...") { |f|
|
71
|
+
options[:pot] = open_file_or_std(f, 'w')
|
72
|
+
}
|
73
|
+
|
74
|
+
opts.on('-c', '--crop FILE', "...the bad into the crop") { |f|
|
75
|
+
options[:crop] = open_file_or_std(f, 'w')
|
76
|
+
}
|
77
|
+
|
78
|
+
opts.separator ''
|
79
|
+
|
80
|
+
opts.on('-o', '--output [FILE]', "Write both good and bad lines to FILE or", "default [Default: STDOUT] (Particularly", "useful in combination with the '-r' option)") { |f|
|
81
|
+
options[:output] = open_file_or_std(f, 'w') if f  # FILE is optional: without it the output already set in options is kept
|
82
|
+
# Route both the pot and the crop to that single output stream
|
83
|
+
options[:pot] = options[:output]
|
84
|
+
options[:crop] = options[:output]
|
85
|
+
}
|
86
|
+
|
87
|
+
opts.separator ''
|
88
|
+
|
89
|
+
opts.on('-I', '--in-place FILE', "Modify file in-place; sets '-i' and '-o'", "(Only really useful in combination with", "the '-r' option)") { |f|
|
90
|
+
options[:input], options[:output] = open_file_in_place(f)
|
91
|
+
|
92
|
+
options[:pot] = options[:output]
|
93
|
+
options[:crop] = options[:output]
|
94
|
+
}
|
95
|
+
|
96
|
+
opts.separator ''
|
97
|
+
|
98
|
+
opts.on('-e', '--source-encoding ENCODING', "Source encoding (from) [REQUIRED]") { |e|
|
99
|
+
options[:source_encoding] = e
|
100
|
+
}
|
101
|
+
|
102
|
+
opts.on('-t', '--target-encoding ENCODING', "Target encoding (to); see '-l' for a list", "of available encodings [Default: #{options[:target_encoding]}]") { |e|
|
103
|
+
options[:target_encoding] = e
|
104
|
+
}
|
105
|
+
|
106
|
+
opts.separator ''
|
107
|
+
|
108
|
+
opts.on('-T', '--addtl-target-encodings DIRECTORY', "Directory providing additional char files", "for target encoding") { |d|
|
109
|
+
ensure_directory(d)
|
110
|
+
|
111
|
+
options[:csets] |= [File.expand_path(d)]
|
112
|
+
}
|
113
|
+
|
114
|
+
opts.separator ''
|
115
|
+
|
116
|
+
opts.on('-l', '--list-encodings', "Display a list of available target encodings", "and exit; see '-T' on how to add your own") {
|
117
|
+
csets = options[:csets].inject({}) { |hash, cset|
|
118
|
+
encodings = Dir[File.join(cset, '*.yaml')].sort.map { |yaml|
|
119
|
+
File.basename(yaml, '.yaml') unless File.symlink?(yaml)
|
120
|
+
}.compact
|
121
|
+
|
122
|
+
hash[cset] = encodings unless encodings.empty?
|
123
|
+
hash
|
124
|
+
}
|
125
|
+
|
126
|
+
if csets.empty?
|
127
|
+
puts "No target encodings available for #{PROGNAME}"
|
128
|
+
else
|
129
|
+
puts "Available target encodings for #{PROGNAME}:"
|
130
|
+
csets.each { |cset, encodings|
|
131
|
+
puts "[#{cset}]"
|
132
|
+
encodings.each { |encoding|
|
133
|
+
puts " - #{encoding}"
|
134
|
+
}
|
135
|
+
}
|
136
|
+
end
|
137
|
+
|
138
|
+
exit
|
139
|
+
}
|
140
|
+
|
141
|
+
opts.separator ''
|
142
|
+
|
143
|
+
opts.on('-r', '--repair', "Try to repair corrupted characters") {
|
144
|
+
options[:repair] = true
|
145
|
+
}
|
146
|
+
|
147
|
+
opts.separator ''
|
148
|
+
opts.separator 'Generic options:'
|
149
|
+
|
150
|
+
opts.on('-h', '--help', "Print this help message and exit") {
|
151
|
+
puts opts
|
152
|
+
exit
|
153
|
+
}
|
154
|
+
|
155
|
+
opts.on('--version', "Print program version and exit") {
|
156
|
+
puts "#{PROGNAME} v#{CMess::Cinderella::VERSION} (part of cmess v#{CMess::VERSION})"
|
157
|
+
exit
|
158
|
+
}
|
159
|
+
|
160
|
+
opts.separator ''
|
161
|
+
opts.separator "If '-p' or '-c' is omitted, and '-o' is not given either, that particular output"
|
162
|
+
opts.separator "is ignored. When FILE is -, either STDIN or STDOUT is used (as appropriate)."
|
163
|
+
}.parse!
|
164
|
+
|
165
|
+
options[:target_encoding].call if options[:target_encoding].respond_to?(:call)  # NOTE(review): the default from determine_system_encoding can apparently be a callable instead of an encoding name; presumably calling it reports the failure and exits (the later .downcase would not work on it) -- confirm in CMess::CLI
|
166
|
+
|
167
|
+
abort "No source encoding given! (Use the '-e' switch to do so; see '--help' for more information)" \
|
168
|
+
unless options[:source_encoding]
|
169
|
+
# Resolve the char file for the target encoding: "<encoding, lowercased>.yaml" in the first cset directory where it is readable
|
170
|
+
yaml_file = "#{options[:target_encoding].downcase}.yaml"
|
171
|
+
char_file = options[:csets].inject(nil) { |path, cset|
|
172
|
+
path = File.join(cset, yaml_file)
|
173
|
+
break path if File.readable?(path)  # first hit wins; otherwise the block yields nil, so char_file ends up nil
|
174
|
+
}
|
175
|
+
abort "Char file not found for target encoding: #{options[:target_encoding]}" \
|
176
|
+
unless char_file
|
177
|
+
|
178
|
+
CMess::Cinderella.pick(
|
179
|
+
options[:input],
|
180
|
+
options[:pot],
|
181
|
+
options[:crop],
|
182
|
+
options[:source_encoding],
|
183
|
+
options[:target_encoding],
|
184
|
+
YAML.load_file(char_file),
|
185
|
+
options[:repair]
|
186
|
+
)
|
data/bin/decode_entities
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# decode_entities -- Decode HTML entities #
|
7
|
+
# [A component of cmess, the encoding tool-box] #
|
8
|
+
# #
|
9
|
+
# Copyright (C) 2007 University of Cologne, #
|
10
|
+
# Albertus-Magnus-Platz, #
|
11
|
+
# 50932 Cologne, Germany #
|
12
|
+
# #
|
13
|
+
# Authors: #
|
14
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
15
|
+
# #
|
16
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
17
|
+
# terms of the GNU General Public License as published by the Free Software #
|
18
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
19
|
+
# version. #
|
20
|
+
# #
|
21
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
22
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
23
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
24
|
+
# details. #
|
25
|
+
# #
|
26
|
+
# You should have received a copy of the GNU General Public License along #
|
27
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
28
|
+
# #
|
29
|
+
###############################################################################
|
30
|
+
#++
|
31
|
+
|
32
|
+
require 'optparse'
|
33
|
+
|
34
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
35
|
+
|
36
|
+
require 'cmess'
|
37
|
+
require 'cmess/decode_entities'
|
38
|
+
require 'cmess/cli'
|
39
|
+
|
40
|
+
include CMess::CLI
|
41
|
+
|
42
|
+
PROGNAME = File.basename($0)
|
43
|
+
|
44
|
+
options = {
|
45
|
+
:input => STDIN,
|
46
|
+
:output => STDOUT,
|
47
|
+
:source_encoding => CMess::DecodeEntities::INTERMEDIATE_ENCODING,
|
48
|
+
:target_encoding => nil
|
49
|
+
}
|
50
|
+
|
51
|
+
OptionParser.new { |opts|
|
52
|
+
opts.banner = "Usage: #{$0} [options]"
|
53
|
+
|
54
|
+
opts.separator ''
|
55
|
+
opts.separator 'Options:'
|
56
|
+
|
57
|
+
opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
|
58
|
+
options[:input] = open_file_or_std(f)
|
59
|
+
}
|
60
|
+
|
61
|
+
opts.on('-o', '--output FILE', "Output file to write to [Default: STDOUT]") { |f|
|
62
|
+
options[:output] = open_file_or_std(f, 'w')
|
63
|
+
}
|
64
|
+
|
65
|
+
opts.on('-I', '--in-place FILE', "Modify file in-place; sets '-i' and '-o'") { |f|
|
66
|
+
options[:input], options[:output] = open_file_in_place(f)
|
67
|
+
}
|
68
|
+
|
69
|
+
opts.separator ''
|
70
|
+
|
71
|
+
opts.on('-e', '--source-encoding ENCODING', "Encoding of input file [Default: #{options[:source_encoding].upcase}]") { |e|
|
72
|
+
options[:source_encoding] = e.downcase
|
73
|
+
}
|
74
|
+
|
75
|
+
opts.on('-t', '--target-encoding ENCODING', "Desired encoding for output file [Default: <source_encoding>]") { |e|
|
76
|
+
options[:target_encoding] = e.downcase
|
77
|
+
}
|
78
|
+
|
79
|
+
opts.separator ''
|
80
|
+
opts.separator 'Generic options:'
|
81
|
+
|
82
|
+
opts.on('-h', '--help', "Print this help message and exit") {
|
83
|
+
puts opts
|
84
|
+
exit
|
85
|
+
}
|
86
|
+
|
87
|
+
opts.on('--version', "Print program version and exit") {
|
88
|
+
puts "#{PROGNAME} v#{CMess::DecodeEntities::VERSION} (part of cmess v#{CMess::VERSION})"
|
89
|
+
exit
|
90
|
+
}
|
91
|
+
|
92
|
+
opts.separator ''
|
93
|
+
opts.separator "When FILE is -, either STDIN or STDOUT is used (as appropriate)."
|
94
|
+
}.parse!
|
95
|
+
|
96
|
+
CMess::DecodeEntities.decode(
|
97
|
+
options[:input],
|
98
|
+
options[:output],
|
99
|
+
options[:source_encoding],
|
100
|
+
options[:target_encoding]
|
101
|
+
)
|
data/bin/guess_encoding
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# guess_encoding -- Assist with guessing the encoding of some input at hand #
|
7
|
+
# [A component of cmess, the encoding tool-box] #
|
8
|
+
# #
|
9
|
+
# Copyright (C) 2007 University of Cologne, #
|
10
|
+
# Albertus-Magnus-Platz, #
|
11
|
+
# 50932 Cologne, Germany #
|
12
|
+
# #
|
13
|
+
# Authors: #
|
14
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
15
|
+
# #
|
16
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
17
|
+
# terms of the GNU General Public License as published by the Free Software #
|
18
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
19
|
+
# version. #
|
20
|
+
# #
|
21
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
22
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
23
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
24
|
+
# details. #
|
25
|
+
# #
|
26
|
+
# You should have received a copy of the GNU General Public License along #
|
27
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
28
|
+
# #
|
29
|
+
###############################################################################
|
30
|
+
#++
|
31
|
+
|
32
|
+
require 'optparse'
|
33
|
+
|
34
|
+
require 'rubygems'
|
35
|
+
require 'nuggets/string/word_wrap'
|
36
|
+
|
37
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
38
|
+
|
39
|
+
require 'cmess'
|
40
|
+
require 'cmess/guess_encoding'
|
41
|
+
require 'cmess/cli'
|
42
|
+
|
43
|
+
include CMess::CLI
|
44
|
+
|
45
|
+
PROGNAME = File.basename($0)
|
46
|
+
|
47
|
+
# short-cut
|
48
|
+
CGE = CMess::GuessEncoding
|
49
|
+
|
50
|
+
# how to split list of encodings
|
51
|
+
SPLIT_ENCODING_LIST_RE = /\s*[,\s]\s*/  # one comma or whitespace character plus any whitespace around it (no interpolation, so no /o needed)
|
52
|
+
|
53
|
+
options = {
|
54
|
+
:input => STDIN,
|
55
|
+
:line => 1,
|
56
|
+
:encodings => nil,
|
57
|
+
:additional_encodings => [],
|
58
|
+
:target_encoding => determine_system_encoding,
|
59
|
+
:guess => false,
|
60
|
+
:chunk_size => nil,
|
61
|
+
:ignore_bom => false
|
62
|
+
}
|
63
|
+
|
64
|
+
OptionParser.new(nil, 40) { |opts|
|
65
|
+
opts.banner = "Usage: #{$0} [options]"
|
66
|
+
|
67
|
+
opts.separator ''
|
68
|
+
opts.separator 'Options:'
|
69
|
+
|
70
|
+
opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
|
71
|
+
options[:input] = open_file_or_std(f)
|
72
|
+
}
|
73
|
+
|
74
|
+
opts.separator ''
|
75
|
+
opts.separator ' * Manual guessing'
|
76
|
+
opts.separator ''
|
77
|
+
|
78
|
+
opts.on('-l', '--line LINE', "Line number of input file to use for testing [Default: #{options[:line]}]") { |l|
|
79
|
+
options[:line] = l.to_i
|
80
|
+
|
81
|
+
unless options[:line] > 0
|
82
|
+
options[:input].read # prevent 'Broken pipe' error
|
83
|
+
abort "Line number must be greater than 0!"
|
84
|
+
end
|
85
|
+
}
|
86
|
+
|
87
|
+
opts.separator ''
|
88
|
+
|
89
|
+
opts.on('-e', '--encodings ENCODINGS...', "List of encodings to try >instead of< default (see below)") { |e|
|
90
|
+
options[:encodings] ||= []
|
91
|
+
options[:encodings] += e.split(SPLIT_ENCODING_LIST_RE)
|
92
|
+
}
|
93
|
+
|
94
|
+
opts.on('-a', '--additional-encodings ENCODINGS...', "List of encodings to try >in addition to< default (see below)") { |e|
|
95
|
+
options[:additional_encodings] += e.split(SPLIT_ENCODING_LIST_RE)
|
96
|
+
}
|
97
|
+
|
98
|
+
opts.separator ''
|
99
|
+
|
100
|
+
opts.on('-t', '--target-encoding ENCODING', "Target encoding of your system [Default: #{options[:target_encoding]}]") { |e|
|
101
|
+
options[:target_encoding] = e
|
102
|
+
}
|
103
|
+
|
104
|
+
opts.separator ''
|
105
|
+
opts.separator ' * Automatic guessing'
|
106
|
+
opts.separator ''
|
107
|
+
|
108
|
+
opts.on('-g', '--guess', "Actually guess the encoding of the input, automatically!", "(see below for a list of supported encodings)") {
|
109
|
+
options[:guess] = true
|
110
|
+
}
|
111
|
+
|
112
|
+
opts.on('-c', '--chunk-size SIZE', Integer, "Size of chunks input will be read in until a valid encoding", "has been found; by default the whole file will be read") { |s|
|
113
|
+
options[:chunk_size] = s
|
114
|
+
}
|
115
|
+
|
116
|
+
opts.separator ''
|
117
|
+
|
118
|
+
opts.on('-b', '--ignore-bom', "Ignore detected BOM (if any)", "(see below for a list of supported encodings)") {
|
119
|
+
options[:ignore_bom] = true
|
120
|
+
}
|
121
|
+
|
122
|
+
opts.separator ''
|
123
|
+
opts.separator 'Generic options:'
|
124
|
+
|
125
|
+
opts.on('-h', '--help', "Print this help message and exit") {
|
126
|
+
puts opts
|
127
|
+
exit
|
128
|
+
}
|
129
|
+
|
130
|
+
opts.on('--version', "Print program version and exit") {
|
131
|
+
puts "#{PROGNAME} v#{CGE::VERSION} (part of cmess v#{CMess::VERSION})"
|
132
|
+
exit
|
133
|
+
}
|
134
|
+
|
135
|
+
opts.separator ''
|
136
|
+
opts.separator 'Default encodings for manual guessing:'
|
137
|
+
CGE::Manual::ENCODINGS.join(', ').word_wrap(110, true).each { |l|
|
138
|
+
opts.separator l
|
139
|
+
}
|
140
|
+
|
141
|
+
opts.separator ''
|
142
|
+
opts.separator 'Likely candidates for additional testing:'
|
143
|
+
CGE::Manual::CANDIDATES.join(', ').word_wrap(110, true).each { |l|
|
144
|
+
opts.separator l
|
145
|
+
}
|
146
|
+
|
147
|
+
opts.separator ''
|
148
|
+
opts.separator 'Supported encodings for automatic guessing (will be tried in that order):'
|
149
|
+
CGE::Automatic.supported_encodings.join(', ').word_wrap(110, true).each { |l|
|
150
|
+
opts.separator l
|
151
|
+
}
|
152
|
+
|
153
|
+
opts.separator ''
|
154
|
+
opts.separator 'Supported encodings for BOM detection (will be tried in that order):'
|
155
|
+
CGE::Automatic.supported_boms.join(', ').word_wrap(110, true).each { |l|
|
156
|
+
opts.separator l
|
157
|
+
}
|
158
|
+
|
159
|
+
opts.separator ''
|
160
|
+
opts.separator "When FILE is -, STDIN is used."
|
161
|
+
}.parse!
|
162
|
+
|
163
|
+
if options[:guess]
|
164
|
+
puts CGE::Automatic.guess(options[:input], options[:chunk_size], options[:ignore_bom])
|
165
|
+
else
|
166
|
+
options[:target_encoding].call if options[:target_encoding].respond_to?(:call)  # NOTE(review): the default from determine_system_encoding can apparently be a callable instead of an encoding name; presumably calling it reports the failure and exits -- confirm in CMess::CLI
|
167
|
+
|
168
|
+
# reset line counter
|
169
|
+
$. = 0
|
170
|
+
# Pick the requested line: break hands back that line; if the loop runs out, input receives each's own return value instead
|
171
|
+
input = options[:input].each { |line|
|
172
|
+
break line if $. == options[:line]
|
173
|
+
}
|
174
|
+
abort "Input was empty!" if $..zero?  # $. was reset to 0 above and is still 0, so no line was read
|
175
|
+
abort "Line not found -- input has only #{$.} line#{'s' if $. != 1}" unless input.is_a?(String)
|
176
|
+
|
177
|
+
CGE::Manual.display(
|
178
|
+
input,
|
179
|
+
options[:target_encoding],
|
180
|
+
options[:encodings],
|
181
|
+
options[:additional_encodings]
|
182
|
+
)
|
183
|
+
end
|