cmess 0.0.4.136

Sign up to get free protection for your applications and to get access to all the features.
data/ChangeLog ADDED
@@ -0,0 +1,6 @@
1
+ = Revision history for cmess
2
+
3
+ == 0.0.3 [2007-12-12]
4
+
5
+ * Added automatic encoding detection to GuessEncoding. Idea and original
6
+ implementation provided by John. Thanks :-)
data/README ADDED
@@ -0,0 +1,53 @@
1
+ = cmess - Assist with messed up encodings
2
+
3
+ == VERSION
4
+
5
+ This documentation refers to cmess version 0.0.3
6
+
7
+
8
+ == DESCRIPTION
9
+
10
+ CMess bundles several tools under its hood that aim at dealing with various
11
+ problems occurring in the context of character sets and encodings. Currently,
12
+ there are:
13
+
14
+ guess_encoding:: Simple helper to identify the encoding of a given string.
15
+ Includes the ability to automatically detect the encoding
16
+ of an input.
17
+ cinderella:: When characters are "double encoded", you can't easily
18
+ convert them back -- this is where cinderella comes in,
19
+ sorting the good ones into the pot and the (potentially)
20
+ bad ones into the crop...
21
+ decode_entities:: Decode HTML entities in a string.
22
+
23
+ TODO: well, more of the description... ;-)
24
+
25
+
26
+ == AUTHORS
27
+
28
+ * Jens Wille <mailto:jens.wille@uni-koeln.de>
29
+
30
+
31
+ == CREDITS
32
+
33
+ * John Vorhauer <mailto:john@vorhauer.de> for the idea and
34
+ original implementation of the automatic encoding guesser
35
+ (see CMess::GuessEncoding::Guesser).
36
+
37
+
38
+ == LICENSE AND COPYRIGHT
39
+
40
+ Copyright (C) 2007 University of Cologne,
41
+ Albertus-Magnus-Platz, 50932 Cologne, Germany
42
+
43
+ cmess is free software: you can redistribute it and/or modify it under the
44
+ terms of the GNU General Public License as published by the Free Software
45
+ Foundation, either version 3 of the License, or (at your option) any later
46
+ version.
47
+
48
+ cmess is distributed in the hope that it will be useful, but WITHOUT ANY
49
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
50
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
51
+
52
+ You should have received a copy of the GNU General Public License along with
53
+ cmess. If not, see <http://www.gnu.org/licenses/>.
data/Rakefile ADDED
@@ -0,0 +1,30 @@
1
+ # Utilizes global rake-tasks: alias rake="rake -r rake -R /path/to/rakelibdir"
2
+ # (Base tasks at <http://prometheus.khi.uni-koeln.de/svn/scratch/rake-tasks/>)
3
+
4
+ $:.unshift('lib')
5
+
6
+ require 'cmess'
7
+
8
+ FILES = FileList['lib/**/*.rb'].to_a
9
+ EXECS = FileList['bin/*'].to_a
10
+ RDOCS = %w[README COPYING ChangeLog]
11
+ OTHER = FileList['[A-Z]*', 'example/**/*', 'data/**/*'].to_a
12
+
13
+ task(:doc_spec) {{
14
+ :title => 'cmess Application documentation',
15
+ :rdoc_files => RDOCS + FILES
16
+ }}
17
+
18
+ task(:gem_spec) {{
19
+ :name => 'cmess',
20
+ :version => CMess::VERSION,
21
+ :summary => "Assist with handling messed up encodings " <<
22
+ "(Currently includes the following tools: " <<
23
+ "#{EXECS.map { |e| File.basename(e) }.join(', ')})",
24
+ :files => FILES + EXECS + OTHER,
25
+ :require_path => 'lib',
26
+ :bindir => 'bin',
27
+ :executables => EXECS,
28
+ :extra_rdoc_files => RDOCS,
29
+ :dependencies => %w[ruby-nuggets htmlentities]
30
+ }}
data/bin/cinderella ADDED
@@ -0,0 +1,186 @@
1
+ #! /usr/bin/ruby
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # cinderella -- Handle double encoded characters #
7
+ # [A component of cmess, the encoding tool-box] #
8
+ # #
9
+ # Copyright (C) 2007 University of Cologne, #
10
+ # Albertus-Magnus-Platz, #
11
+ # 50932 Cologne, Germany #
12
+ # #
13
+ # Authors: #
14
+ # Jens Wille <jens.wille@uni-koeln.de> #
15
+ # #
16
+ # cmess is free software; you can redistribute it and/or modify it under the #
17
+ # terms of the GNU General Public License as published by the Free Software #
18
+ # Foundation; either version 3 of the License, or (at your option) any later #
19
+ # version. #
20
+ # #
21
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
22
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
23
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
24
+ # details. #
25
+ # #
26
+ # You should have received a copy of the GNU General Public License along #
27
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
28
+ # #
29
+ ###############################################################################
30
+ #++
31
+
32
+ require 'optparse'
33
+ require 'yaml'
34
+
35
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
36
+
37
+ require 'cmess'
38
+ require 'cmess/cinderella'
39
+ require 'cmess/cli'
40
+
41
+ include CMess::CLI
42
+
43
+ PROGNAME = File.basename($0)
44
+
45
+ options = {
46
+ :input => STDIN,
47
+ :output => STDOUT,
48
+ :pot => nil,
49
+ :crop => nil,
50
+ :source_encoding => nil,
51
+ :target_encoding => determine_system_encoding,
52
+ :csets => [
53
+ File.expand_path(File.join(File.dirname(__FILE__), '..', 'data', 'csets'))
54
+ ],
55
+ :repair => false
56
+ }
57
+
58
+ OptionParser.new(nil, 40) { |opts|
59
+ opts.banner = "Usage: #{$0} [options]"
60
+
61
+ opts.separator ''
62
+ opts.separator 'Options:'
63
+
64
+ opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
65
+ options[:input] = open_file_or_std(f)
66
+ }
67
+
68
+ opts.separator ''
69
+
70
+ opts.on('-p', '--pot FILE', "The good into the pot...") { |f|
71
+ options[:pot] = open_file_or_std(f, 'w')
72
+ }
73
+
74
+ opts.on('-c', '--crop FILE', "...the bad into the crop") { |f|
75
+ options[:crop] = open_file_or_std(f, 'w')
76
+ }
77
+
78
+ opts.separator ''
79
+
80
+ opts.on('-o', '--output [FILE]', "Write both good and bad lines to FILE or", "default [Default: STDOUT] (Particularly", "useful in combination with the '-r' option)") { |f|
81
+ options[:output] = open_file_or_std(f, 'w') if f
82
+
83
+ options[:pot] = options[:output]
84
+ options[:crop] = options[:output]
85
+ }
86
+
87
+ opts.separator ''
88
+
89
+ opts.on('-I', '--in-place FILE', "Modify file in-place; sets '-i' and '-o'", "(Only really useful in combination with", "the '-r' option)") { |f|
90
+ options[:input], options[:output] = open_file_in_place(f)
91
+
92
+ options[:pot] = options[:output]
93
+ options[:crop] = options[:output]
94
+ }
95
+
96
+ opts.separator ''
97
+
98
+ opts.on('-e', '--source-encoding ENCODING', "Source encoding (from) [REQUIRED]") { |e|
99
+ options[:source_encoding] = e
100
+ }
101
+
102
+ opts.on('-t', '--target-encoding ENCODING', "Target encoding (to); see '-l' for a list", "of available encodings [Default: #{options[:target_encoding]}]") { |e|
103
+ options[:target_encoding] = e
104
+ }
105
+
106
+ opts.separator ''
107
+
108
+ opts.on('-T', '--addtl-target-encodings DIRECTORY', "Directory providing additional char files", "for target encoding") { |d|
109
+ ensure_directory(d)
110
+
111
+ options[:csets] |= [File.expand_path(d)]
112
+ }
113
+
114
+ opts.separator ''
115
+
116
+ opts.on('-l', '--list-encodings', "Display a list of available target encodings", "and exit; see '-T' on how to add your own") {
117
+ csets = options[:csets].inject({}) { |hash, cset|
118
+ encodings = Dir[File.join(cset, '*.yaml')].sort.map { |yaml|
119
+ File.basename(yaml, '.yaml') unless File.symlink?(yaml)
120
+ }.compact
121
+
122
+ hash[cset] = encodings unless encodings.empty?
123
+ hash
124
+ }
125
+
126
+ if csets.empty?
127
+ puts "No target encodings available for #{PROGNAME}"
128
+ else
129
+ puts "Available target encodings for #{PROGNAME}:"
130
+ csets.each { |cset, encodings|
131
+ puts "[#{cset}]"
132
+ encodings.each { |encoding|
133
+ puts " - #{encoding}"
134
+ }
135
+ }
136
+ end
137
+
138
+ exit
139
+ }
140
+
141
+ opts.separator ''
142
+
143
+ opts.on('-r', '--repair', "Try to repair corrupted characters") {
144
+ options[:repair] = true
145
+ }
146
+
147
+ opts.separator ''
148
+ opts.separator 'Generic options:'
149
+
150
+ opts.on('-h', '--help', "Print this help message and exit") {
151
+ puts opts
152
+ exit
153
+ }
154
+
155
+ opts.on('--version', "Print program version and exit") {
156
+ puts "#{PROGNAME} v#{CMess::Cinderella::VERSION} (part of cmess v#{CMess::VERSION})"
157
+ exit
158
+ }
159
+
160
+ opts.separator ''
161
+ opts.separator "If '-p' or '-c' is omitted, and '-o' is not given either, that particular output"
162
+ opts.separator "is ignored. When FILE is -, either STDIN or STDOUT is used (as appropriate)."
163
+ }.parse!
164
+
165
+ options[:target_encoding].call if options[:target_encoding].respond_to?(:call)
166
+
167
+ abort "No source encoding given! (Use the '-e' switch to do so; see '--help' for more information)" \
168
+ unless options[:source_encoding]
169
+
170
+ yaml_file = "#{options[:target_encoding].downcase}.yaml"
171
+ char_file = options[:csets].inject(nil) { |path, cset|
172
+ path = File.join(cset, yaml_file)
173
+ break path if File.readable?(path)
174
+ }
175
+ abort "Char file not found for target encoding: #{options[:target_encoding]}" \
176
+ unless char_file
177
+
178
+ CMess::Cinderella.pick(
179
+ options[:input],
180
+ options[:pot],
181
+ options[:crop],
182
+ options[:source_encoding],
183
+ options[:target_encoding],
184
+ YAML.load_file(char_file),
185
+ options[:repair]
186
+ )
@@ -0,0 +1,101 @@
1
+ #! /usr/bin/ruby
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # decode_entities -- Decode HTML entities #
7
+ # [A component of cmess, the encoding tool-box] #
8
+ # #
9
+ # Copyright (C) 2007 University of Cologne, #
10
+ # Albertus-Magnus-Platz, #
11
+ # 50932 Cologne, Germany #
12
+ # #
13
+ # Authors: #
14
+ # Jens Wille <jens.wille@uni-koeln.de> #
15
+ # #
16
+ # cmess is free software; you can redistribute it and/or modify it under the #
17
+ # terms of the GNU General Public License as published by the Free Software #
18
+ # Foundation; either version 3 of the License, or (at your option) any later #
19
+ # version. #
20
+ # #
21
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
22
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
23
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
24
+ # details. #
25
+ # #
26
+ # You should have received a copy of the GNU General Public License along #
27
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
28
+ # #
29
+ ###############################################################################
30
+ #++
31
+
32
+ require 'optparse'
33
+
34
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
35
+
36
+ require 'cmess'
37
+ require 'cmess/decode_entities'
38
+ require 'cmess/cli'
39
+
40
+ include CMess::CLI
41
+
42
+ PROGNAME = File.basename($0)
43
+
44
+ options = {
45
+ :input => STDIN,
46
+ :output => STDOUT,
47
+ :source_encoding => CMess::DecodeEntities::INTERMEDIATE_ENCODING,
48
+ :target_encoding => nil
49
+ }
50
+
51
+ OptionParser.new { |opts|
52
+ opts.banner = "Usage: #{$0} [options]"
53
+
54
+ opts.separator ''
55
+ opts.separator 'Options:'
56
+
57
+ opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
58
+ options[:input] = open_file_or_std(f)
59
+ }
60
+
61
+ opts.on('-o', '--output FILE', "Output file to write to [Default: STDOUT]") { |f|
62
+ options[:output] = open_file_or_std(f, 'w')
63
+ }
64
+
65
+ opts.on('-I', '--in-place FILE', "Modify file in-place; sets '-i' and '-o'") { |f|
66
+ options[:input], options[:output] = open_file_in_place(f)
67
+ }
68
+
69
+ opts.separator ''
70
+
71
+ opts.on('-e', '--source-encoding ENCODING', "Encoding of input file [Default: #{options[:source_encoding].upcase}]") { |e|
72
+ options[:source_encoding] = e.downcase
73
+ }
74
+
75
+ opts.on('-t', '--target-encoding ENCODING', "Desired encoding for output file [Default: <source_encoding>]") { |e|
76
+ options[:target_encoding] = e.downcase
77
+ }
78
+
79
+ opts.separator ''
80
+ opts.separator 'Generic options:'
81
+
82
+ opts.on('-h', '--help', "Print this help message and exit") {
83
+ puts opts
84
+ exit
85
+ }
86
+
87
+ opts.on('--version', "Print program version and exit") {
88
+ puts "#{PROGNAME} v#{CMess::DecodeEntities::VERSION} (part of cmess v#{CMess::VERSION})"
89
+ exit
90
+ }
91
+
92
+ opts.separator ''
93
+ opts.separator "When FILE is -, either STDIN or STDOUT is used (as appropriate)."
94
+ }.parse!
95
+
96
+ CMess::DecodeEntities.decode(
97
+ options[:input],
98
+ options[:output],
99
+ options[:source_encoding],
100
+ options[:target_encoding]
101
+ )
@@ -0,0 +1,183 @@
1
+ #! /usr/bin/ruby
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # guess_encoding -- Assist with guessing the encoding of some input at hand #
7
+ # [A component of cmess, the encoding tool-box] #
8
+ # #
9
+ # Copyright (C) 2007 University of Cologne, #
10
+ # Albertus-Magnus-Platz, #
11
+ # 50932 Cologne, Germany #
12
+ # #
13
+ # Authors: #
14
+ # Jens Wille <jens.wille@uni-koeln.de> #
15
+ # #
16
+ # cmess is free software; you can redistribute it and/or modify it under the #
17
+ # terms of the GNU General Public License as published by the Free Software #
18
+ # Foundation; either version 3 of the License, or (at your option) any later #
19
+ # version. #
20
+ # #
21
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
22
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
23
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
24
+ # details. #
25
+ # #
26
+ # You should have received a copy of the GNU General Public License along #
27
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
28
+ # #
29
+ ###############################################################################
30
+ #++
31
+
32
+ require 'optparse'
33
+
34
+ require 'rubygems'
35
+ require 'nuggets/string/word_wrap'
36
+
37
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
38
+
39
+ require 'cmess'
40
+ require 'cmess/guess_encoding'
41
+ require 'cmess/cli'
42
+
43
+ include CMess::CLI
44
+
45
+ PROGNAME = File.basename($0)
46
+
47
+ # short-cut
48
+ CGE = CMess::GuessEncoding
49
+
50
+ # how to split list of encodings
51
+ SPLIT_ENCODING_LIST_RE = /\s*[,\s]\s*/o
52
+
53
+ options = {
54
+ :input => STDIN,
55
+ :line => 1,
56
+ :encodings => nil,
57
+ :additional_encodings => [],
58
+ :target_encoding => determine_system_encoding,
59
+ :guess => false,
60
+ :chunk_size => nil,
61
+ :ignore_bom => false
62
+ }
63
+
64
+ OptionParser.new(nil, 40) { |opts|
65
+ opts.banner = "Usage: #{$0} [options]"
66
+
67
+ opts.separator ''
68
+ opts.separator 'Options:'
69
+
70
+ opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
71
+ options[:input] = open_file_or_std(f)
72
+ }
73
+
74
+ opts.separator ''
75
+ opts.separator ' * Manual guessing'
76
+ opts.separator ''
77
+
78
+ opts.on('-l', '--line LINE', "Line number of input file to use for testing [Default: #{options[:line]}]") { |l|
79
+ options[:line] = l.to_i # String#to_i yields 0 when there is no leading number; rejected below
80
+
81
+ unless options[:line] > 0
82
+ options[:input].read # prevent 'Broken pipe' error
83
+ abort "Line number must be greater than 0!"
84
+ end
85
+ }
86
+
87
+ opts.separator ''
88
+
89
+ opts.on('-e', '--encodings ENCODINGS...', "List of encodings to try >instead of< default (see below)") { |e|
90
+ options[:encodings] ||= []
91
+ options[:encodings] += e.split(SPLIT_ENCODING_LIST_RE)
92
+ }
93
+
94
+ opts.on('-a', '--additional-encodings ENCODINGS...', "List of encodings to try >in addition to< default (see below)") { |e|
95
+ options[:additional_encodings] += e.split(SPLIT_ENCODING_LIST_RE)
96
+ }
97
+
98
+ opts.separator ''
99
+
100
+ opts.on('-t', '--target-encoding ENCODING', "Target encoding of your system [Default: #{options[:target_encoding]}]") { |e|
101
+ options[:target_encoding] = e
102
+ }
103
+
104
+ opts.separator ''
105
+ opts.separator ' * Automatic guessing'
106
+ opts.separator ''
107
+
108
+ opts.on('-g', '--guess', "Actually guess the encoding of the input, automatically!", "(see below for a list of supported encodings)") {
109
+ options[:guess] = true
110
+ }
111
+
112
+ opts.on('-c', '--chunk-size SIZE', Integer, "Size of chunks input will be read in until a valid encoding", "has been found; by default the whole file will be read") { |s|
113
+ options[:chunk_size] = s
114
+ }
115
+
116
+ opts.separator ''
117
+
118
+ opts.on('-b', '--ignore-bom', "Ignore detected BOM (if any)", "(see below for a list of supported encodings)") {
119
+ options[:ignore_bom] = true
120
+ }
121
+
122
+ opts.separator ''
123
+ opts.separator 'Generic options:'
124
+
125
+ opts.on('-h', '--help', "Print this help message and exit") {
126
+ puts opts
127
+ exit
128
+ }
129
+
130
+ opts.on('--version', "Print program version and exit") {
131
+ puts "#{PROGNAME} v#{CGE::VERSION} (part of cmess v#{CMess::VERSION})"
132
+ exit
133
+ }
134
+
135
+ opts.separator ''
136
+ opts.separator 'Default encodings for manual guessing:'
137
+ CGE::Manual::ENCODINGS.join(', ').word_wrap(110, true).each { |l|
138
+ opts.separator l
139
+ }
140
+
141
+ opts.separator ''
142
+ opts.separator 'Likely candidates for additional testing:'
143
+ CGE::Manual::CANDIDATES.join(', ').word_wrap(110, true).each { |l|
144
+ opts.separator l
145
+ }
146
+
147
+ opts.separator ''
148
+ opts.separator 'Supported encodings for automatic guessing (will be tried in that order):'
149
+ CGE::Automatic.supported_encodings.join(', ').word_wrap(110, true).each { |l|
150
+ opts.separator l
151
+ }
152
+
153
+ opts.separator ''
154
+ opts.separator 'Supported encodings for BOM detection (will be tried in that order):'
155
+ CGE::Automatic.supported_boms.join(', ').word_wrap(110, true).each { |l|
156
+ opts.separator l
157
+ }
158
+
159
+ opts.separator ''
160
+ opts.separator "When FILE is -, STDIN is used."
161
+ }.parse!
162
+
163
+ if options[:guess]
164
+ puts CGE::Automatic.guess(options[:input], options[:chunk_size], options[:ignore_bom])
165
+ else
166
+ options[:target_encoding].call if options[:target_encoding].respond_to?(:call)
167
+
168
+ # reset line counter
169
+ $. = 0
170
+
171
+ input = options[:input].each { |line|
172
+ break line if $. == options[:line]
173
+ }
174
+ abort "Input was empty!" if $..zero?
175
+ abort "Line not found -- input has only #{$.} line#{'s' if $. != 1}" unless input.is_a?(String)
176
+
177
+ CGE::Manual.display(
178
+ input,
179
+ options[:target_encoding],
180
+ options[:encodings],
181
+ options[:additional_encodings]
182
+ )
183
+ end