cmess 0.0.4.136

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ChangeLog ADDED
@@ -0,0 +1,6 @@
1
+ = Revision history for cmess
2
+
3
+ == 0.0.3 [2007-12-12]
4
+
5
+ * Added automatic encoding detection to GuessEncoding. Idea and original
6
+ implementation provided by John. Thanks :-)
data/README ADDED
@@ -0,0 +1,53 @@
1
+ = cmess - Assist with messed up encodings
2
+
3
+ == VERSION
4
+
5
+ This documentation refers to cmess version 0.0.3
6
+
7
+
8
+ == DESCRIPTION
9
+
10
+ CMess bundles several tools under its hood that aim at dealing with various
11
+ problems occurring in the context of character sets and encodings. Currently,
12
+ there are:
13
+
14
+ guess_encoding:: Simple helper to identify the encoding of a given string.
15
+ Includes the ability to automatically detect the encoding
16
+ of an input.
17
+ cinderella:: When characters are "double encoded", you can't easily
18
+ convert them back -- this is where cinderella comes in,
19
+ sorting the good ones into the pot and the (potentially)
20
+ bad ones into the crop...
21
+ decode_entities:: Decode HTML entities in a string.
22
+
23
+ TODO: well, more of the description... ;-)
24
+
25
+
26
+ == AUTHORS
27
+
28
+ * Jens Wille <mailto:jens.wille@uni-koeln.de>
29
+
30
+
31
+ == CREDITS
32
+
33
+ * John Vorhauer <mailto:john@vorhauer.de> for the idea and
34
+ original implementation of the automatic encoding guesser
35
+ (see CMess::GuessEncoding::Guesser).
36
+
37
+
38
+ == LICENSE AND COPYRIGHT
39
+
40
+ Copyright (C) 2007 University of Cologne,
41
+ Albertus-Magnus-Platz, 50932 Cologne, Germany
42
+
43
+ cmess is free software: you can redistribute it and/or modify it under the
44
+ terms of the GNU General Public License as published by the Free Software
45
+ Foundation, either version 3 of the License, or (at your option) any later
46
+ version.
47
+
48
+ cmess is distributed in the hope that it will be useful, but WITHOUT ANY
49
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
50
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
51
+
52
+ You should have received a copy of the GNU General Public License along with
53
+ cmess. If not, see <http://www.gnu.org/licenses/>.
data/Rakefile ADDED
@@ -0,0 +1,30 @@
1
+ # Utilizes global rake-tasks: alias rake="rake -r rake -R /path/to/rakelibdir"
2
+ # (Base tasks at <http://prometheus.khi.uni-koeln.de/svn/scratch/rake-tasks/>)
3
+
4
+ $:.unshift('lib')
5
+
6
+ require 'cmess'
7
+
8
+ FILES = FileList['lib/**/*.rb'].to_a
9
+ EXECS = FileList['bin/*'].to_a
10
+ RDOCS = %w[README COPYING ChangeLog]
11
+ OTHER = FileList['[A-Z]*', 'example/**/*', 'data/**/*'].to_a
12
+
13
+ task(:doc_spec) {{
14
+ :title => 'cmess Application documentation',
15
+ :rdoc_files => RDOCS + FILES
16
+ }}
17
+
18
+ task(:gem_spec) {{
19
+ :name => 'cmess',
20
+ :version => CMess::VERSION,
21
+ :summary => "Assist with handling messed up encodings " <<
22
+ "(Currently includes the following tools: " <<
23
+ "#{EXECS.map { |e| File.basename(e) }.join(', ')})",
24
+ :files => FILES + EXECS + OTHER,
25
+ :require_path => 'lib',
26
+ :bindir => 'bin',
27
+ :executables => EXECS,
28
+ :extra_rdoc_files => RDOCS,
29
+ :dependencies => %w[ruby-nuggets htmlentities]
30
+ }}
data/bin/cinderella ADDED
@@ -0,0 +1,186 @@
1
+ #! /usr/bin/ruby
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # cinderella -- Handle double encoded characters #
7
+ # [A component of cmess, the encoding tool-box] #
8
+ # #
9
+ # Copyright (C) 2007 University of Cologne, #
10
+ # Albertus-Magnus-Platz, #
11
+ # 50932 Cologne, Germany #
12
+ # #
13
+ # Authors: #
14
+ # Jens Wille <jens.wille@uni-koeln.de> #
15
+ # #
16
+ # cmess is free software; you can redistribute it and/or modify it under the #
17
+ # terms of the GNU General Public License as published by the Free Software #
18
+ # Foundation; either version 3 of the License, or (at your option) any later #
19
+ # version. #
20
+ # #
21
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
22
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
23
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
24
+ # details. #
25
+ # #
26
+ # You should have received a copy of the GNU General Public License along #
27
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
28
+ # #
29
+ ###############################################################################
30
+ #++
31
+
32
+ require 'optparse'
33
+ require 'yaml'
34
+
35
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
36
+
37
+ require 'cmess'
38
+ require 'cmess/cinderella'
39
+ require 'cmess/cli'
40
+
41
+ include CMess::CLI
42
+
43
+ PROGNAME = File.basename($0)
44
+
45
+ options = {
46
+ :input => STDIN,
47
+ :output => STDOUT,
48
+ :pot => nil,
49
+ :crop => nil,
50
+ :source_encoding => nil,
51
+ :target_encoding => determine_system_encoding,
52
+ :csets => [
53
+ File.expand_path(File.join(File.dirname(__FILE__), '..', 'data', 'csets'))
54
+ ],
55
+ :repair => false
56
+ }
57
+
58
+ OptionParser.new(nil, 40) { |opts|
59
+ opts.banner = "Usage: #{$0} [options]"
60
+
61
+ opts.separator ''
62
+ opts.separator 'Options:'
63
+
64
+ opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
65
+ options[:input] = open_file_or_std(f)
66
+ }
67
+
68
+ opts.separator ''
69
+
70
+ opts.on('-p', '--pot FILE', "The good into the pot...") { |f|
71
+ options[:pot] = open_file_or_std(f, 'w')
72
+ }
73
+
74
+ opts.on('-c', '--crop FILE', "...the bad into the crop") { |f|
75
+ options[:crop] = open_file_or_std(f, 'w')
76
+ }
77
+
78
+ opts.separator ''
79
+
80
+ opts.on('-o', '--output [FILE]', "Write both good and bad lines to FILE or", "default [Default: STDOUT] (Particularly", "useful in combination with the '-r' option)") { |f|
81
+ options[:output] = open_file_or_std(f, 'w') if f
82
+
83
+ options[:pot] = options[:output]
84
+ options[:crop] = options[:output]
85
+ }
86
+
87
+ opts.separator ''
88
+
89
+ opts.on('-I', '--in-place FILE', "Modify file in-place; sets '-i' and '-o'", "(Only really useful in combination with", "the '-r' option)") { |f|
90
+ options[:input], options[:output] = open_file_in_place(f)
91
+
92
+ options[:pot] = options[:output]
93
+ options[:crop] = options[:output]
94
+ }
95
+
96
+ opts.separator ''
97
+
98
+ opts.on('-e', '--source-encoding ENCODING', "Source encoding (from) [REQUIRED]") { |e|
99
+ options[:source_encoding] = e
100
+ }
101
+
102
+ opts.on('-t', '--target-encoding ENCODING', "Target encoding (to); see '-l' for a list", "of available encodings [Default: #{options[:target_encoding]}]") { |e|
103
+ options[:target_encoding] = e
104
+ }
105
+
106
+ opts.separator ''
107
+
108
+ opts.on('-T', '--addtl-target-encodings DIRECTORY', "Directory providing additional char files", "for target encoding") { |d|
109
+ ensure_directory(d)
110
+
111
+ options[:csets] |= [File.expand_path(d)]
112
+ }
113
+
114
+ opts.separator ''
115
+
116
+ opts.on('-l', '--list-encodings', "Display a list of available target encodings", "and exit; see '-T' on how to add your own") {
117
+ csets = options[:csets].inject({}) { |hash, cset|
118
+ encodings = Dir[File.join(cset, '*.yaml')].sort.map { |yaml|
119
+ File.basename(yaml, '.yaml') unless File.symlink?(yaml)
120
+ }.compact
121
+
122
+ hash[cset] = encodings unless encodings.empty?
123
+ hash
124
+ }
125
+
126
+ if csets.empty?
127
+ puts "No target encodings available for #{PROGNAME}"
128
+ else
129
+ puts "Available target encodings for #{PROGNAME}:"
130
+ csets.each { |cset, encodings|
131
+ puts "[#{cset}]"
132
+ encodings.each { |encoding|
133
+ puts " - #{encoding}"
134
+ }
135
+ }
136
+ end
137
+
138
+ exit
139
+ }
140
+
141
+ opts.separator ''
142
+
143
+ opts.on('-r', '--repair', "Try to repair corrupted characters") {
144
+ options[:repair] = true
145
+ }
146
+
147
+ opts.separator ''
148
+ opts.separator 'Generic options:'
149
+
150
+ opts.on('-h', '--help', "Print this help message and exit") {
151
+ puts opts
152
+ exit
153
+ }
154
+
155
+ opts.on('--version', "Print program version and exit") {
156
+ puts "#{PROGNAME} v#{CMess::Cinderella::VERSION} (part of cmess v#{CMess::VERSION})"
157
+ exit
158
+ }
159
+
160
+ opts.separator ''
161
+ opts.separator "If '-p' or '-c' is omitted, and '-o' is not given either, that particular output"
162
+ opts.separator "is ignored. When FILE is -, either STDIN or STDOUT is used (as appropriate)."
163
+ }.parse!
164
+
165
+ options[:target_encoding].call if options[:target_encoding].respond_to?(:call)
166
+
167
+ abort "No source encoding given! (Use the '-e' switch to do so; see '--help' for more information)" \
168
+ unless options[:source_encoding]
169
+
170
+ yaml_file = "#{options[:target_encoding].downcase}.yaml"
171
+ char_file = options[:csets].inject(nil) { |path, cset|
172
+ path = File.join(cset, yaml_file)
173
+ break path if File.readable?(path)
174
+ }
175
+ abort "Char file not found for target encoding: #{options[:target_encoding]}" \
176
+ unless char_file
177
+
178
+ CMess::Cinderella.pick(
179
+ options[:input],
180
+ options[:pot],
181
+ options[:crop],
182
+ options[:source_encoding],
183
+ options[:target_encoding],
184
+ YAML.load_file(char_file),
185
+ options[:repair]
186
+ )
data/bin/decode_entities ADDED
@@ -0,0 +1,101 @@
1
+ #! /usr/bin/ruby
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # decode_entities -- Decode HTML entities #
7
+ # [A component of cmess, the encoding tool-box] #
8
+ # #
9
+ # Copyright (C) 2007 University of Cologne, #
10
+ # Albertus-Magnus-Platz, #
11
+ # 50932 Cologne, Germany #
12
+ # #
13
+ # Authors: #
14
+ # Jens Wille <jens.wille@uni-koeln.de> #
15
+ # #
16
+ # cmess is free software; you can redistribute it and/or modify it under the #
17
+ # terms of the GNU General Public License as published by the Free Software #
18
+ # Foundation; either version 3 of the License, or (at your option) any later #
19
+ # version. #
20
+ # #
21
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
22
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
23
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
24
+ # details. #
25
+ # #
26
+ # You should have received a copy of the GNU General Public License along #
27
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
28
+ # #
29
+ ###############################################################################
30
+ #++
31
+
32
+ require 'optparse'
33
+
34
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
35
+
36
+ require 'cmess'
37
+ require 'cmess/decode_entities'
38
+ require 'cmess/cli'
39
+
40
+ include CMess::CLI
41
+
42
+ PROGNAME = File.basename($0)
43
+
44
+ options = {
45
+ :input => STDIN,
46
+ :output => STDOUT,
47
+ :source_encoding => CMess::DecodeEntities::INTERMEDIATE_ENCODING,
48
+ :target_encoding => nil
49
+ }
50
+
51
+ OptionParser.new { |opts|
52
+ opts.banner = "Usage: #{$0} [options]"
53
+
54
+ opts.separator ''
55
+ opts.separator 'Options:'
56
+
57
+ opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
58
+ options[:input] = open_file_or_std(f)
59
+ }
60
+
61
+ opts.on('-o', '--output FILE', "Output file to write to [Default: STDOUT]") { |f|
62
+ options[:output] = open_file_or_std(f, 'w')
63
+ }
64
+
65
+ opts.on('-I', '--in-place FILE', "Modify file in-place; sets '-i' and '-o'") { |f|
66
+ options[:input], options[:output] = open_file_in_place(f)
67
+ }
68
+
69
+ opts.separator ''
70
+
71
+ opts.on('-e', '--source-encoding ENCODING', "Encoding of input file [Default: #{options[:source_encoding].upcase}]") { |e|
72
+ options[:source_encoding] = e.downcase
73
+ }
74
+
75
+ opts.on('-t', '--target-encoding ENCODING', "Desired encoding for output file [Default: <source_encoding>]") { |e|
76
+ options[:target_encoding] = e.downcase
77
+ }
78
+
79
+ opts.separator ''
80
+ opts.separator 'Generic options:'
81
+
82
+ opts.on('-h', '--help', "Print this help message and exit") {
83
+ puts opts
84
+ exit
85
+ }
86
+
87
+ opts.on('--version', "Print program version and exit") {
88
+ puts "#{PROGNAME} v#{CMess::DecodeEntities::VERSION} (part of cmess v#{CMess::VERSION})"
89
+ exit
90
+ }
91
+
92
+ opts.separator ''
93
+ opts.separator "When FILE is -, either STDIN or STDOUT is used (as appropriate)."
94
+ }.parse!
95
+
96
+ CMess::DecodeEntities.decode(
97
+ options[:input],
98
+ options[:output],
99
+ options[:source_encoding],
100
+ options[:target_encoding]
101
+ )
data/bin/guess_encoding ADDED
@@ -0,0 +1,183 @@
1
+ #! /usr/bin/ruby
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # guess_encoding -- Assist with guessing the encoding of some input at hand #
7
+ # [A component of cmess, the encoding tool-box] #
8
+ # #
9
+ # Copyright (C) 2007 University of Cologne, #
10
+ # Albertus-Magnus-Platz, #
11
+ # 50932 Cologne, Germany #
12
+ # #
13
+ # Authors: #
14
+ # Jens Wille <jens.wille@uni-koeln.de> #
15
+ # #
16
+ # cmess is free software; you can redistribute it and/or modify it under the #
17
+ # terms of the GNU General Public License as published by the Free Software #
18
+ # Foundation; either version 3 of the License, or (at your option) any later #
19
+ # version. #
20
+ # #
21
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
22
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
23
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
24
+ # details. #
25
+ # #
26
+ # You should have received a copy of the GNU General Public License along #
27
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
28
+ # #
29
+ ###############################################################################
30
+ #++
31
+
32
+ require 'optparse'
33
+
34
+ require 'rubygems'
35
+ require 'nuggets/string/word_wrap'
36
+
37
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
38
+
39
+ require 'cmess'
40
+ require 'cmess/guess_encoding'
41
+ require 'cmess/cli'
42
+
43
+ include CMess::CLI
44
+
45
+ PROGNAME = File.basename($0)
46
+
47
+ # short-cut
48
+ CGE = CMess::GuessEncoding
49
+
50
+ # how to split list of encodings
51
+ SPLIT_ENCODING_LIST_RE = /\s*[,\s]\s*/o
52
+
53
+ options = {
54
+ :input => STDIN,
55
+ :line => 1,
56
+ :encodings => nil,
57
+ :additional_encodings => [],
58
+ :target_encoding => determine_system_encoding,
59
+ :guess => false,
60
+ :chunk_size => nil,
61
+ :ignore_bom => false
62
+ }
63
+
64
+ OptionParser.new(nil, 40) { |opts|
65
+ opts.banner = "Usage: #{$0} [options]"
66
+
67
+ opts.separator ''
68
+ opts.separator 'Options:'
69
+
70
+ opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
71
+ options[:input] = open_file_or_std(f)
72
+ }
73
+
74
+ opts.separator ''
75
+ opts.separator ' * Manual guessing'
76
+ opts.separator ''
77
+
78
+ opts.on('-l', '--line LINE', "Line number of input file to use for testing [Default: #{options[:line]}]") { |l|
79
+ options[:line] = l.to_i # String#to_i yields 0 for non-numeric input, which is rejected below
80
+
81
+ unless options[:line] > 0 # only positive line numbers are accepted
82
+ options[:input].read # prevent 'Broken pipe' error
83
+ abort "Line number must be greater than 0!"
84
+ end
85
+ }
86
+
87
+ opts.separator ''
88
+
89
+ opts.on('-e', '--encodings ENCODINGS...', "List of encodings to try >instead of< default (see below)") { |e|
90
+ options[:encodings] ||= []
91
+ options[:encodings] += e.split(SPLIT_ENCODING_LIST_RE)
92
+ }
93
+
94
+ opts.on('-a', '--additional-encodings ENCODINGS...', "List of encodings to try >in addition to< default (see below)") { |e|
95
+ options[:additional_encodings] += e.split(SPLIT_ENCODING_LIST_RE)
96
+ }
97
+
98
+ opts.separator ''
99
+
100
+ opts.on('-t', '--target-encoding ENCODING', "Target encoding of your system [Default: #{options[:target_encoding]}]") { |e|
101
+ options[:target_encoding] = e
102
+ }
103
+
104
+ opts.separator ''
105
+ opts.separator ' * Automatic guessing'
106
+ opts.separator ''
107
+
108
+ opts.on('-g', '--guess', "Actually guess the encoding of the input, automatically!", "(see below for a list of supported encodings)") {
109
+ options[:guess] = true
110
+ }
111
+
112
+ opts.on('-c', '--chunk-size SIZE', Integer, "Size of chunks input will be read in until a valid encoding", "has been found; by default the whole file will be read") { |s|
113
+ options[:chunk_size] = s
114
+ }
115
+
116
+ opts.separator ''
117
+
118
+ opts.on('-b', '--ignore-bom', "Ignore detected BOM (if any)", "(see below for a list of supported encodings)") {
119
+ options[:ignore_bom] = true
120
+ }
121
+
122
+ opts.separator ''
123
+ opts.separator 'Generic options:'
124
+
125
+ opts.on('-h', '--help', "Print this help message and exit") {
126
+ puts opts
127
+ exit
128
+ }
129
+
130
+ opts.on('--version', "Print program version and exit") {
131
+ puts "#{PROGNAME} v#{CGE::VERSION} (part of cmess v#{CMess::VERSION})"
132
+ exit
133
+ }
134
+
135
+ opts.separator ''
136
+ opts.separator 'Default encodings for manual guessing:'
137
+ CGE::Manual::ENCODINGS.join(', ').word_wrap(110, true).each { |l|
138
+ opts.separator l
139
+ }
140
+
141
+ opts.separator ''
142
+ opts.separator 'Likely candidates for additional testing:'
143
+ CGE::Manual::CANDIDATES.join(', ').word_wrap(110, true).each { |l|
144
+ opts.separator l
145
+ }
146
+
147
+ opts.separator ''
148
+ opts.separator 'Supported encodings for automatic guessing (will be tried in that order):'
149
+ CGE::Automatic.supported_encodings.join(', ').word_wrap(110, true).each { |l|
150
+ opts.separator l
151
+ }
152
+
153
+ opts.separator ''
154
+ opts.separator 'Supported encodings for BOM detection (will be tried in that order):'
155
+ CGE::Automatic.supported_boms.join(', ').word_wrap(110, true).each { |l|
156
+ opts.separator l
157
+ }
158
+
159
+ opts.separator ''
160
+ opts.separator "When FILE is -, STDIN is used."
161
+ }.parse!
162
+
163
+ if options[:guess]
164
+ puts CGE::Automatic.guess(options[:input], options[:chunk_size], options[:ignore_bom])
165
+ else
166
+ options[:target_encoding].call if options[:target_encoding].respond_to?(:call)
167
+
168
+ # reset line counter
169
+ $. = 0
170
+
171
+ input = options[:input].each { |line|
172
+ break line if $. == options[:line]
173
+ }
174
+ abort "Input was empty!" if $..zero?
175
+ abort "Line not found -- input has only #{$.} line#{'s' if $. != 1}" unless input.is_a?(String)
176
+
177
+ CGE::Manual.display(
178
+ input,
179
+ options[:target_encoding],
180
+ options[:encodings],
181
+ options[:additional_encodings]
182
+ )
183
+ end