blackwinter-cmess 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. data/COPYING +676 -0
  2. data/ChangeLog +54 -0
  3. data/README +63 -0
  4. data/Rakefile +51 -0
  5. data/bin/bconv +130 -0
  6. data/bin/cinderella +190 -0
  7. data/bin/decode_entities +106 -0
  8. data/bin/guess_encoding +223 -0
  9. data/data/chartab.yaml +26724 -0
  10. data/data/csets/iso_8859-1.yaml +195 -0
  11. data/data/csets/iso_8859-15.yaml +204 -0
  12. data/data/csets/latin1.yaml +195 -0
  13. data/data/csets/unicode/basic_latin.yaml +97 -0
  14. data/data/csets/unicode/cyrillic-supplement.yaml +17 -0
  15. data/data/csets/unicode/cyrillic.yaml +256 -0
  16. data/data/csets/unicode/greek.yaml +129 -0
  17. data/data/csets/unicode/ipa_extensions.yaml +97 -0
  18. data/data/csets/unicode/latin-extended-c.yaml +18 -0
  19. data/data/csets/unicode/latin-extended-d.yaml +3 -0
  20. data/data/csets/unicode/latin_1_supplement.yaml +128 -0
  21. data/data/csets/unicode/latin_extended_a.yaml +129 -0
  22. data/data/csets/unicode/latin_extended_additional.yaml +247 -0
  23. data/data/csets/unicode/latin_extended_b.yaml +209 -0
  24. data/data/csets/unicode/letterlike_symbols.yaml +80 -0
  25. data/data/csets/unicode/spacing_modifier_letters.yaml +81 -0
  26. data/data/csets/utf-8.yaml +1504 -0
  27. data/data/csets/utf8.yaml +1504 -0
  28. data/data/test_chars.yaml +14 -0
  29. data/example/cinderella/crop +127 -0
  30. data/example/cinderella/crop_repaired +127 -0
  31. data/example/cinderella/empty6-slash.txt +1495 -0
  32. data/example/cinderella/empty6-slash_repaired.txt +1495 -0
  33. data/example/cinderella/pot +1368 -0
  34. data/example/guess_encoding/check_results +60 -0
  35. data/example/guess_encoding/de.utf-8.txt +10030 -0
  36. data/example/guess_encoding/en.utf-8.txt +10030 -0
  37. data/example/guess_encoding/fr.utf-8.txt +10030 -0
  38. data/example/guess_encoding/it.utf-8.txt +10030 -0
  39. data/lib/cmess/bconv.rb +169 -0
  40. data/lib/cmess/cinderella.rb +66 -0
  41. data/lib/cmess/cli.rb +120 -0
  42. data/lib/cmess/decode_entities.rb +69 -0
  43. data/lib/cmess/guess_encoding/automatic.rb +343 -0
  44. data/lib/cmess/guess_encoding/encoding.rb +78 -0
  45. data/lib/cmess/guess_encoding/manual.rb +108 -0
  46. data/lib/cmess/guess_encoding.rb +61 -0
  47. data/lib/cmess/version.rb +51 -0
  48. data/lib/cmess.rb +49 -0
  49. metadata +136 -0
@@ -0,0 +1,223 @@
1
+ #! /usr/bin/ruby
2
+
3
+ #--
4
+ ###############################################################################
5
+ # #
6
+ # guess_encoding -- Assist with guessing the encoding of some input at hand #
7
+ # [A component of cmess, the encoding tool-box] #
8
+ # #
9
+ # Copyright (C) 2007-2008 University of Cologne, #
10
+ # Albertus-Magnus-Platz, #
11
+ # 50932 Cologne, Germany #
12
+ # #
13
+ # Authors: #
14
+ # Jens Wille <jens.wille@uni-koeln.de> #
15
+ # #
16
+ # cmess is free software; you can redistribute it and/or modify it under the #
17
+ # terms of the GNU General Public License as published by the Free Software #
18
+ # Foundation; either version 3 of the License, or (at your option) any later #
19
+ # version. #
20
+ # #
21
+ # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
22
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
23
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
24
+ # details. #
25
+ # #
26
+ # You should have received a copy of the GNU General Public License along #
27
+ # with cmess. If not, see <http://www.gnu.org/licenses/>. #
28
+ # #
29
+ ###############################################################################
30
+ #++
31
+
32
+ require 'optparse'
33
+
34
+ require 'rubygems'
35
+ require 'nuggets/string/word_wrap'
36
+
37
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
38
+
39
+ require 'cmess/guess_encoding'
40
+ require 'cmess/cli'
41
+
# Make the CMess::CLI helper methods callable at the top level of this script.
include CMess::CLI

# Base name of the running script, shown in the --version output.
PROGNAME = File.basename($0)

# short-cut
CGE = CMess::GuessEncoding

# Default settings. The option handlers further down overwrite individual
# entries; the '-i' handler additionally sets :input_set.
options = {
  # where to read from, and which line to show for manual guessing
  :input                => STDIN,
  :line                 => 1,

  # encodings to try (nil means no explicit '-e' list was given)
  :encodings            => nil,
  :additional_encodings => [],
  :target_encoding      => determine_system_encoding,

  # mode switches and settings for automatic guessing
  :manual               => false,
  :chunk_size           => nil,
  :ignore_bom           => false,

  # character codes given on the command line, and their numeric base
  :charcodes            => nil,
  :decimal              => false,
  :octal                => false
}
62
+
# Build the command-line parser and run it destructively over ARGV; every
# handler records its setting in the +options+ hash defined above.
OptionParser.new(nil, 40) { |opts|
  # Appends a blank line, a title, and a word-wrapped, comma-separated
  # rendering of +items+ to the help text.
  list_section = lambda { |title, items|
    opts.separator ''
    opts.separator title
    items.join(', ').word_wrap(110, true).each { |l|
      opts.separator l
    }
  }

  opts.banner = "Usage: #{$0} [options] [FILE...]"

  opts.separator ''
  opts.separator 'Options:'

  opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
    options[:input] = open_file_or_std(f)
    options[:input_set] = true
  }

  opts.separator ''
  opts.separator ' * Automatic guessing'
  opts.separator ''

  opts.on('-c', '--chunk-size SIZE', Integer, "Size of chunks input will be read in until a valid encoding", "has been found; by default the whole file will be read") { |s|
    options[:chunk_size] = s
  }

  opts.separator ''

  opts.on('-b', '--ignore-bom', "Ignore detected BOM (if any)", "(see below for a list of supported encodings)") {
    options[:ignore_bom] = true
  }

  opts.separator ''
  opts.separator ' * Manual guessing'
  opts.separator ''

  opts.on('-m', '--manual', "Present variously encoded input for manual encoding guessing") {
    options[:manual] = true
  }

  opts.separator ''

  opts.on('-l', '--line LINE', "Line number of input file to use for testing [Default: #{options[:line]}]") { |l|
    # String#to_i yields 0 for non-numeric input, which the check below rejects.
    options[:line] = l.to_i

    unless options[:line] > 0
      options[:input].read  # prevent 'Broken pipe' error
      abort "Line number must be greater than 0!"
    end
  }

  opts.separator ''

  opts.on('-e', '--encodings ENCODINGS...', "List of encodings to try >instead of< default (see below)") { |e|
    # The option may be repeated; the lists accumulate.
    options[:encodings] ||= []
    options[:encodings] += e.split(SPLIT_ARG_LIST_RE)
  }

  opts.on('-a', '--additional-encodings ENCODINGS...', "List of encodings to try >in addition to< default (see below)") { |e|
    options[:additional_encodings] += e.split(SPLIT_ARG_LIST_RE)
  }

  opts.separator ''

  opts.on('-t', '--target-encoding ENCODING', "Target encoding of your system [Default: #{options[:target_encoding]}]") { |e|
    options[:target_encoding] = e
  }

  opts.separator ''

  opts.on('--list-encodings', 'Print a list of all available encodings on your system and exit') {
    puts CGE::Encoding.all_encodings
    exit
  }

  opts.separator ''
  opts.separator ' * Charcodes'
  opts.separator ''

  opts.on('-C', '--charcodes CHARCODES', "Specify a list of character codes (in hexadecimal by default)", "for manual guessing. (Options '-e', '-a', and '-t' apply here", "as well; see under \"Manual guessing\" for details.)") { |c|
    options[:charcodes] = c.split(SPLIT_ARG_LIST_RE)
  }

  opts.separator ''

  opts.on('-D', '--decimal', "Charcodes are in decimal") {
    options[:decimal] = true
  }

  opts.on('-O', '--octal', "Charcodes are in octal") {
    options[:octal] = true
  }

  opts.separator ''
  opts.separator 'Generic options:'

  opts.on('-h', '--help', "Print this help message and exit") {
    puts opts
    exit
  }

  opts.on('--version', "Print program version and exit") {
    puts "#{PROGNAME} v#{CGE::VERSION} (part of cmess v#{CMess::VERSION})"
    exit
  }

  # Reference lists appended to the end of the help text.
  list_section.call(
    'Supported encodings for automatic guessing (will be tried in that order):',
    CGE::Automatic.supported_encodings
  )

  list_section.call(
    'Supported encodings for BOM detection (will be tried in that order):',
    CGE::Automatic.supported_boms
  )

  list_section.call(
    'Default encodings for manual guessing:',
    CGE::Manual::ENCODINGS
  )

  list_section.call(
    'Likely candidates for additional testing:',
    CGE::Manual::CANDIDATES
  )

  opts.separator ''
  opts.separator 'NOTE: To select all encodings available on your system, specify __ALL__.'

  opts.separator ''
  opts.separator "When FILE is -, STDIN is used."
}.parse!
192
+
# Main program: either show the chosen input in several encodings for manual
# inspection, or print the automatically guessed encoding.
cli do
  trailing_args_as_input(options)

  charcodes = options[:charcodes]

  if options[:manual] || charcodes
    # The target encoding may be a callable instead of a plain name; invoke it
    # if so. NOTE(review): presumably this is a placeholder that aborts when
    # the system encoding could not be determined -- confirm in cmess/cli.
    target = options[:target_encoding]
    target.call if target.respond_to?(:call)

    if charcodes
      # Octal takes precedence over decimal; hexadecimal is the default.
      base = if options[:octal]
        8
      elsif options[:decimal]
        10
      else
        16
      end

      input = charcodes.map { |code| code.to_i(base).chr }.join
    else
      wanted = options[:line]
      input  = nil

      # reset line counter
      $. = 0

      # Read up to the requested line; $. tracks how many lines were read.
      options[:input].each { |line|
        if $. == wanted
          input = line
          break
        end
      }

      abort "Input was empty!" if $..zero?
      abort "Line not found -- input has only #{$.} line#{'s' if $. != 1}" if input.nil?
    end

    CGE::Manual.display(
      input,
      target,
      options[:encodings],
      options[:additional_encodings]
    )
  else
    # automatic
    puts CGE::Automatic.guess(
      options[:input],
      options[:chunk_size],
      options[:ignore_bom]
    )
  end
end