blackwinter-cmess 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +676 -0
- data/ChangeLog +54 -0
- data/README +63 -0
- data/Rakefile +51 -0
- data/bin/bconv +130 -0
- data/bin/cinderella +190 -0
- data/bin/decode_entities +106 -0
- data/bin/guess_encoding +223 -0
- data/data/chartab.yaml +26724 -0
- data/data/csets/iso_8859-1.yaml +195 -0
- data/data/csets/iso_8859-15.yaml +204 -0
- data/data/csets/latin1.yaml +195 -0
- data/data/csets/unicode/basic_latin.yaml +97 -0
- data/data/csets/unicode/cyrillic-supplement.yaml +17 -0
- data/data/csets/unicode/cyrillic.yaml +256 -0
- data/data/csets/unicode/greek.yaml +129 -0
- data/data/csets/unicode/ipa_extensions.yaml +97 -0
- data/data/csets/unicode/latin-extended-c.yaml +18 -0
- data/data/csets/unicode/latin-extended-d.yaml +3 -0
- data/data/csets/unicode/latin_1_supplement.yaml +128 -0
- data/data/csets/unicode/latin_extended_a.yaml +129 -0
- data/data/csets/unicode/latin_extended_additional.yaml +247 -0
- data/data/csets/unicode/latin_extended_b.yaml +209 -0
- data/data/csets/unicode/letterlike_symbols.yaml +80 -0
- data/data/csets/unicode/spacing_modifier_letters.yaml +81 -0
- data/data/csets/utf-8.yaml +1504 -0
- data/data/csets/utf8.yaml +1504 -0
- data/data/test_chars.yaml +14 -0
- data/example/cinderella/crop +127 -0
- data/example/cinderella/crop_repaired +127 -0
- data/example/cinderella/empty6-slash.txt +1495 -0
- data/example/cinderella/empty6-slash_repaired.txt +1495 -0
- data/example/cinderella/pot +1368 -0
- data/example/guess_encoding/check_results +60 -0
- data/example/guess_encoding/de.utf-8.txt +10030 -0
- data/example/guess_encoding/en.utf-8.txt +10030 -0
- data/example/guess_encoding/fr.utf-8.txt +10030 -0
- data/example/guess_encoding/it.utf-8.txt +10030 -0
- data/lib/cmess/bconv.rb +169 -0
- data/lib/cmess/cinderella.rb +66 -0
- data/lib/cmess/cli.rb +120 -0
- data/lib/cmess/decode_entities.rb +69 -0
- data/lib/cmess/guess_encoding/automatic.rb +343 -0
- data/lib/cmess/guess_encoding/encoding.rb +78 -0
- data/lib/cmess/guess_encoding/manual.rb +108 -0
- data/lib/cmess/guess_encoding.rb +61 -0
- data/lib/cmess/version.rb +51 -0
- data/lib/cmess.rb +49 -0
- metadata +136 -0
data/ChangeLog
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
= Revision history for cmess
|
2
|
+
|
3
|
+
== 0.2.0 [2009-05-08]
|
4
|
+
|
5
|
+
* Ruby 1.9 compatibility (Syntax, at least)
|
6
|
+
|
7
|
+
== 0.1.2 [2008-09-17]
|
8
|
+
|
9
|
+
* Some refactoring; started to make tools more usable as a library
|
10
|
+
* Make tools accept multiple files for input (via tempfile)
|
11
|
+
* Use ENV.user_encoding from ruby-nuggets
|
12
|
+
* Wrap command execution in a block catching any exceptions
|
13
|
+
|
14
|
+
== 0.1.1 [2008-09-16]
|
15
|
+
|
16
|
+
* Added bconv tool to convert between bibliographic encodings
|
17
|
+
|
18
|
+
== 0.1.0 [2008-09-15]
|
19
|
+
|
20
|
+
* Added ability to operate on all encodings available on a system
|
21
|
+
|
22
|
+
== 0.0.9 [2008-08-15]
|
23
|
+
|
24
|
+
* Reorganized file structure for guess_encoding
|
25
|
+
* Added shortcuts GuessEncoding.manual/.automatic
|
26
|
+
* GuessEncoding::Automatic now also takes a String
|
27
|
+
as input (will be converted to a StringIO)
|
28
|
+
|
29
|
+
== 0.0.8 [2008-08-14]
|
30
|
+
|
31
|
+
* Require 'cmess' inside libs, so the user doesn't have to
|
32
|
+
|
33
|
+
== 0.0.7 [2008-05-19]
|
34
|
+
|
35
|
+
* Fixed "Illegal seek" error when inside a pipe
|
36
|
+
|
37
|
+
== 0.0.6 [2008-01-30]
|
38
|
+
|
39
|
+
* Added ability to specify charcodes as input for manual guessing
|
40
|
+
* Improved automatic guessing and further enhancements
|
41
|
+
|
42
|
+
== 0.0.5 [2008-01-21]
|
43
|
+
|
44
|
+
* Made automatic guessing the default for guess_encoding
|
45
|
+
* Allow specifying the input file as an argument
|
46
|
+
|
47
|
+
== 0.0.4 [2007-12-18]
|
48
|
+
|
49
|
+
* Added BOM detection
|
50
|
+
|
51
|
+
== 0.0.3 [2007-12-12]
|
52
|
+
|
53
|
+
* Added automatic encoding detection to GuessEncoding. Idea and original
|
54
|
+
implementation provided by John. Thanks :-)
|
data/README
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
= cmess - Assist with handling messed up encodings
|
2
|
+
|
3
|
+
== VERSION
|
4
|
+
|
5
|
+
This documentation refers to cmess version 0.2.0
|
6
|
+
|
7
|
+
|
8
|
+
== DESCRIPTION
|
9
|
+
|
10
|
+
CMess bundles several tools under its hood that aim at dealing with various
|
11
|
+
problems occurring in the context of character sets and encodings. Currently,
|
12
|
+
there are:
|
13
|
+
|
14
|
+
guess_encoding:: Simple helper to identify the encoding of a given string.
|
15
|
+
Includes the ability to automatically detect the encoding
|
16
|
+
of an input.
|
17
|
+
cinderella:: When characters are "double encoded", you can't easily
|
18
|
+
convert them back -- this is where cinderella comes in,
|
19
|
+
sorting the good ones into the pot and the (potentially)
|
20
|
+
bad ones into the crop...
|
21
|
+
bconv:: Convert between bibliographic (and other) encodings.
|
22
|
+
decode_entities:: Decode HTML entities in a string.
|
23
|
+
|
24
|
+
TODO: well, more of the description... ;-)
|
25
|
+
|
26
|
+
|
27
|
+
== LINKS
|
28
|
+
|
29
|
+
<b></b>
|
30
|
+
Documentation:: <http://prometheus.rubyforge.org/cmess>
|
31
|
+
Source code (old):: <http://prometheus.rubyforge.org/svn/scratch/cmess>
|
32
|
+
Source code:: <http://github.com/blackwinter/cmess>
|
33
|
+
Rubyforge project:: <http://rubyforge.org/projects/prometheus>
|
34
|
+
|
35
|
+
|
36
|
+
== AUTHORS
|
37
|
+
|
38
|
+
* Jens Wille <mailto:jens.wille@uni-koeln.de>
|
39
|
+
|
40
|
+
|
41
|
+
== CREDITS
|
42
|
+
|
43
|
+
* John Vorhauer <mailto:john@vorhauer.de> for the idea and
|
44
|
+
original implementation of the automatic encoding guesser
|
45
|
+
(see CMess::GuessEncoding::Automatic).
|
46
|
+
|
47
|
+
|
48
|
+
== LICENSE AND COPYRIGHT
|
49
|
+
|
50
|
+
Copyright (C) 2007-2008 University of Cologne,
|
51
|
+
Albertus-Magnus-Platz, 50932 Cologne, Germany
|
52
|
+
|
53
|
+
cmess is free software: you can redistribute it and/or modify it under the
|
54
|
+
terms of the GNU General Public License as published by the Free Software
|
55
|
+
Foundation, either version 3 of the License, or (at your option) any later
|
56
|
+
version.
|
57
|
+
|
58
|
+
cmess is distributed in the hope that it will be useful, but WITHOUT ANY
|
59
|
+
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
60
|
+
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
61
|
+
|
62
|
+
You should have received a copy of the GNU General Public License along with
|
63
|
+
cmess. If not, see <http://www.gnu.org/licenses/>.
|
data/Rakefile
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
$:.unshift('lib')
|
2
|
+
require 'cmess'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'hen'
|
6
|
+
|
7
|
+
Hen.lay! {{
|
8
|
+
:rubyforge => {
|
9
|
+
:package => 'cmess'
|
10
|
+
},
|
11
|
+
|
12
|
+
:gem => {
|
13
|
+
:version => CMess::VERSION,
|
14
|
+
:summary => %Q{
|
15
|
+
Assist with handling messed up encodings (Currently includes the
|
16
|
+
following tools: #{Dir['bin/*'].map { |e| File.basename(e) }.sort.join(', ')})
|
17
|
+
},
|
18
|
+
:files => FileList['lib/**/*.rb', 'bin/*'].to_a,
|
19
|
+
:extra_files => FileList['[A-Z]*', 'example/**/*', 'data/**/*'].to_a,
|
20
|
+
:dependencies => [['ruby-nuggets', '>= 0.3.3'], 'htmlentities']
|
21
|
+
}
|
22
|
+
}}
|
23
|
+
rescue LoadError
|
24
|
+
abort "Please install the 'hen' gem first."
|
25
|
+
end
|
26
|
+
|
27
|
+
namespace :guess_encoding do
|
28
|
+
|
29
|
+
require 'cmess/guess_encoding'
|
30
|
+
include CMess::GuessEncoding::Encoding
|
31
|
+
|
32
|
+
desc "Compare actual encoding and automatic guess of example files"
|
33
|
+
task :check_examples do
|
34
|
+
Dir[File.join(File.dirname(__FILE__), 'example', 'guess_encoding', '??.*.txt')].sort.each { |example|
|
35
|
+
language, encoding = File.basename(example, '.txt').split('.')
|
36
|
+
encoding.upcase!
|
37
|
+
|
38
|
+
guessed = CMess::GuessEncoding::Automatic.guess(File.open(example))
|
39
|
+
|
40
|
+
match = case guessed
|
41
|
+
when UNKNOWN: '?'
|
42
|
+
when ASCII: '#'
|
43
|
+
when encoding: '+'
|
44
|
+
else '-'
|
45
|
+
end
|
46
|
+
|
47
|
+
puts '%s %s/%-11s => %s' % [match, language, encoding, guessed]
|
48
|
+
}
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
data/bin/bconv
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# bconv -- Convert between bibliographic (and other) encodings #
|
7
|
+
# [A component of cmess, the encoding tool-box] #
|
8
|
+
# #
|
9
|
+
# Copyright (C) 2008 University of Cologne, #
|
10
|
+
# Albertus-Magnus-Platz, #
|
11
|
+
# 50932 Cologne, Germany #
|
12
|
+
# #
|
13
|
+
# Authors: #
|
14
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
15
|
+
# #
|
16
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
17
|
+
# terms of the GNU General Public License as published by the Free Software #
|
18
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
19
|
+
# version. #
|
20
|
+
# #
|
21
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
22
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
23
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
24
|
+
# details. #
|
25
|
+
# #
|
26
|
+
# You should have received a copy of the GNU General Public License along #
|
27
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
28
|
+
# #
|
29
|
+
###############################################################################
|
30
|
+
#++
|
31
|
+
|
32
|
+
require 'optparse'
|
33
|
+
|
34
|
+
require 'rubygems'
|
35
|
+
require 'nuggets/string/word_wrap'
|
36
|
+
|
37
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
38
|
+
|
39
|
+
require 'cmess/bconv'
|
40
|
+
require 'cmess/cli'
|
41
|
+
|
42
|
+
include CMess::CLI
|
43
|
+
|
44
|
+
PROGNAME = File.basename($0)
|
45
|
+
|
46
|
+
options = {
|
47
|
+
:input => STDIN,
|
48
|
+
:output => STDOUT,
|
49
|
+
:source_encoding => determine_system_encoding,
|
50
|
+
:target_encoding => determine_system_encoding,
|
51
|
+
:chartab_file => CMess::BConv::DEFAULT_CHARTAB_FILE
|
52
|
+
}
|
53
|
+
|
54
|
+
OptionParser.new(nil, 40) { |opts|
|
55
|
+
opts.banner = "Usage: #{$0} [options] [FILE...]"
|
56
|
+
|
57
|
+
opts.separator ''
|
58
|
+
opts.separator 'Options:'
|
59
|
+
|
60
|
+
opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
|
61
|
+
options[:input] = open_file_or_std(f)
|
62
|
+
options[:input_set] = true
|
63
|
+
}
|
64
|
+
|
65
|
+
opts.on('-o', '--output FILE', "Output file to write to [Default: STDOUT]") { |f|
|
66
|
+
options[:output] = open_file_or_std(f, 'w')
|
67
|
+
}
|
68
|
+
|
69
|
+
opts.on('-I', '--in-place FILE', "Modify file in-place; sets '-i' and '-o'") { |f|
|
70
|
+
options[:input], options[:output] = open_file_in_place(f)
|
71
|
+
options[:input_set] = true
|
72
|
+
}
|
73
|
+
|
74
|
+
opts.separator ''
|
75
|
+
|
76
|
+
opts.on('-e', '--source-encoding ENCODING', "Encoding of input file [Default: #{options[:source_encoding]}]") { |e|
|
77
|
+
options[:source_encoding] = e.downcase
|
78
|
+
}
|
79
|
+
|
80
|
+
opts.on('-t', '--target-encoding ENCODING', "Desired encoding for output file [Default: #{options[:target_encoding]}]") { |e|
|
81
|
+
options[:target_encoding] = e.downcase
|
82
|
+
}
|
83
|
+
|
84
|
+
opts.separator ''
|
85
|
+
|
86
|
+
opts.on('-c', '--chartab YAML_FILE', "File containing character mappings, in YAML format.", "[Default: #{options[:chartab_file]}]") { |c|
|
87
|
+
options[:chartab_file] = c
|
88
|
+
}
|
89
|
+
|
90
|
+
opts.on('-l', '--list-encodings', "Print a list of all available bibliographic", "encodings and exit; depends on <chartab>, see '-c'") {
|
91
|
+
options[:list_encodings] = true
|
92
|
+
}
|
93
|
+
|
94
|
+
opts.separator ''
|
95
|
+
opts.separator 'Generic options:'
|
96
|
+
|
97
|
+
opts.on('-h', '--help', "Print this help message and exit") {
|
98
|
+
puts opts
|
99
|
+
exit
|
100
|
+
}
|
101
|
+
|
102
|
+
opts.on('--version', "Print program version and exit") {
|
103
|
+
puts "#{PROGNAME} v#{CMess::BConv::VERSION} (part of cmess v#{CMess::VERSION})"
|
104
|
+
exit
|
105
|
+
}
|
106
|
+
|
107
|
+
opts.separator ''
|
108
|
+
opts.separator "When FILE is -, either STDIN or STDOUT is used (as appropriate)."
|
109
|
+
}.parse!
|
110
|
+
|
111
|
+
cli do
|
112
|
+
if options[:list_encodings]
|
113
|
+
puts CMess::BConv.encodings(options[:chartab_file])
|
114
|
+
exit
|
115
|
+
end
|
116
|
+
|
117
|
+
[:source_encoding, :target_encoding].each { |key|
|
118
|
+
options[key].call if options[key].respond_to?(:call)
|
119
|
+
}
|
120
|
+
|
121
|
+
trailing_args_as_input(options)
|
122
|
+
|
123
|
+
CMess::BConv.convert(
|
124
|
+
options[:input],
|
125
|
+
options[:output],
|
126
|
+
options[:source_encoding],
|
127
|
+
options[:target_encoding],
|
128
|
+
options[:chartab_file]
|
129
|
+
)
|
130
|
+
end
|
data/bin/cinderella
ADDED
@@ -0,0 +1,190 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# cinderella -- Handle double encoded characters #
|
7
|
+
# [A component of cmess, the encoding tool-box] #
|
8
|
+
# #
|
9
|
+
# Copyright (C) 2007 University of Cologne, #
|
10
|
+
# Albertus-Magnus-Platz, #
|
11
|
+
# 50932 Cologne, Germany #
|
12
|
+
# #
|
13
|
+
# Authors: #
|
14
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
15
|
+
# #
|
16
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
17
|
+
# terms of the GNU General Public License as published by the Free Software #
|
18
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
19
|
+
# version. #
|
20
|
+
# #
|
21
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
22
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
23
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
24
|
+
# details. #
|
25
|
+
# #
|
26
|
+
# You should have received a copy of the GNU General Public License along #
|
27
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
28
|
+
# #
|
29
|
+
###############################################################################
|
30
|
+
#++
|
31
|
+
|
32
|
+
require 'optparse'
|
33
|
+
require 'yaml'
|
34
|
+
|
35
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
36
|
+
|
37
|
+
require 'cmess/cinderella'
|
38
|
+
require 'cmess/cli'
|
39
|
+
|
40
|
+
include CMess::CLI
|
41
|
+
|
42
|
+
PROGNAME = File.basename($0)
|
43
|
+
|
44
|
+
options = {
|
45
|
+
:input => STDIN,
|
46
|
+
:output => STDOUT,
|
47
|
+
:pot => nil,
|
48
|
+
:crop => nil,
|
49
|
+
:source_encoding => nil,
|
50
|
+
:target_encoding => determine_system_encoding,
|
51
|
+
:csets => [CMess::Cinderella::DEFAULT_CSETS_DIR],
|
52
|
+
:repair => false
|
53
|
+
}
|
54
|
+
|
55
|
+
OptionParser.new(nil, 40) { |opts|
|
56
|
+
opts.banner = "Usage: #{$0} [options] [FILE...]"
|
57
|
+
|
58
|
+
opts.separator ''
|
59
|
+
opts.separator 'Options:'
|
60
|
+
|
61
|
+
opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
|
62
|
+
options[:input] = open_file_or_std(f)
|
63
|
+
options[:input_set] = true
|
64
|
+
}
|
65
|
+
|
66
|
+
opts.separator ''
|
67
|
+
|
68
|
+
opts.on('-p', '--pot FILE', "The good into the pot...") { |f|
|
69
|
+
options[:pot] = open_file_or_std(f, 'w')
|
70
|
+
}
|
71
|
+
|
72
|
+
opts.on('-c', '--crop FILE', "...the bad into the crop") { |f|
|
73
|
+
options[:crop] = open_file_or_std(f, 'w')
|
74
|
+
}
|
75
|
+
|
76
|
+
opts.separator ''
|
77
|
+
|
78
|
+
opts.on('-o', '--output [FILE]', "Write both good and bad lines to FILE or", "default [Default: STDOUT] (Particularly", "useful in combination with the '-r' option)") { |f|
|
79
|
+
options[:output] = open_file_or_std(f, 'w') if f
|
80
|
+
|
81
|
+
options[:pot] = options[:output]
|
82
|
+
options[:crop] = options[:output]
|
83
|
+
}
|
84
|
+
|
85
|
+
opts.separator ''
|
86
|
+
|
87
|
+
opts.on('-I', '--in-place FILE', "Modify file in-place; sets '-i' and '-o'", "(Only really useful in combination with", "the '-r' option)") { |f|
|
88
|
+
options[:input], options[:output] = open_file_in_place(f)
|
89
|
+
options[:input_set] = true
|
90
|
+
|
91
|
+
options[:pot] = options[:output]
|
92
|
+
options[:crop] = options[:output]
|
93
|
+
}
|
94
|
+
|
95
|
+
opts.separator ''
|
96
|
+
|
97
|
+
opts.on('-e', '--source-encoding ENCODING', "Source encoding (from) [REQUIRED]") { |e|
|
98
|
+
options[:source_encoding] = e
|
99
|
+
}
|
100
|
+
|
101
|
+
opts.on('-t', '--target-encoding ENCODING', "Target encoding (to); see '-l' for a list", "of available encodings [Default: #{options[:target_encoding]}]") { |e|
|
102
|
+
options[:target_encoding] = e
|
103
|
+
}
|
104
|
+
|
105
|
+
opts.separator ''
|
106
|
+
|
107
|
+
opts.on('-T', '--addtl-target-encodings DIRECTORY', "Directory providing additional char files", "for target encoding") { |d|
|
108
|
+
ensure_directory(d)
|
109
|
+
|
110
|
+
options[:csets] |= [File.expand_path(d)]
|
111
|
+
}
|
112
|
+
|
113
|
+
opts.separator ''
|
114
|
+
|
115
|
+
opts.on('-l', '--list-encodings', "Display a list of available target encodings", "and exit; see '-T' on how to add your own") {
|
116
|
+
csets = options[:csets].inject({}) { |hash, cset|
|
117
|
+
encodings = Dir[File.join(cset, '*.yaml')].sort.map { |yaml|
|
118
|
+
File.basename(yaml, '.yaml') unless File.symlink?(yaml)
|
119
|
+
}.compact
|
120
|
+
|
121
|
+
hash[cset] = encodings unless encodings.empty?
|
122
|
+
hash
|
123
|
+
}
|
124
|
+
|
125
|
+
if csets.empty?
|
126
|
+
puts "No target encodings available for #{PROGNAME}"
|
127
|
+
else
|
128
|
+
puts "Available target encodings for #{PROGNAME}:"
|
129
|
+
csets.each { |cset, encodings|
|
130
|
+
puts "[#{cset}]"
|
131
|
+
encodings.each { |encoding|
|
132
|
+
puts " - #{encoding}"
|
133
|
+
}
|
134
|
+
}
|
135
|
+
end
|
136
|
+
|
137
|
+
exit
|
138
|
+
}
|
139
|
+
|
140
|
+
opts.separator ''
|
141
|
+
|
142
|
+
opts.on('-r', '--repair', "Try to repair corrupted characters") {
|
143
|
+
options[:repair] = true
|
144
|
+
}
|
145
|
+
|
146
|
+
opts.separator ''
|
147
|
+
opts.separator 'Generic options:'
|
148
|
+
|
149
|
+
opts.on('-h', '--help', "Print this help message and exit") {
|
150
|
+
puts opts
|
151
|
+
exit
|
152
|
+
}
|
153
|
+
|
154
|
+
opts.on('--version', "Print program version and exit") {
|
155
|
+
puts "#{PROGNAME} v#{CMess::Cinderella::VERSION} (part of cmess v#{CMess::VERSION})"
|
156
|
+
exit
|
157
|
+
}
|
158
|
+
|
159
|
+
opts.separator ''
|
160
|
+
opts.separator "If '-p' or '-c' is omitted, and '-o' is not given either, that particular output"
|
161
|
+
opts.separator "is ignored. When FILE is -, either STDIN or STDOUT is used (as appropriate)."
|
162
|
+
}.parse!
|
163
|
+
|
164
|
+
cli do
|
165
|
+
options[:target_encoding].call if options[:target_encoding].respond_to?(:call)
|
166
|
+
|
167
|
+
abort "No source encoding given! (Use the '-e' switch to do so; see '--help' for more information)" \
|
168
|
+
unless options[:source_encoding]
|
169
|
+
|
170
|
+
yaml_file = "#{options[:target_encoding].downcase}.yaml"
|
171
|
+
char_file = options[:csets].inject(nil) { |path, cset|
|
172
|
+
path = File.join(cset, yaml_file)
|
173
|
+
break path if File.readable?(path)
|
174
|
+
}
|
175
|
+
|
176
|
+
abort "Char file not found for target encoding: #{options[:target_encoding]}" \
|
177
|
+
unless char_file
|
178
|
+
|
179
|
+
trailing_args_as_input(options)
|
180
|
+
|
181
|
+
CMess::Cinderella.pick(
|
182
|
+
options[:input],
|
183
|
+
options[:pot],
|
184
|
+
options[:crop],
|
185
|
+
options[:source_encoding],
|
186
|
+
options[:target_encoding],
|
187
|
+
YAML.load_file(char_file),
|
188
|
+
options[:repair]
|
189
|
+
)
|
190
|
+
end
|
data/bin/decode_entities
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# decode_entities -- Decode HTML entities #
|
7
|
+
# [A component of cmess, the encoding tool-box] #
|
8
|
+
# #
|
9
|
+
# Copyright (C) 2007 University of Cologne, #
|
10
|
+
# Albertus-Magnus-Platz, #
|
11
|
+
# 50932 Cologne, Germany #
|
12
|
+
# #
|
13
|
+
# Authors: #
|
14
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
15
|
+
# #
|
16
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
17
|
+
# terms of the GNU General Public License as published by the Free Software #
|
18
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
19
|
+
# version. #
|
20
|
+
# #
|
21
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
22
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
23
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
24
|
+
# details. #
|
25
|
+
# #
|
26
|
+
# You should have received a copy of the GNU General Public License along #
|
27
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
28
|
+
# #
|
29
|
+
###############################################################################
|
30
|
+
#++
|
31
|
+
|
32
|
+
require 'optparse'
|
33
|
+
|
34
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
35
|
+
|
36
|
+
require 'cmess/decode_entities'
|
37
|
+
require 'cmess/cli'
|
38
|
+
|
39
|
+
include CMess::CLI
|
40
|
+
|
41
|
+
PROGNAME = File.basename($0)
|
42
|
+
|
43
|
+
options = {
|
44
|
+
:input => STDIN,
|
45
|
+
:output => STDOUT,
|
46
|
+
:source_encoding => CMess::DecodeEntities::INTERMEDIATE_ENCODING,
|
47
|
+
:target_encoding => nil
|
48
|
+
}
|
49
|
+
|
50
|
+
OptionParser.new { |opts|
|
51
|
+
opts.banner = "Usage: #{$0} [options] [FILE...]"
|
52
|
+
|
53
|
+
opts.separator ''
|
54
|
+
opts.separator 'Options:'
|
55
|
+
|
56
|
+
opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
|
57
|
+
options[:input] = open_file_or_std(f)
|
58
|
+
options[:input_set] = true
|
59
|
+
}
|
60
|
+
|
61
|
+
opts.on('-o', '--output FILE', "Output file to write to [Default: STDOUT]") { |f|
|
62
|
+
options[:output] = open_file_or_std(f, 'w')
|
63
|
+
}
|
64
|
+
|
65
|
+
opts.on('-I', '--in-place FILE', "Modify file in-place; sets '-i' and '-o'") { |f|
|
66
|
+
options[:input], options[:output] = open_file_in_place(f)
|
67
|
+
options[:input_set] = true
|
68
|
+
}
|
69
|
+
|
70
|
+
opts.separator ''
|
71
|
+
|
72
|
+
opts.on('-e', '--source-encoding ENCODING', "Encoding of input file [Default: #{options[:source_encoding].upcase}]") { |e|
|
73
|
+
options[:source_encoding] = e.downcase
|
74
|
+
}
|
75
|
+
|
76
|
+
opts.on('-t', '--target-encoding ENCODING', "Desired encoding for output file [Default: <source_encoding>]") { |e|
|
77
|
+
options[:target_encoding] = e.downcase
|
78
|
+
}
|
79
|
+
|
80
|
+
opts.separator ''
|
81
|
+
opts.separator 'Generic options:'
|
82
|
+
|
83
|
+
opts.on('-h', '--help', "Print this help message and exit") {
|
84
|
+
puts opts
|
85
|
+
exit
|
86
|
+
}
|
87
|
+
|
88
|
+
opts.on('--version', "Print program version and exit") {
|
89
|
+
puts "#{PROGNAME} v#{CMess::DecodeEntities::VERSION} (part of cmess v#{CMess::VERSION})"
|
90
|
+
exit
|
91
|
+
}
|
92
|
+
|
93
|
+
opts.separator ''
|
94
|
+
opts.separator "When FILE is -, either STDIN or STDOUT is used (as appropriate)."
|
95
|
+
}.parse!
|
96
|
+
|
97
|
+
cli do
|
98
|
+
trailing_args_as_input(options)
|
99
|
+
|
100
|
+
CMess::DecodeEntities.decode(
|
101
|
+
options[:input],
|
102
|
+
options[:output],
|
103
|
+
options[:source_encoding],
|
104
|
+
options[:target_encoding]
|
105
|
+
)
|
106
|
+
end
|