blackwinter-cmess 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +676 -0
- data/ChangeLog +54 -0
- data/README +63 -0
- data/Rakefile +51 -0
- data/bin/bconv +130 -0
- data/bin/cinderella +190 -0
- data/bin/decode_entities +106 -0
- data/bin/guess_encoding +223 -0
- data/data/chartab.yaml +26724 -0
- data/data/csets/iso_8859-1.yaml +195 -0
- data/data/csets/iso_8859-15.yaml +204 -0
- data/data/csets/latin1.yaml +195 -0
- data/data/csets/unicode/basic_latin.yaml +97 -0
- data/data/csets/unicode/cyrillic-supplement.yaml +17 -0
- data/data/csets/unicode/cyrillic.yaml +256 -0
- data/data/csets/unicode/greek.yaml +129 -0
- data/data/csets/unicode/ipa_extensions.yaml +97 -0
- data/data/csets/unicode/latin-extended-c.yaml +18 -0
- data/data/csets/unicode/latin-extended-d.yaml +3 -0
- data/data/csets/unicode/latin_1_supplement.yaml +128 -0
- data/data/csets/unicode/latin_extended_a.yaml +129 -0
- data/data/csets/unicode/latin_extended_additional.yaml +247 -0
- data/data/csets/unicode/latin_extended_b.yaml +209 -0
- data/data/csets/unicode/letterlike_symbols.yaml +80 -0
- data/data/csets/unicode/spacing_modifier_letters.yaml +81 -0
- data/data/csets/utf-8.yaml +1504 -0
- data/data/csets/utf8.yaml +1504 -0
- data/data/test_chars.yaml +14 -0
- data/example/cinderella/crop +127 -0
- data/example/cinderella/crop_repaired +127 -0
- data/example/cinderella/empty6-slash.txt +1495 -0
- data/example/cinderella/empty6-slash_repaired.txt +1495 -0
- data/example/cinderella/pot +1368 -0
- data/example/guess_encoding/check_results +60 -0
- data/example/guess_encoding/de.utf-8.txt +10030 -0
- data/example/guess_encoding/en.utf-8.txt +10030 -0
- data/example/guess_encoding/fr.utf-8.txt +10030 -0
- data/example/guess_encoding/it.utf-8.txt +10030 -0
- data/lib/cmess/bconv.rb +169 -0
- data/lib/cmess/cinderella.rb +66 -0
- data/lib/cmess/cli.rb +120 -0
- data/lib/cmess/decode_entities.rb +69 -0
- data/lib/cmess/guess_encoding/automatic.rb +343 -0
- data/lib/cmess/guess_encoding/encoding.rb +78 -0
- data/lib/cmess/guess_encoding/manual.rb +108 -0
- data/lib/cmess/guess_encoding.rb +61 -0
- data/lib/cmess/version.rb +51 -0
- data/lib/cmess.rb +49 -0
- metadata +136 -0
data/ChangeLog
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
= Revision history for cmess
|
2
|
+
|
3
|
+
== 0.2.0 [2009-05-08]
|
4
|
+
|
5
|
+
* Ruby 1.9 compatibility (Syntax, at least)
|
6
|
+
|
7
|
+
== 0.1.2 [2008-09-17]
|
8
|
+
|
9
|
+
* Some refactoring; started to make tools more usable as a library
|
10
|
+
* Make tools accept multiple files for input (via tempfile)
|
11
|
+
* Use ENV.user_encoding from ruby-nuggets
|
12
|
+
* Wrap command execution in a block catching any exceptions
|
13
|
+
|
14
|
+
== 0.1.1 [2008-09-16]
|
15
|
+
|
16
|
+
* Added bconv tool to convert between bibliographic encodings
|
17
|
+
|
18
|
+
== 0.1.0 [2008-09-15]
|
19
|
+
|
20
|
+
* Added ability to operate on all encodings available on a system
|
21
|
+
|
22
|
+
== 0.0.9 [2008-08-15]
|
23
|
+
|
24
|
+
* Reorganized file structure for guess_encoding
|
25
|
+
* Added shortcuts GuessEncoding.manual/.automatic
|
26
|
+
* GuessEncoding::Automatic now also takes a String
|
27
|
+
as input (will be converted to a StringIO)
|
28
|
+
|
29
|
+
== 0.0.8 [2008-08-14]
|
30
|
+
|
31
|
+
* Require 'cmess' inside libs, so the user doesn't have to
|
32
|
+
|
33
|
+
== 0.0.7 [2008-05-19]
|
34
|
+
|
35
|
+
* Fixed "Illegal seek" error when inside a pipe
|
36
|
+
|
37
|
+
== 0.0.6 [2008-01-30]
|
38
|
+
|
39
|
+
* Added ability to specify charcodes as input for manual guessing
|
40
|
+
* Improved automatic guessing and further enhancements
|
41
|
+
|
42
|
+
== 0.0.5 [2008-01-21]
|
43
|
+
|
44
|
+
* Made automatic guessing the default for guess_encoding
|
45
|
+
* Allow specifying the input file as an argument
|
46
|
+
|
47
|
+
== 0.0.4 [2007-12-18]
|
48
|
+
|
49
|
+
* Added BOM detection
|
50
|
+
|
51
|
+
== 0.0.3 [2007-12-12]
|
52
|
+
|
53
|
+
* Added automatic encoding detection to GuessEncoding. Idea and original
|
54
|
+
implementation provided by John. Thanks :-)
|
data/README
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
= cmess - Assist with handling messed up encodings
|
2
|
+
|
3
|
+
== VERSION
|
4
|
+
|
5
|
+
This documentation refers to cmess version 0.2.0
|
6
|
+
|
7
|
+
|
8
|
+
== DESCRIPTION
|
9
|
+
|
10
|
+
CMess bundles several tools under its hood that aim at dealing with various
|
11
|
+
problems occurring in the context of character sets and encodings. Currently,
|
12
|
+
there are:
|
13
|
+
|
14
|
+
guess_encoding:: Simple helper to identify the encoding of a given string.
|
15
|
+
Includes the ability to automatically detect the encoding
|
16
|
+
of an input.
|
17
|
+
cinderella:: When characters are "double encoded", you can't easily
|
18
|
+
convert them back -- this is where cinderella comes in,
|
19
|
+
sorting the good ones into the pot and the (potentially)
|
20
|
+
bad ones into the crop...
|
21
|
+
bconv:: Convert between bibliographic (and other) encodings.
|
22
|
+
decode_entities:: Decode HTML entities in a string.
|
23
|
+
|
24
|
+
TODO: well, more of the description... ;-)
|
25
|
+
|
26
|
+
|
27
|
+
== LINKS
|
28
|
+
|
29
|
+
<b></b>
|
30
|
+
Documentation:: <http://prometheus.rubyforge.org/cmess>
|
31
|
+
Source code (old):: <http://prometheus.rubyforge.org/svn/scratch/cmess>
|
32
|
+
Source code:: <http://github.com/blackwinter/cmess>
|
33
|
+
Rubyforge project:: <http://rubyforge.org/projects/prometheus>
|
34
|
+
|
35
|
+
|
36
|
+
== AUTHORS
|
37
|
+
|
38
|
+
* Jens Wille <mailto:jens.wille@uni-koeln.de>
|
39
|
+
|
40
|
+
|
41
|
+
== CREDITS
|
42
|
+
|
43
|
+
* John Vorhauer <mailto:john@vorhauer.de> for the idea and
|
44
|
+
original implementation of the automatic encoding guesser
|
45
|
+
(see CMess::GuessEncoding::Automatic).
|
46
|
+
|
47
|
+
|
48
|
+
== LICENSE AND COPYRIGHT
|
49
|
+
|
50
|
+
Copyright (C) 2007-2008 University of Cologne,
|
51
|
+
Albertus-Magnus-Platz, 50932 Cologne, Germany
|
52
|
+
|
53
|
+
cmess is free software: you can redistribute it and/or modify it under the
|
54
|
+
terms of the GNU General Public License as published by the Free Software
|
55
|
+
Foundation, either version 3 of the License, or (at your option) any later
|
56
|
+
version.
|
57
|
+
|
58
|
+
cmess is distributed in the hope that it will be useful, but WITHOUT ANY
|
59
|
+
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
60
|
+
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
61
|
+
|
62
|
+
You should have received a copy of the GNU General Public License along with
|
63
|
+
cmess. If not, see <http://www.gnu.org/licenses/>.
|
data/Rakefile
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
$:.unshift('lib')
|
2
|
+
require 'cmess'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'hen'
|
6
|
+
|
7
|
+
Hen.lay! {{
|
8
|
+
:rubyforge => {
|
9
|
+
:package => 'cmess'
|
10
|
+
},
|
11
|
+
|
12
|
+
:gem => {
|
13
|
+
:version => CMess::VERSION,
|
14
|
+
:summary => %Q{
|
15
|
+
Assist with handling messed up encodings (Currently includes the
|
16
|
+
following tools: #{Dir['bin/*'].map { |e| File.basename(e) }.sort.join(', ')})
|
17
|
+
},
|
18
|
+
:files => FileList['lib/**/*.rb', 'bin/*'].to_a,
|
19
|
+
:extra_files => FileList['[A-Z]*', 'example/**/*', 'data/**/*'].to_a,
|
20
|
+
:dependencies => [['ruby-nuggets', '>= 0.3.3'], 'htmlentities']
|
21
|
+
}
|
22
|
+
}}
|
23
|
+
rescue LoadError
|
24
|
+
abort "Please install the 'hen' gem first."
|
25
|
+
end
|
26
|
+
|
27
|
+
namespace :guess_encoding do
|
28
|
+
|
29
|
+
require 'cmess/guess_encoding'
|
30
|
+
include CMess::GuessEncoding::Encoding
|
31
|
+
|
32
|
+
desc "Compare actual encoding and automatic guess of example files"
|
33
|
+
task :check_examples do
|
34
|
+
Dir[File.join(File.dirname(__FILE__), 'example', 'guess_encoding', '??.*.txt')].sort.each { |example|
|
35
|
+
language, encoding = File.basename(example, '.txt').split('.')
|
36
|
+
encoding.upcase!
|
37
|
+
|
38
|
+
guessed = CMess::GuessEncoding::Automatic.guess(File.open(example))
|
39
|
+
|
40
|
+
match = case guessed
|
41
|
+
when UNKNOWN: '?'
|
42
|
+
when ASCII: '#'
|
43
|
+
when encoding: '+'
|
44
|
+
else '-'
|
45
|
+
end
|
46
|
+
|
47
|
+
puts '%s %s/%-11s => %s' % [match, language, encoding, guessed]
|
48
|
+
}
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
data/bin/bconv
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# bconv -- Convert between bibliographic (and other) encodings #
|
7
|
+
# [A component of cmess, the encoding tool-box] #
|
8
|
+
# #
|
9
|
+
# Copyright (C) 2008 University of Cologne, #
|
10
|
+
# Albertus-Magnus-Platz, #
|
11
|
+
# 50932 Cologne, Germany #
|
12
|
+
# #
|
13
|
+
# Authors: #
|
14
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
15
|
+
# #
|
16
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
17
|
+
# terms of the GNU General Public License as published by the Free Software #
|
18
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
19
|
+
# version. #
|
20
|
+
# #
|
21
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
22
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
23
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
24
|
+
# details. #
|
25
|
+
# #
|
26
|
+
# You should have received a copy of the GNU General Public License along #
|
27
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
28
|
+
# #
|
29
|
+
###############################################################################
|
30
|
+
#++
|
31
|
+
|
32
|
+
require 'optparse'
|
33
|
+
|
34
|
+
require 'rubygems'
|
35
|
+
require 'nuggets/string/word_wrap'
|
36
|
+
|
37
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
38
|
+
|
39
|
+
require 'cmess/bconv'
|
40
|
+
require 'cmess/cli'
|
41
|
+
|
42
|
+
include CMess::CLI
|
43
|
+
|
44
|
+
PROGNAME = File.basename($0)
|
45
|
+
|
46
|
+
options = {
|
47
|
+
:input => STDIN,
|
48
|
+
:output => STDOUT,
|
49
|
+
:source_encoding => determine_system_encoding,
|
50
|
+
:target_encoding => determine_system_encoding,
|
51
|
+
:chartab_file => CMess::BConv::DEFAULT_CHARTAB_FILE
|
52
|
+
}
|
53
|
+
|
54
|
+
OptionParser.new(nil, 40) { |opts|
|
55
|
+
opts.banner = "Usage: #{$0} [options] [FILE...]"
|
56
|
+
|
57
|
+
opts.separator ''
|
58
|
+
opts.separator 'Options:'
|
59
|
+
|
60
|
+
opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
|
61
|
+
options[:input] = open_file_or_std(f)
|
62
|
+
options[:input_set] = true
|
63
|
+
}
|
64
|
+
|
65
|
+
opts.on('-o', '--output FILE', "Output file to write to [Default: STDOUT]") { |f|
|
66
|
+
options[:output] = open_file_or_std(f, 'w')
|
67
|
+
}
|
68
|
+
|
69
|
+
opts.on('-I', '--in-place FILE', "Modify file in-place; sets '-i' and '-o'") { |f|
|
70
|
+
options[:input], options[:output] = open_file_in_place(f)
|
71
|
+
options[:input_set] = true
|
72
|
+
}
|
73
|
+
|
74
|
+
opts.separator ''
|
75
|
+
|
76
|
+
opts.on('-e', '--source-encoding ENCODING', "Encoding of input file [Default: #{options[:source_encoding]}]") { |e|
|
77
|
+
options[:source_encoding] = e.downcase
|
78
|
+
}
|
79
|
+
|
80
|
+
opts.on('-t', '--target-encoding ENCODING', "Desired encoding for output file [Default: #{options[:target_encoding]}]") { |e|
|
81
|
+
options[:target_encoding] = e.downcase
|
82
|
+
}
|
83
|
+
|
84
|
+
opts.separator ''
|
85
|
+
|
86
|
+
opts.on('-c', '--chartab YAML_FILE', "File containing character mappings, in YAML format.", "[Default: #{options[:chartab_file]}]") { |c|
|
87
|
+
options[:chartab_file] = c
|
88
|
+
}
|
89
|
+
|
90
|
+
opts.on('-l', '--list-encodings', "Print a list of all available bibliographic", "encodings and exit; depends on <chartab>, see '-c'") {
|
91
|
+
options[:list_encodings] = true
|
92
|
+
}
|
93
|
+
|
94
|
+
opts.separator ''
|
95
|
+
opts.separator 'Generic options:'
|
96
|
+
|
97
|
+
opts.on('-h', '--help', "Print this help message and exit") {
|
98
|
+
puts opts
|
99
|
+
exit
|
100
|
+
}
|
101
|
+
|
102
|
+
opts.on('--version', "Print program version and exit") {
|
103
|
+
puts "#{PROGNAME} v#{CMess::BConv::VERSION} (part of cmess v#{CMess::VERSION})"
|
104
|
+
exit
|
105
|
+
}
|
106
|
+
|
107
|
+
opts.separator ''
|
108
|
+
opts.separator "When FILE is -, either STDIN or STDOUT is used (as appropriate)."
|
109
|
+
}.parse!
|
110
|
+
|
111
|
+
cli do
|
112
|
+
if options[:list_encodings]
|
113
|
+
puts CMess::BConv.encodings(options[:chartab_file])
|
114
|
+
exit
|
115
|
+
end
|
116
|
+
|
117
|
+
[:source_encoding, :target_encoding].each { |key|
|
118
|
+
options[key].call if options[key].respond_to?(:call)
|
119
|
+
}
|
120
|
+
|
121
|
+
trailing_args_as_input(options)
|
122
|
+
|
123
|
+
CMess::BConv.convert(
|
124
|
+
options[:input],
|
125
|
+
options[:output],
|
126
|
+
options[:source_encoding],
|
127
|
+
options[:target_encoding],
|
128
|
+
options[:chartab_file]
|
129
|
+
)
|
130
|
+
end
|
data/bin/cinderella
ADDED
@@ -0,0 +1,190 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# cinderella -- Handle double encoded characters #
|
7
|
+
# [A component of cmess, the encoding tool-box] #
|
8
|
+
# #
|
9
|
+
# Copyright (C) 2007 University of Cologne, #
|
10
|
+
# Albertus-Magnus-Platz, #
|
11
|
+
# 50932 Cologne, Germany #
|
12
|
+
# #
|
13
|
+
# Authors: #
|
14
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
15
|
+
# #
|
16
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
17
|
+
# terms of the GNU General Public License as published by the Free Software #
|
18
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
19
|
+
# version. #
|
20
|
+
# #
|
21
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
22
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
23
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
24
|
+
# details. #
|
25
|
+
# #
|
26
|
+
# You should have received a copy of the GNU General Public License along #
|
27
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
28
|
+
# #
|
29
|
+
###############################################################################
|
30
|
+
#++
|
31
|
+
|
32
|
+
require 'optparse'
|
33
|
+
require 'yaml'
|
34
|
+
|
35
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
36
|
+
|
37
|
+
require 'cmess/cinderella'
|
38
|
+
require 'cmess/cli'
|
39
|
+
|
40
|
+
include CMess::CLI
|
41
|
+
|
42
|
+
PROGNAME = File.basename($0)
|
43
|
+
|
44
|
+
options = {
|
45
|
+
:input => STDIN,
|
46
|
+
:output => STDOUT,
|
47
|
+
:pot => nil,
|
48
|
+
:crop => nil,
|
49
|
+
:source_encoding => nil,
|
50
|
+
:target_encoding => determine_system_encoding,
|
51
|
+
:csets => [CMess::Cinderella::DEFAULT_CSETS_DIR],
|
52
|
+
:repair => false
|
53
|
+
}
|
54
|
+
|
55
|
+
OptionParser.new(nil, 40) { |opts|
|
56
|
+
opts.banner = "Usage: #{$0} [options] [FILE...]"
|
57
|
+
|
58
|
+
opts.separator ''
|
59
|
+
opts.separator 'Options:'
|
60
|
+
|
61
|
+
opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
|
62
|
+
options[:input] = open_file_or_std(f)
|
63
|
+
options[:input_set] = true
|
64
|
+
}
|
65
|
+
|
66
|
+
opts.separator ''
|
67
|
+
|
68
|
+
opts.on('-p', '--pot FILE', "The good into the pot...") { |f|
|
69
|
+
options[:pot] = open_file_or_std(f, 'w')
|
70
|
+
}
|
71
|
+
|
72
|
+
opts.on('-c', '--crop FILE', "...the bad into the crop") { |f|
|
73
|
+
options[:crop] = open_file_or_std(f, 'w')
|
74
|
+
}
|
75
|
+
|
76
|
+
opts.separator ''
|
77
|
+
|
78
|
+
opts.on('-o', '--output [FILE]', "Write both good and bad lines to FILE or", "default [Default: STDOUT] (Particularly", "useful in combination with the '-r' option)") { |f|
|
79
|
+
options[:output] = open_file_or_std(f, 'w') if f
|
80
|
+
|
81
|
+
options[:pot] = options[:output]
|
82
|
+
options[:crop] = options[:output]
|
83
|
+
}
|
84
|
+
|
85
|
+
opts.separator ''
|
86
|
+
|
87
|
+
opts.on('-I', '--in-place FILE', "Modify file in-place; sets '-i' and '-o'", "(Only really useful in combination with", "the '-r' option)") { |f|
|
88
|
+
options[:input], options[:output] = open_file_in_place(f)
|
89
|
+
options[:input_set] = true
|
90
|
+
|
91
|
+
options[:pot] = options[:output]
|
92
|
+
options[:crop] = options[:output]
|
93
|
+
}
|
94
|
+
|
95
|
+
opts.separator ''
|
96
|
+
|
97
|
+
opts.on('-e', '--source-encoding ENCODING', "Source encoding (from) [REQUIRED]") { |e|
|
98
|
+
options[:source_encoding] = e
|
99
|
+
}
|
100
|
+
|
101
|
+
opts.on('-t', '--target-encoding ENCODING', "Target encoding (to); see '-l' for a list", "of available encodings [Default: #{options[:target_encoding]}]") { |e|
|
102
|
+
options[:target_encoding] = e
|
103
|
+
}
|
104
|
+
|
105
|
+
opts.separator ''
|
106
|
+
|
107
|
+
opts.on('-T', '--addtl-target-encodings DIRECTORY', "Directory providing additional char files", "for target encoding") { |d|
|
108
|
+
ensure_directory(d)
|
109
|
+
|
110
|
+
options[:csets] |= [File.expand_path(d)]
|
111
|
+
}
|
112
|
+
|
113
|
+
opts.separator ''
|
114
|
+
|
115
|
+
opts.on('-l', '--list-encodings', "Display a list of available target encodings", "and exit; see '-T' on how to add your own") {
|
116
|
+
csets = options[:csets].inject({}) { |hash, cset|
|
117
|
+
encodings = Dir[File.join(cset, '*.yaml')].sort.map { |yaml|
|
118
|
+
File.basename(yaml, '.yaml') unless File.symlink?(yaml)
|
119
|
+
}.compact
|
120
|
+
|
121
|
+
hash[cset] = encodings unless encodings.empty?
|
122
|
+
hash
|
123
|
+
}
|
124
|
+
|
125
|
+
if csets.empty?
|
126
|
+
puts "No target encodings available for #{PROGNAME}"
|
127
|
+
else
|
128
|
+
puts "Available target encodings for #{PROGNAME}:"
|
129
|
+
csets.each { |cset, encodings|
|
130
|
+
puts "[#{cset}]"
|
131
|
+
encodings.each { |encoding|
|
132
|
+
puts " - #{encoding}"
|
133
|
+
}
|
134
|
+
}
|
135
|
+
end
|
136
|
+
|
137
|
+
exit
|
138
|
+
}
|
139
|
+
|
140
|
+
opts.separator ''
|
141
|
+
|
142
|
+
opts.on('-r', '--repair', "Try to repair corrupted characters") {
|
143
|
+
options[:repair] = true
|
144
|
+
}
|
145
|
+
|
146
|
+
opts.separator ''
|
147
|
+
opts.separator 'Generic options:'
|
148
|
+
|
149
|
+
opts.on('-h', '--help', "Print this help message and exit") {
|
150
|
+
puts opts
|
151
|
+
exit
|
152
|
+
}
|
153
|
+
|
154
|
+
opts.on('--version', "Print program version and exit") {
|
155
|
+
puts "#{PROGNAME} v#{CMess::Cinderella::VERSION} (part of cmess v#{CMess::VERSION})"
|
156
|
+
exit
|
157
|
+
}
|
158
|
+
|
159
|
+
opts.separator ''
|
160
|
+
opts.separator "If '-p' or '-c' is omitted, and '-o' is not given either, that particular output"
|
161
|
+
opts.separator "is ignored. When FILE is -, either STDIN or STDOUT is used (as appropriate)."
|
162
|
+
}.parse!
|
163
|
+
|
164
|
+
cli do
|
165
|
+
options[:target_encoding].call if options[:target_encoding].respond_to?(:call)
|
166
|
+
|
167
|
+
abort "No source encoding given! (Use the '-e' switch to do so; see '--help' for more information)" \
|
168
|
+
unless options[:source_encoding]
|
169
|
+
|
170
|
+
yaml_file = "#{options[:target_encoding].downcase}.yaml"
|
171
|
+
char_file = options[:csets].inject(nil) { |path, cset|
|
172
|
+
path = File.join(cset, yaml_file)
|
173
|
+
break path if File.readable?(path)
|
174
|
+
}
|
175
|
+
|
176
|
+
abort "Char file not found for target encoding: #{options[:target_encoding]}" \
|
177
|
+
unless char_file
|
178
|
+
|
179
|
+
trailing_args_as_input(options)
|
180
|
+
|
181
|
+
CMess::Cinderella.pick(
|
182
|
+
options[:input],
|
183
|
+
options[:pot],
|
184
|
+
options[:crop],
|
185
|
+
options[:source_encoding],
|
186
|
+
options[:target_encoding],
|
187
|
+
YAML.load_file(char_file),
|
188
|
+
options[:repair]
|
189
|
+
)
|
190
|
+
end
|
data/bin/decode_entities
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
#! /usr/bin/ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
###############################################################################
|
5
|
+
# #
|
6
|
+
# decode_entities -- Decode HTML entities #
|
7
|
+
# [A component of cmess, the encoding tool-box] #
|
8
|
+
# #
|
9
|
+
# Copyright (C) 2007 University of Cologne, #
|
10
|
+
# Albertus-Magnus-Platz, #
|
11
|
+
# 50932 Cologne, Germany #
|
12
|
+
# #
|
13
|
+
# Authors: #
|
14
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
15
|
+
# #
|
16
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
17
|
+
# terms of the GNU General Public License as published by the Free Software #
|
18
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
19
|
+
# version. #
|
20
|
+
# #
|
21
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
22
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
23
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
24
|
+
# details. #
|
25
|
+
# #
|
26
|
+
# You should have received a copy of the GNU General Public License along #
|
27
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
28
|
+
# #
|
29
|
+
###############################################################################
|
30
|
+
#++
|
31
|
+
|
32
|
+
require 'optparse'
|
33
|
+
|
34
|
+
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
35
|
+
|
36
|
+
require 'cmess/decode_entities'
|
37
|
+
require 'cmess/cli'
|
38
|
+
|
39
|
+
include CMess::CLI
|
40
|
+
|
41
|
+
PROGNAME = File.basename($0)
|
42
|
+
|
43
|
+
options = {
|
44
|
+
:input => STDIN,
|
45
|
+
:output => STDOUT,
|
46
|
+
:source_encoding => CMess::DecodeEntities::INTERMEDIATE_ENCODING,
|
47
|
+
:target_encoding => nil
|
48
|
+
}
|
49
|
+
|
50
|
+
OptionParser.new { |opts|
|
51
|
+
opts.banner = "Usage: #{$0} [options] [FILE...]"
|
52
|
+
|
53
|
+
opts.separator ''
|
54
|
+
opts.separator 'Options:'
|
55
|
+
|
56
|
+
opts.on('-i', '--input FILE', "Input file to read from [Default: STDIN]") { |f|
|
57
|
+
options[:input] = open_file_or_std(f)
|
58
|
+
options[:input_set] = true
|
59
|
+
}
|
60
|
+
|
61
|
+
opts.on('-o', '--output FILE', "Output file to write to [Default: STDOUT]") { |f|
|
62
|
+
options[:output] = open_file_or_std(f, 'w')
|
63
|
+
}
|
64
|
+
|
65
|
+
opts.on('-I', '--in-place FILE', "Modify file in-place; sets '-i' and '-o'") { |f|
|
66
|
+
options[:input], options[:output] = open_file_in_place(f)
|
67
|
+
options[:input_set] = true
|
68
|
+
}
|
69
|
+
|
70
|
+
opts.separator ''
|
71
|
+
|
72
|
+
opts.on('-e', '--source-encoding ENCODING', "Encoding of input file [Default: #{options[:source_encoding].upcase}]") { |e|
|
73
|
+
options[:source_encoding] = e.downcase
|
74
|
+
}
|
75
|
+
|
76
|
+
opts.on('-t', '--target-encoding ENCODING', "Desired encoding for output file [Default: <source_encoding>]") { |e|
|
77
|
+
options[:target_encoding] = e.downcase
|
78
|
+
}
|
79
|
+
|
80
|
+
opts.separator ''
|
81
|
+
opts.separator 'Generic options:'
|
82
|
+
|
83
|
+
opts.on('-h', '--help', "Print this help message and exit") {
|
84
|
+
puts opts
|
85
|
+
exit
|
86
|
+
}
|
87
|
+
|
88
|
+
opts.on('--version', "Print program version and exit") {
|
89
|
+
puts "#{PROGNAME} v#{CMess::DecodeEntities::VERSION} (part of cmess v#{CMess::VERSION})"
|
90
|
+
exit
|
91
|
+
}
|
92
|
+
|
93
|
+
opts.separator ''
|
94
|
+
opts.separator "When FILE is -, either STDIN or STDOUT is used (as appropriate)."
|
95
|
+
}.parse!
|
96
|
+
|
97
|
+
cli do
|
98
|
+
trailing_args_as_input(options)
|
99
|
+
|
100
|
+
CMess::DecodeEntities.decode(
|
101
|
+
options[:input],
|
102
|
+
options[:output],
|
103
|
+
options[:source_encoding],
|
104
|
+
options[:target_encoding]
|
105
|
+
)
|
106
|
+
end
|