cmess 0.1.0.281 → 0.1.1.283
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +4 -0
- data/README +2 -1
- data/Rakefile +1 -1
- data/bin/bconv +136 -0
- data/bin/cinderella +1 -3
- data/data/chartab.yaml +26724 -0
- data/lib/cmess.rb +2 -0
- data/lib/cmess/bconv.rb +98 -0
- data/lib/cmess/cli.rb +38 -36
- data/lib/cmess/version.rb +1 -1
- metadata +8 -4
data/lib/cmess.rb
CHANGED
|
@@ -36,6 +36,8 @@
|
|
|
36
36
|
# convert them back -- this is where cinderella comes in,
|
|
37
37
|
# sorting the good ones into the pot and the (potentially)
|
|
38
38
|
# bad ones into the crop... (see Cinderella)
|
|
39
|
+
# bconv:: Convert between bibliographic (and other) encodings.
|
|
40
|
+
# (see BConv)
|
|
39
41
|
# decode_entities:: Decode HTML entities in a string. (see DecodeEntities)
|
|
40
42
|
|
|
41
43
|
module CMess
|
data/lib/cmess/bconv.rb
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
#--
|
|
2
|
+
###############################################################################
|
|
3
|
+
# #
|
|
4
|
+
# A component of cmess, the encoding tool-box. #
|
|
5
|
+
# #
|
|
6
|
+
# Copyright (C) 2008 University of Cologne, #
|
|
7
|
+
# Albertus-Magnus-Platz, #
|
|
8
|
+
# 50932 Cologne, Germany #
|
|
9
|
+
# #
|
|
10
|
+
# Authors: #
|
|
11
|
+
# Jens Wille <jens.wille@uni-koeln.de> #
|
|
12
|
+
# #
|
|
13
|
+
# cmess is free software; you can redistribute it and/or modify it under the #
|
|
14
|
+
# terms of the GNU General Public License as published by the Free Software #
|
|
15
|
+
# Foundation; either version 3 of the License, or (at your option) any later #
|
|
16
|
+
# version. #
|
|
17
|
+
# #
|
|
18
|
+
# cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
|
|
19
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
|
|
20
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
|
|
21
|
+
# details. #
|
|
22
|
+
# #
|
|
23
|
+
# You should have received a copy of the GNU General Public License along #
|
|
24
|
+
# with cmess. If not, see <http://www.gnu.org/licenses/>. #
|
|
25
|
+
# #
|
|
26
|
+
###############################################################################
|
|
27
|
+
#++
|
|
28
|
+
|
|
29
|
+
require 'iconv'
|
|
30
|
+
require 'cmess'
|
|
31
|
+
|
|
32
|
+
# Convert between bibliographic (and other) encodings.
|
|
33
|
+
|
|
34
|
+
module CMess::BConv
|
|
35
|
+
|
|
36
|
+
extend self
|
|
37
|
+
|
|
38
|
+
# our version ;-)
|
|
39
|
+
VERSION = '0.0.1'
|
|
40
|
+
|
|
41
|
+
# Pivot encoding handed to Iconv by +convert+ when only one side of a
# conversion is listed in the character table.
INTERMEDIATE_ENCODING = 'utf-8'
|
|
42
|
+
|
|
43
|
+
# Lists the encoding names known to the character table +chartab+.
#
# chartab:: Hash whose values are per-character maps keyed by encoding name.
#           Only the first entry is inspected.
#
# Returns the upcased names in sorted order. Keys starting with a double
# underscore are left out. An empty table yields an empty list instead of
# raising NoMethodError on +nil+.
def encodings(chartab)
  sample = chartab.values.first
  return [] unless sample

  sample.keys.reject { |name| name =~ /\A__/ }.map { |name| name.upcase }.sort
end
|
|
48
|
+
|
|
49
|
+
# Converts the contents of +input+ from +source_encoding+ to
# +target_encoding+ and writes the result to +output+.
#
# input::           IO-like object. Read byte-wise (+each_byte+) when the
#                   source encoding is listed in +chartab+, line-wise
#                   (+each+) otherwise.
# output::          IO-like object responding to +print+ and +puts+.
# source_encoding:: Name of the input encoding (case-insensitive).
# target_encoding:: Name of the output encoding (case-insensitive).
# chartab::         Character table as accepted by +encodings+: hexadecimal
#                   code strings mapped to per-encoding arrays of byte values.
#
# Encodings listed in +chartab+ are translated through the table. Any other
# encoding is delegated to Iconv, pivoting through INTERMEDIATE_ENCODING when
# only one side is table-based.
#
# The encoding names are upcased into local copies, so the caller's strings
# are never modified and may be frozen.
def convert(input, output, source_encoding, target_encoding, chartab)
  source = source_encoding.upcase
  target = target_encoding.upcase

  known           = encodings(chartab)
  source_in_table = known.include?(source)
  target_in_table = known.include?(target)

  # Fetches the continuation byte of a two-byte sequence as an Integer.
  # IO#getc returns a String on Ruby >= 1.9, which would never match the
  # integer-array keys of the lookup table; prefer getbyte where it exists.
  next_byte = lambda {
    input.respond_to?(:getbyte) ? input.getbyte : input.getc
  }

  if source_in_table && target_in_table
    # table -> table: map source byte sequences straight to target output
    charmap = chartab.inject({}) { |hash, (_code, map)|
      hash.update(map[source] => map[target].pack('U*'))
    }

    input.each_byte { |byte|
      output.print charmap[[byte]] || charmap[[byte, next_byte.call]]
    }
  elsif source_in_table
    # table -> iconv: map source bytes to the pivot encoding, then let
    # Iconv produce the target encoding
    iconv = Iconv.new(target, INTERMEDIATE_ENCODING)

    charmap = chartab.inject({}) { |hash, (code, map)|
      hash.update(map[source] => [code.to_i(16)].pack('U*'))
    }

    input.each_byte { |byte|
      output.print iconv.iconv(charmap[[byte]] || charmap[[byte, next_byte.call]])
    }
  elsif target_in_table
    # iconv -> table: let Iconv produce the pivot encoding, then map each
    # code point to its target representation
    iconv = Iconv.new(INTERMEDIATE_ENCODING, source)

    charmap = chartab.inject({}) { |hash, (code, map)|
      hash.update(code.to_i(16) => map[target].pack('U*'))
    }

    input.each { |line|
      iconv.iconv(line).unpack('U*').each { |codepoint|
        output.print charmap[codepoint]
      }
    }
  else
    # neither side is table-based: plain Iconv conversion, line by line
    iconv = Iconv.new(target, source)

    input.each { |line|
      output.puts iconv.iconv(line)
    }
  end
end
|
|
97
|
+
|
|
98
|
+
end
|
data/lib/cmess/cli.rb
CHANGED
|
@@ -28,52 +28,54 @@
|
|
|
28
28
|
|
|
29
29
|
module CMess::CLI
|
|
30
30
|
|
|
31
|
-
|
|
32
|
-
abort "Can't find input file: #{file}" unless File.readable?(file)
|
|
33
|
-
end
|
|
31
|
+
# Absolute path of the 'data' directory located two levels above the
# directory containing this file.
DATA_DIR = File.expand_path(File.join(File.dirname(__FILE__), '..', '..', 'data'))
|
|
34
32
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
33
|
+
# Aborts the program with an error message naming +file+ unless
# File.readable? reports it as readable.
def ensure_readable(file)
|
|
34
|
+
abort "Can't find input file: #{file}" unless File.readable?(file)
|
|
35
|
+
end
|
|
38
36
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
37
|
+
# Aborts the program with an error message naming +dir+ unless
# File.directory? reports it as an existing directory.
def ensure_directory(dir)
|
|
38
|
+
abort "Directory not found: #{dir}" unless File.directory?(dir)
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
# Prepares +file+ for in-place rewriting: checks readability via
# ensure_readable, reads all of its lines, and then reopens the file in
# mode 'w'.
#
# Returns a two-element array: the lines read, and the file handle opened
# for writing. The handle is not closed here.
def open_file_in_place(file)
|
|
42
|
+
ensure_readable(file)
|
|
43
|
+
[File.readlines(file), File.open(file, 'w')]
|
|
44
|
+
end
|
|
43
45
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
end
|
|
52
|
-
else
|
|
53
|
-
ensure_readable(file) unless mode == 'w'
|
|
54
|
-
File.open(file, mode)
|
|
46
|
+
# Opens +file+ in the given +mode+, treating the file name '-' as a
# request for one of the standard streams.
#
# file:: Path of the file to open, or '-' for a standard stream.
# mode:: Open mode. For '-', 'r' selects STDIN, 'w' STDOUT and 'a' STDERR.
#
# Returns the standard stream or the opened File. For regular files in any
# mode other than 'w', readability is checked first via ensure_readable.
#
# Raises ArgumentError if +file+ is '-' and +mode+ is not 'r', 'w' or 'a'.
def open_file_or_std(file, mode = 'r')
  if file == '-'
    # 'then' rather than the 1.8-only "when x: y" form, which is a syntax
    # error on Ruby >= 1.9
    case mode
      when 'r' then STDIN
      when 'w' then STDOUT
      when 'a' then STDERR
      else raise ArgumentError, "don't know how to handle mode '#{mode}'"
    end
  else
    ensure_readable(file) unless mode == 'w'
    File.open(file, mode)
  end
end
|
|
57
59
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
60
|
+
# Determines the system's encoding from the environment.
#
# Returns the first of the following that is set:
# 1. the SYSTEM_ENCODING environment variable;
# 2. the part of the LANG environment variable after the first dot
#    (e.g. 'UTF-8' for 'en_US.UTF-8');
# 3. the result of system_encoding_not_found.
#
# An unset LANG is treated like one without a dot, rather than raising
# NoMethodError on +nil+.
def determine_system_encoding
  ENV['SYSTEM_ENCODING'] ||
  ENV['LANG'].to_s[/\.(.*)/, 1] ||
  system_encoding_not_found
end
|
|
63
65
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
66
|
+
# Builds the fallback used when no system encoding could be determined:
# a lambda that, when called, aborts the program with a message asking the
# user to specify the encoding explicitly. The lambda is given a singleton
# +to_s+ returning 'NOT FOUND' and is returned without being called.
def system_encoding_not_found
|
|
67
|
+
not_found = lambda {
|
|
68
|
+
abort <<-EOT
|
|
67
69
|
Your system's encoding couldn't be determined automatically -- please specify it
|
|
68
70
|
explicitly via the SYSTEM_ENCODING environment variable or via the '-t' option.
|
|
69
|
-
|
|
70
|
-
|
|
71
|
+
EOT
|
|
72
|
+
}
|
|
71
73
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
end
|
|
75
|
-
|
|
76
|
-
not_found
|
|
74
|
+
def not_found.to_s
|
|
75
|
+
'NOT FOUND'
|
|
77
76
|
end
|
|
78
77
|
|
|
78
|
+
not_found
|
|
79
79
|
end
|
|
80
|
+
|
|
81
|
+
end
|
data/lib/cmess/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: cmess
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0.281
|
|
4
|
+
version: 0.1.1.283
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Jens Wille
|
|
@@ -9,7 +9,7 @@ autorequire:
|
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
11
|
|
|
12
|
-
date: 2008-09-
|
|
12
|
+
date: 2008-09-16 00:00:00 +02:00
|
|
13
13
|
default_executable:
|
|
14
14
|
dependencies:
|
|
15
15
|
- !ruby/object:Gem::Dependency
|
|
@@ -32,10 +32,11 @@ dependencies:
|
|
|
32
32
|
- !ruby/object:Gem::Version
|
|
33
33
|
version: "0"
|
|
34
34
|
version:
|
|
35
|
-
description: "Assist with handling messed up encodings (Currently includes the following tools: cinderella, decode_entities, guess_encoding)"
|
|
35
|
+
description: "Assist with handling messed up encodings (Currently includes the following tools: bconv, cinderella, decode_entities, guess_encoding)"
|
|
36
36
|
email: jens.wille@uni-koeln.de
|
|
37
37
|
executables:
|
|
38
38
|
- cinderella
|
|
39
|
+
- bconv
|
|
39
40
|
- decode_entities
|
|
40
41
|
- guess_encoding
|
|
41
42
|
extensions: []
|
|
@@ -46,6 +47,7 @@ extra_rdoc_files:
|
|
|
46
47
|
- README
|
|
47
48
|
files:
|
|
48
49
|
- lib/cmess.rb
|
|
50
|
+
- lib/cmess/bconv.rb
|
|
49
51
|
- lib/cmess/version.rb
|
|
50
52
|
- lib/cmess/guess_encoding.rb
|
|
51
53
|
- lib/cmess/cli.rb
|
|
@@ -55,6 +57,7 @@ files:
|
|
|
55
57
|
- lib/cmess/guess_encoding/encoding.rb
|
|
56
58
|
- lib/cmess/guess_encoding/automatic.rb
|
|
57
59
|
- bin/cinderella
|
|
60
|
+
- bin/bconv
|
|
58
61
|
- bin/decode_entities
|
|
59
62
|
- bin/guess_encoding
|
|
60
63
|
- COPYING
|
|
@@ -94,6 +97,7 @@ files:
|
|
|
94
97
|
- data/csets/unicode/basic_latin.yaml
|
|
95
98
|
- data/csets/unicode/cyrillic.yaml
|
|
96
99
|
- data/test_chars.yaml
|
|
100
|
+
- data/chartab.yaml
|
|
97
101
|
has_rdoc: true
|
|
98
102
|
homepage: http://prometheus.rubyforge.org/cmess
|
|
99
103
|
post_install_message:
|
|
@@ -127,6 +131,6 @@ rubyforge_project: prometheus
|
|
|
127
131
|
rubygems_version: 1.2.0
|
|
128
132
|
signing_key:
|
|
129
133
|
specification_version: 2
|
|
130
|
-
summary: "Assist with handling messed up encodings (Currently includes the following tools: cinderella, decode_entities, guess_encoding)"
|
|
134
|
+
summary: "Assist with handling messed up encodings (Currently includes the following tools: bconv, cinderella, decode_entities, guess_encoding)"
|
|
131
135
|
test_files: []
|
|
132
136
|
|