sanzang 0.0.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/HACKING +22 -40
- data/MANUAL +312 -0
- data/README +35 -265
- data/bin/{sanzang-reflow → sanzang} +1 -1
- data/lib/sanzang.rb +12 -35
- data/lib/sanzang/batch_translator.rb +77 -0
- data/lib/sanzang/command/batch.rb +131 -0
- data/lib/sanzang/command/reflow.rb +35 -32
- data/lib/sanzang/command/sanzang_cmd.rb +132 -0
- data/lib/sanzang/command/translate.rb +47 -70
- data/lib/sanzang/text_formatter.rb +1 -1
- data/lib/sanzang/translation_table.rb +34 -49
- data/lib/sanzang/translator.rb +1 -65
- data/lib/sanzang/version.rb +3 -2
- data/test/tc_reflow_encodings.rb +2 -2
- data/test/tc_simple_translation.rb +8 -12
- data/test/utf-8/stage_3.txt +4 -0
- metadata +25 -31
- data/bin/sanzang-translate +0 -21
@@ -24,27 +24,23 @@ require_relative File.join("..", "version")
|
|
24
24
|
|
25
25
|
module Sanzang::Command
|
26
26
|
|
27
|
-
#
|
28
|
-
# text
|
29
|
-
#
|
30
|
-
# this is to do initial text transformations to ensure (1) that terms will
|
31
|
-
# be translated reliably, and (2) that the final output of the translation
|
32
|
-
# will be readable by the user (i.e. lines not too long).
|
27
|
+
# This class provides a command for simple translation of one file or text.
|
28
|
+
# Input is read from and output is written to either stdin and stdout, or
|
29
|
+
# files. For mass translation of texts, see Sanzang::Command::Batch.
|
33
30
|
#
|
34
31
|
class Translate
|
35
32
|
|
36
33
|
# Create a new instance of the Translate class.
|
37
34
|
#
|
38
35
|
def initialize
|
39
|
-
@name = "sanzang
|
36
|
+
@name = "sanzang translate"
|
40
37
|
@encoding = nil
|
41
|
-
@batch_dir = nil
|
42
38
|
@infile = nil
|
43
39
|
@outfile = nil
|
44
40
|
end
|
45
41
|
|
46
|
-
# Run the
|
47
|
-
# would typically be an
|
42
|
+
# Run the translate command with the given arguments. The parameter _args_
|
43
|
+
# would typically be an array of command options and parameters. Calling
|
48
44
|
# this with the "-h" or "--help" option will print full usage information
|
49
45
|
# necessary for running this command.
|
50
46
|
#
|
@@ -53,7 +49,7 @@ module Sanzang::Command
|
|
53
49
|
parser.parse!(args)
|
54
50
|
|
55
51
|
if args.length != 1
|
56
|
-
puts parser
|
52
|
+
$stderr.puts parser
|
57
53
|
return 1
|
58
54
|
end
|
59
55
|
|
@@ -61,30 +57,22 @@ module Sanzang::Command
|
|
61
57
|
|
62
58
|
translator = nil
|
63
59
|
File.open(args[0], "rb", encoding: @encoding) do |table_file|
|
64
|
-
table = Sanzang::TranslationTable.new(table_file)
|
60
|
+
table = Sanzang::TranslationTable.new(table_file.read)
|
65
61
|
translator = Sanzang::Translator.new(table)
|
66
62
|
end
|
67
63
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
64
|
+
begin
|
65
|
+
fin = @infile ? File.open(@infile, "rb") : $stdin
|
66
|
+
fin.binmode.set_encoding(@encoding)
|
67
|
+
fout = @outfile ? File.open(@outfile, "wb") : $stdout
|
68
|
+
fout.binmode.set_encoding(@encoding)
|
69
|
+
translator.translate_io(fin, fout)
|
70
|
+
ensure
|
71
|
+
if defined?(fin) and fin != $stdin
|
72
|
+
fin.close if not fin.closed?
|
72
73
|
end
|
73
|
-
|
74
|
-
|
75
|
-
begin
|
76
|
-
fin = @infile ? File.open(@infile, "rb") : $stdin
|
77
|
-
fin.binmode.set_encoding(@encoding)
|
78
|
-
fout = @outfile ? File.open(@outfile, "wb") : $stdout
|
79
|
-
fout.binmode.set_encoding(@encoding)
|
80
|
-
translator.translate_io(fin, fout)
|
81
|
-
ensure
|
82
|
-
if defined?(fin) and fin != $stdin
|
83
|
-
fin.close if not fin.closed?
|
84
|
-
end
|
85
|
-
if defined?(fout) and fin != $stdout
|
86
|
-
fout.close if not fout.closed?
|
87
|
-
end
|
74
|
+
if defined?(fout) and fin != $stdout
|
75
|
+
fout.close if not fout.closed?
|
88
76
|
end
|
89
77
|
end
|
90
78
|
|
@@ -93,12 +81,14 @@ module Sanzang::Command
|
|
93
81
|
return err.status
|
94
82
|
rescue Exception => err
|
95
83
|
$stderr.puts err.backtrace
|
96
|
-
$stderr.puts "
|
84
|
+
$stderr.puts "\nERROR: #{err.inspect}\n\n"
|
97
85
|
return 1
|
98
86
|
end
|
99
87
|
|
100
88
|
private
|
101
89
|
|
90
|
+
# Initialize the encoding for text data if it is not already set
|
91
|
+
#
|
102
92
|
def set_data_encoding
|
103
93
|
if @encoding == nil
|
104
94
|
if Encoding.default_external == Encoding::IBM437
|
@@ -110,58 +100,45 @@ module Sanzang::Command
|
|
110
100
|
end
|
111
101
|
end
|
112
102
|
|
103
|
+
# An OptionParser for the command
|
104
|
+
#
|
113
105
|
def option_parser
|
114
|
-
OptionParser.new do |
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
pr.banner << "\nOptions:\n"
|
128
|
-
|
129
|
-
pr.on("-h", "--help", "show this help message and exit") do |v|
|
130
|
-
puts pr
|
106
|
+
OptionParser.new do |op|
|
107
|
+
op.banner = "Usage: #{@name} [options] table\n"
|
108
|
+
|
109
|
+
op.banner << "\nTranslate text using simple table rules. Input text "
|
110
|
+
op.banner << "is read from STDIN by\ndefault, and the output is "
|
111
|
+
op.banner << "written to STDOUT by default.\n"
|
112
|
+
|
113
|
+
op.banner << "\nExample:\n"
|
114
|
+
op.banner << " #{@name} -i text.txt -o text.sz.txt table.txt\n"
|
115
|
+
op.banner << "\nOptions:\n"
|
116
|
+
|
117
|
+
op.on("-h", "--help", "show this help message and exit") do |v|
|
118
|
+
puts op
|
131
119
|
exit 0
|
132
120
|
end
|
133
|
-
|
134
|
-
@batch_dir = v
|
135
|
-
end
|
136
|
-
pr.on("-E", "--encoding=ENC", "set data encoding to ENC") do |v|
|
121
|
+
op.on("-E", "--encoding=ENC", "set data encoding to ENC") do |v|
|
137
122
|
@encoding = Encoding.find(v)
|
138
123
|
end
|
139
|
-
|
140
|
-
|
124
|
+
op.on("-L", "--list-encodings", "list possible encodings") do |v|
|
125
|
+
encodings = Encoding.list.sort do |x,y|
|
126
|
+
x.to_s.upcase <=> y.to_s.upcase
|
127
|
+
end
|
128
|
+
puts encodings
|
141
129
|
exit 0
|
142
130
|
end
|
143
|
-
|
131
|
+
op.on("-i", "--infile=FILE", "read input text from FILE") do |v|
|
144
132
|
@infile = v
|
145
133
|
end
|
146
|
-
|
134
|
+
op.on("-o", "--outfile=FILE", "write output text to FILE") do |v|
|
147
135
|
@outfile = v
|
148
136
|
end
|
149
|
-
pr.on("-P", "--platform", "show platform information") do |v|
|
150
|
-
puts "Ruby version: #{RUBY_VERSION}"
|
151
|
-
puts "Ruby platform: #{RUBY_PLATFORM}"
|
152
|
-
puts "External encoding: #{Encoding.default_external}"
|
153
|
-
if Encoding.default_internal != nil
|
154
|
-
puts "Internal encoding: #{Encoding.default_internal}"
|
155
|
-
end
|
156
|
-
exit 0
|
157
|
-
end
|
158
|
-
pr.on("-V", "--version", "show version number and exit") do |v|
|
159
|
-
puts "Sanzang version: #{Sanzang::VERSION}"
|
160
|
-
exit 0
|
161
|
-
end
|
162
137
|
end
|
163
138
|
end
|
164
139
|
|
140
|
+
# Name of the command
|
141
|
+
#
|
165
142
|
attr_reader :name
|
166
143
|
|
167
144
|
end
|
@@ -36,7 +36,7 @@ module Sanzang
|
|
36
36
|
# kept separate. Following this, all newlines are removed, and the text is
|
37
37
|
# then reformatted according to the remaining punctuation and spacing.
|
38
38
|
#
|
39
|
-
def
|
39
|
+
def reflow_cjk(s)
|
40
40
|
source_encoding = s.encoding
|
41
41
|
s.encode!(Encoding::UTF_8)
|
42
42
|
|
@@ -15,41 +15,30 @@
|
|
15
15
|
#
|
16
16
|
# You should have received a copy of the GNU General Public License along with
|
17
17
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
18
|
-
|
18
|
+
|
19
19
|
module Sanzang
|
20
20
|
|
21
|
-
#
|
22
|
-
# Sanzang
|
23
|
-
#
|
24
|
-
# will then go through basic parsing to ensure the table data is in the
|
25
|
-
# correct format, and then the rules are reverse sorted by the length of the
|
26
|
-
# source language column. Thereafter, these rules are accessible through the
|
27
|
-
# ''records'' attribute, and metadata is available through other accessors
|
28
|
-
# and methods. It is the responsibility of Sanzang::Translator object to
|
29
|
-
# actually apply the rules of a TranslationTable to some text, as the table
|
30
|
-
# merely encapsulates a set of translation rules.
|
31
|
-
#
|
32
|
-
# The format for translation table data can be summarized as the following:
|
33
|
-
#
|
34
|
-
# * Plain text with one line per record
|
35
|
-
# * Records begin with "~|", end with "|~", and are delimited by "|".
|
36
|
-
# * The number of columns in each record must be consistent.
|
37
|
-
#
|
38
|
-
# An example of this format is the following:
|
39
|
-
#
|
40
|
-
# ~|zh-term1|en-term1|~
|
41
|
-
# ~|zh-term2|en-term2|~
|
42
|
-
# ~|zh-term3|en-term3|~
|
21
|
+
# A translation table encapsulates a set of rules for translating with
|
22
|
+
# the \Sanzang system. These are essentially read-only objects meant for
|
23
|
+
# storing well-defined translation table data.
|
43
24
|
#
|
44
25
|
class TranslationTable
|
45
26
|
|
46
|
-
#
|
47
|
-
#
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
#
|
52
|
-
#
|
27
|
+
# A table is created from a formatted string of translation rules. The
|
28
|
+
# string is in the format of delimited text. The text format can be
|
29
|
+
# summarized as follows:
|
30
|
+
#
|
31
|
+
# - Each line of text is a record for a translation rule.
|
32
|
+
# - Each record begins with "~|" and ends with "|~".
|
33
|
+
# - Fields in the record are separated by the "|" character.
|
34
|
+
# - The first field contains the term in the source language.
|
35
|
+
# - Subsequent fields are equivalent terms in destination languages.
|
36
|
+
# - The number of columns must be consistent for the entire table.
|
37
|
+
#
|
38
|
+
# The first element in a record is a term in the source language, and
|
39
|
+
# subsequent elements are equivalent terms in destination languages.
|
40
|
+
# The number of "columns" in a translation table must be consistent across
|
41
|
+
# the entire table.
|
53
42
|
#
|
54
43
|
def initialize(rules)
|
55
44
|
contents = rules.kind_of?(String) ? rules : rules.read
|
@@ -60,52 +49,48 @@ module Sanzang
|
|
60
49
|
separator = "|".encode(@encoding)
|
61
50
|
|
62
51
|
@records = contents.gsub("\r", "").split("\n").collect do |rec|
|
63
|
-
rec
|
52
|
+
rec.strip.gsub(left, "").gsub(right, "").split(separator)
|
64
53
|
end
|
65
54
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
raise "Column mismatch: Line #{i + 1}"
|
71
|
-
end
|
55
|
+
@width = records[0].length
|
56
|
+
0.upto(@records.length - 1) do |i|
|
57
|
+
if @records[i].length != @width
|
58
|
+
raise "Column mismatch: Line #{i + 1}"
|
72
59
|
end
|
73
|
-
else
|
74
|
-
@width = 0
|
75
60
|
end
|
76
61
|
|
77
|
-
@records.sort! {|x,y| y.length <=> x.length }
|
62
|
+
@records.sort! {|x,y| y[0].length <=> x[0].length }
|
78
63
|
end
|
79
64
|
|
80
|
-
# Retrieve a record by its numeric index.
|
81
|
-
# looking at the records attribute directly.
|
65
|
+
# Retrieve a record by its numeric index.
|
82
66
|
#
|
83
67
|
def [](index)
|
84
68
|
@records[index]
|
85
69
|
end
|
86
70
|
|
87
|
-
# Find
|
88
|
-
# parameter.
|
71
|
+
# Find a record by the source language term (first column).
|
89
72
|
#
|
90
73
|
def find(term)
|
91
74
|
@records.find {|rec| rec[0] == term }
|
92
75
|
end
|
93
76
|
|
94
|
-
# The number of records in the
|
77
|
+
# The number of records in the table
|
95
78
|
#
|
96
79
|
def length
|
97
80
|
@records.length
|
98
81
|
end
|
99
82
|
|
100
|
-
# The number of columns in the
|
83
|
+
# The number of columns in the table
|
101
84
|
#
|
102
|
-
|
85
|
+
def width
|
86
|
+
@records[0].length
|
87
|
+
end
|
103
88
|
|
104
|
-
# The records for the translation table, as an
|
89
|
+
# The records for the translation table, as an array
|
105
90
|
#
|
106
91
|
attr_reader :records
|
107
92
|
|
108
|
-
# The text encoding used for all translation table data
|
93
|
+
# The text encoding used for all translation table data
|
109
94
|
#
|
110
95
|
attr_reader :encoding
|
111
96
|
|
data/lib/sanzang/translator.rb
CHANGED
@@ -16,22 +16,13 @@
|
|
16
16
|
# You should have received a copy of the GNU General Public License along with
|
17
17
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
18
18
|
|
19
|
-
begin
|
20
|
-
require "parallel"
|
21
|
-
rescue LoadError
|
22
|
-
nil
|
23
|
-
end
|
24
|
-
|
25
19
|
module Sanzang
|
26
20
|
|
27
21
|
# Translator is the main class for performing text translations with Sanzang.
|
28
22
|
# A Translator utilizes a TranslationTable, which is passed to it at the time
|
29
23
|
# of creation. The Translator can then apply these translation rules,
|
30
24
|
# generate full translation listings, and perform translations by reading and
|
31
|
-
# writing to IO objects.
|
32
|
-
# utilize multiprocessing if the _Parallel_ module is available, and if the
|
33
|
-
# platform supports Kernel#fork. Methods are also available for querying the
|
34
|
-
# status of this functionality.
|
25
|
+
# writing to IO objects.
|
35
26
|
#
|
36
27
|
class Translator
|
37
28
|
|
@@ -43,28 +34,6 @@ module Sanzang
|
|
43
34
|
@table = translation_table
|
44
35
|
end
|
45
36
|
|
46
|
-
# Returns true if both the _Parallel_ module is available, and is also
|
47
|
-
# functioning on this particular implementation of Ruby. Currently the
|
48
|
-
# _mingw_ and _mswin_ ports of Ruby do not have Process#fork implemented.
|
49
|
-
#
|
50
|
-
def runs_parallel?
|
51
|
-
if not Process.respond_to?(:fork)
|
52
|
-
false
|
53
|
-
elsif defined?(Parallel) == "constant" and Parallel.class == Module
|
54
|
-
true
|
55
|
-
else
|
56
|
-
false
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
# Return the number of processors available on the current system. This
|
61
|
-
# will return the total number of logical processors, rather than physical
|
62
|
-
# processors.
|
63
|
-
#
|
64
|
-
def processor_count
|
65
|
-
runs_parallel? == true ? Parallel.processor_count : 1
|
66
|
-
end
|
67
|
-
|
68
37
|
# Return an Array of all translation rules used by a particular text.
|
69
38
|
# These records represent the vocabulary used by the text.
|
70
39
|
#
|
@@ -133,39 +102,6 @@ module Sanzang
|
|
133
102
|
output.close
|
134
103
|
end
|
135
104
|
|
136
|
-
# Translate a list of files to some output directory. If the _verbose_
|
137
|
-
# parameter is true, then print progress to STDERR. If the value of
|
138
|
-
# Translator#runs_parallel? is false, then the batch is processed
|
139
|
-
# sequentially, only utilizing one processor. However, if the value is
|
140
|
-
# true, then run the batch by utilizing the Parallel module for efficient
|
141
|
-
# multiprocessing.
|
142
|
-
#
|
143
|
-
def translate_batch(fpath_list, out_dir, verbose = true)
|
144
|
-
fpath_list.collect! {|f| f.chomp }
|
145
|
-
|
146
|
-
if not runs_parallel?
|
147
|
-
fpath_list.each do |in_fpath|
|
148
|
-
out_fpath = File.join(out_dir, File.basename(in_fpath))
|
149
|
-
translate_io(in_fpath, out_fpath)
|
150
|
-
if verbose
|
151
|
-
$stderr.write "[#{Process.pid}] #{File.expand_path(out_fpath)} \n"
|
152
|
-
$stderr.flush
|
153
|
-
end
|
154
|
-
out_fpath
|
155
|
-
end
|
156
|
-
else
|
157
|
-
Parallel.map(fpath_list) do |in_fpath|
|
158
|
-
out_fpath = File.join(out_dir, File.basename(in_fpath))
|
159
|
-
translate_io(in_fpath, out_fpath)
|
160
|
-
if verbose
|
161
|
-
$stderr.write "[#{Process.pid}] #{File.expand_path(out_fpath)} \n"
|
162
|
-
$stderr.flush
|
163
|
-
end
|
164
|
-
out_fpath
|
165
|
-
end
|
166
|
-
end
|
167
|
-
end
|
168
|
-
|
169
105
|
# The TranslationTable used by the Translator
|
170
106
|
#
|
171
107
|
attr_reader :table
|
data/lib/sanzang/version.rb
CHANGED
data/test/tc_reflow_encodings.rb
CHANGED
@@ -25,7 +25,7 @@ class TestReflowEncodings < Test::Unit::TestCase
|
|
25
25
|
text_s1.encode!(encoding)
|
26
26
|
text_s2.encode!(encoding)
|
27
27
|
formatter = Sanzang::TextFormatter.new
|
28
|
-
assert_equal(text_s2, formatter.
|
28
|
+
assert_equal(text_s2, formatter.reflow_cjk(text_s1))
|
29
29
|
end
|
30
30
|
|
31
31
|
# Han characters, simplified and without double vertical bar. The margin
|
@@ -38,7 +38,7 @@ class TestReflowEncodings < Test::Unit::TestCase
|
|
38
38
|
text_s1.encode!(encoding)
|
39
39
|
text_s2.encode!(encoding)
|
40
40
|
formatter = Sanzang::TextFormatter.new
|
41
|
-
assert_equal(text_s2, formatter.
|
41
|
+
assert_equal(text_s2, formatter.reflow_cjk(text_s1))
|
42
42
|
end
|
43
43
|
|
44
44
|
# UTF-8 (Traditional Chinese)
|
@@ -5,10 +5,6 @@ require "test/unit"
|
|
5
5
|
|
6
6
|
require_relative File.join("..", "lib", "sanzang")
|
7
7
|
|
8
|
-
# assert_nothing_raised
|
9
|
-
# assert_equal(x, y)
|
10
|
-
# assert(stmt, "Error message")
|
11
|
-
#
|
12
8
|
class TestSanzang < Test::Unit::TestCase
|
13
9
|
|
14
10
|
def table_string
|
@@ -45,7 +41,7 @@ class TestSanzang < Test::Unit::TestCase
|
|
45
41
|
def test_translation_table
|
46
42
|
table_path = File.join(File.dirname(__FILE__), "utf-8", "table.txt")
|
47
43
|
fin = File.open(table_path, "rb", encoding: "UTF-8")
|
48
|
-
table = Sanzang::TranslationTable.new(fin)
|
44
|
+
table = Sanzang::TranslationTable.new(fin.read)
|
49
45
|
fin.close
|
50
46
|
assert(table.width.class == Fixnum, "Table width undefined")
|
51
47
|
assert(table.length.class == Fixnum, "Table length undefined")
|
@@ -60,7 +56,7 @@ class TestSanzang < Test::Unit::TestCase
|
|
60
56
|
end
|
61
57
|
|
62
58
|
def test_reflow_cjk_string
|
63
|
-
text = Sanzang::TextFormatter.new.
|
59
|
+
text = Sanzang::TextFormatter.new.reflow_cjk(stage_1())
|
64
60
|
assert_equal(stage_2(), text)
|
65
61
|
end
|
66
62
|
|
@@ -74,22 +70,22 @@ class TestSanzang < Test::Unit::TestCase
|
|
74
70
|
table_path = File.join(File.dirname(__FILE__), "utf-8", "table.txt")
|
75
71
|
s2_path = File.join(File.dirname(__FILE__), "utf-8", "stage_2.txt")
|
76
72
|
s3_path = File.join(File.dirname(__FILE__), "utf-8", "stage_3.txt")
|
77
|
-
table = Sanzang::TranslationTable.new(table_path)
|
73
|
+
table = Sanzang::TranslationTable.new(IO.read(table_path))
|
78
74
|
translator = Sanzang::Translator.new(table)
|
79
75
|
translator.translate_io(s2_path, s3_path)
|
80
76
|
end
|
81
77
|
|
82
78
|
def test_translator_parallel
|
83
79
|
table = Sanzang::TranslationTable.new(table_string())
|
84
|
-
|
85
|
-
|
86
|
-
assert(
|
80
|
+
bt = Sanzang::BatchTranslator.new(table)
|
81
|
+
bt.forking?
|
82
|
+
assert(bt.processor_count > 0, "Processor count less than zero")
|
87
83
|
end
|
88
84
|
|
89
85
|
def test_translate_batch
|
90
86
|
table = Sanzang::TranslationTable.new(table_string())
|
91
|
-
|
92
|
-
|
87
|
+
bt = Sanzang::BatchTranslator.new(table)
|
88
|
+
bt.translate_to_dir(
|
93
89
|
Dir.glob(File.join(File.dirname(__FILE__), "utf-8", "file_*.txt")),
|
94
90
|
File.join(File.dirname(__FILE__), "utf-8", "batch"), false)
|
95
91
|
end
|