sanzang 0.0.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/HACKING +22 -40
- data/MANUAL +312 -0
- data/README +35 -265
- data/bin/{sanzang-reflow → sanzang} +1 -1
- data/lib/sanzang.rb +12 -35
- data/lib/sanzang/batch_translator.rb +77 -0
- data/lib/sanzang/command/batch.rb +131 -0
- data/lib/sanzang/command/reflow.rb +35 -32
- data/lib/sanzang/command/sanzang_cmd.rb +132 -0
- data/lib/sanzang/command/translate.rb +47 -70
- data/lib/sanzang/text_formatter.rb +1 -1
- data/lib/sanzang/translation_table.rb +34 -49
- data/lib/sanzang/translator.rb +1 -65
- data/lib/sanzang/version.rb +3 -2
- data/test/tc_reflow_encodings.rb +2 -2
- data/test/tc_simple_translation.rb +8 -12
- data/test/utf-8/stage_3.txt +4 -0
- metadata +25 -31
- data/bin/sanzang-translate +0 -21
@@ -24,27 +24,23 @@ require_relative File.join("..", "version")
|
|
24
24
|
|
25
25
|
module Sanzang::Command
|
26
26
|
|
27
|
-
#
|
28
|
-
# text
|
29
|
-
#
|
30
|
-
# this is to do initial text transformations to ensure (1) that terms will
|
31
|
-
# be translated reliably, and (2) that the final output of the translation
|
32
|
-
# will be readable by the user (i.e. lines not too long).
|
27
|
+
# This class provides a command for simple translation of one file or text.
|
28
|
+
# Input and output text can be passed through stdin and stdout, or through
|
29
|
+
# files. For mass translation of texts, see Sanzang::Command::Batch.
|
33
30
|
#
|
34
31
|
class Translate
|
35
32
|
|
36
33
|
# Create a new instance of the Translate class.
|
37
34
|
#
|
38
35
|
def initialize
|
39
|
-
@name = "sanzang
|
36
|
+
@name = "sanzang translate"
|
40
37
|
@encoding = nil
|
41
|
-
@batch_dir = nil
|
42
38
|
@infile = nil
|
43
39
|
@outfile = nil
|
44
40
|
end
|
45
41
|
|
46
|
-
# Run the
|
47
|
-
# would typically be an
|
42
|
+
# Run the translate command with the given arguments. The parameter _args_
|
43
|
+
# would typically be an array of command options and parameters. Calling
|
48
44
|
# this with the "-h" or "--help" option will print full usage information
|
49
45
|
# necessary for running this command.
|
50
46
|
#
|
@@ -53,7 +49,7 @@ module Sanzang::Command
|
|
53
49
|
parser.parse!(args)
|
54
50
|
|
55
51
|
if args.length != 1
|
56
|
-
puts parser
|
52
|
+
$stderr.puts parser
|
57
53
|
return 1
|
58
54
|
end
|
59
55
|
|
@@ -61,30 +57,22 @@ module Sanzang::Command
|
|
61
57
|
|
62
58
|
translator = nil
|
63
59
|
File.open(args[0], "rb", encoding: @encoding) do |table_file|
|
64
|
-
table = Sanzang::TranslationTable.new(table_file)
|
60
|
+
table = Sanzang::TranslationTable.new(table_file.read)
|
65
61
|
translator = Sanzang::Translator.new(table)
|
66
62
|
end
|
67
63
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
64
|
+
begin
|
65
|
+
fin = @infile ? File.open(@infile, "rb") : $stdin
|
66
|
+
fin.binmode.set_encoding(@encoding)
|
67
|
+
fout = @outfile ? File.open(@outfile, "wb") : $stdout
|
68
|
+
fout.binmode.set_encoding(@encoding)
|
69
|
+
translator.translate_io(fin, fout)
|
70
|
+
ensure
|
71
|
+
if defined?(fin) and fin != $stdin
|
72
|
+
fin.close if not fin.closed?
|
72
73
|
end
|
73
|
-
|
74
|
-
|
75
|
-
begin
|
76
|
-
fin = @infile ? File.open(@infile, "rb") : $stdin
|
77
|
-
fin.binmode.set_encoding(@encoding)
|
78
|
-
fout = @outfile ? File.open(@outfile, "wb") : $stdout
|
79
|
-
fout.binmode.set_encoding(@encoding)
|
80
|
-
translator.translate_io(fin, fout)
|
81
|
-
ensure
|
82
|
-
if defined?(fin) and fin != $stdin
|
83
|
-
fin.close if not fin.closed?
|
84
|
-
end
|
85
|
-
if defined?(fout) and fin != $stdout
|
86
|
-
fout.close if not fout.closed?
|
87
|
-
end
|
74
|
+
if defined?(fout) and fin != $stdout
|
75
|
+
fout.close if not fout.closed?
|
88
76
|
end
|
89
77
|
end
|
90
78
|
|
@@ -93,12 +81,14 @@ module Sanzang::Command
|
|
93
81
|
return err.status
|
94
82
|
rescue Exception => err
|
95
83
|
$stderr.puts err.backtrace
|
96
|
-
$stderr.puts "
|
84
|
+
$stderr.puts "\nERROR: #{err.inspect}\n\n"
|
97
85
|
return 1
|
98
86
|
end
|
99
87
|
|
100
88
|
private
|
101
89
|
|
90
|
+
# Initialize the encoding for text data if it is not already set
|
91
|
+
#
|
102
92
|
def set_data_encoding
|
103
93
|
if @encoding == nil
|
104
94
|
if Encoding.default_external == Encoding::IBM437
|
@@ -110,58 +100,45 @@ module Sanzang::Command
|
|
110
100
|
end
|
111
101
|
end
|
112
102
|
|
103
|
+
# An OptionParser for the command
|
104
|
+
#
|
113
105
|
def option_parser
|
114
|
-
OptionParser.new do |
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
pr.banner << "\nOptions:\n"
|
128
|
-
|
129
|
-
pr.on("-h", "--help", "show this help message and exit") do |v|
|
130
|
-
puts pr
|
106
|
+
OptionParser.new do |op|
|
107
|
+
op.banner = "Usage: #{@name} [options] table\n"
|
108
|
+
|
109
|
+
op.banner << "\nTranslate text using simple table rules. Input text "
|
110
|
+
op.banner << "is read from STDIN by\ndefault, and the output is "
|
111
|
+
op.banner << "written to STDOUT by default.\n"
|
112
|
+
|
113
|
+
op.banner << "\nExample:\n"
|
114
|
+
op.banner << " #{@name} -i text.txt -o text.sz.txt table.txt\n"
|
115
|
+
op.banner << "\nOptions:\n"
|
116
|
+
|
117
|
+
op.on("-h", "--help", "show this help message and exit") do |v|
|
118
|
+
puts op
|
131
119
|
exit 0
|
132
120
|
end
|
133
|
-
|
134
|
-
@batch_dir = v
|
135
|
-
end
|
136
|
-
pr.on("-E", "--encoding=ENC", "set data encoding to ENC") do |v|
|
121
|
+
op.on("-E", "--encoding=ENC", "set data encoding to ENC") do |v|
|
137
122
|
@encoding = Encoding.find(v)
|
138
123
|
end
|
139
|
-
|
140
|
-
|
124
|
+
op.on("-L", "--list-encodings", "list possible encodings") do |v|
|
125
|
+
encodings = Encoding.list.sort do |x,y|
|
126
|
+
x.to_s.upcase <=> y.to_s.upcase
|
127
|
+
end
|
128
|
+
puts encodings
|
141
129
|
exit 0
|
142
130
|
end
|
143
|
-
|
131
|
+
op.on("-i", "--infile=FILE", "read input text from FILE") do |v|
|
144
132
|
@infile = v
|
145
133
|
end
|
146
|
-
|
134
|
+
op.on("-o", "--outfile=FILE", "write output text to FILE") do |v|
|
147
135
|
@outfile = v
|
148
136
|
end
|
149
|
-
pr.on("-P", "--platform", "show platform information") do |v|
|
150
|
-
puts "Ruby version: #{RUBY_VERSION}"
|
151
|
-
puts "Ruby platform: #{RUBY_PLATFORM}"
|
152
|
-
puts "External encoding: #{Encoding.default_external}"
|
153
|
-
if Encoding.default_internal != nil
|
154
|
-
puts "Internal encoding: #{Encoding.default_internal}"
|
155
|
-
end
|
156
|
-
exit 0
|
157
|
-
end
|
158
|
-
pr.on("-V", "--version", "show version number and exit") do |v|
|
159
|
-
puts "Sanzang version: #{Sanzang::VERSION}"
|
160
|
-
exit 0
|
161
|
-
end
|
162
137
|
end
|
163
138
|
end
|
164
139
|
|
140
|
+
# Name of the command
|
141
|
+
#
|
165
142
|
attr_reader :name
|
166
143
|
|
167
144
|
end
|
@@ -36,7 +36,7 @@ module Sanzang
|
|
36
36
|
# kept separate. Following this, all newlines are removed, and the text is
|
37
37
|
# then reformatted according to the remaining punctuation and spacing.
|
38
38
|
#
|
39
|
-
def
|
39
|
+
def reflow_cjk(s)
|
40
40
|
source_encoding = s.encoding
|
41
41
|
s.encode!(Encoding::UTF_8)
|
42
42
|
|
@@ -15,41 +15,30 @@
|
|
15
15
|
#
|
16
16
|
# You should have received a copy of the GNU General Public License along with
|
17
17
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
18
|
-
|
18
|
+
|
19
19
|
module Sanzang
|
20
20
|
|
21
|
-
#
|
22
|
-
# Sanzang
|
23
|
-
#
|
24
|
-
# will then go through basic parsing to ensure the table data is in the
|
25
|
-
# correct format, and then the rules are reverse sorted by the length of the
|
26
|
-
# source language column. Thereafter, these rules are accessible through the
|
27
|
-
# ''records'' attribute, and metadata is available through other accessors
|
28
|
-
# and methods. It is the responsibility of Sanzang::Translator object to
|
29
|
-
# actually apply the rules of a TranslationTable to some text, as the table
|
30
|
-
# merely encapsulates a set of translation rules.
|
31
|
-
#
|
32
|
-
# The format for translation table data can be summarized as the following:
|
33
|
-
#
|
34
|
-
# * Plain text with one line per record
|
35
|
-
# * Records begin with "~|", end with "|~", and are delimited by "|".
|
36
|
-
# * The number of columns in each record must be consistent.
|
37
|
-
#
|
38
|
-
# An example of this format is the following:
|
39
|
-
#
|
40
|
-
# ~|zh-term1|en-term1|~
|
41
|
-
# ~|zh-term2|en-term2|~
|
42
|
-
# ~|zh-term3|en-term3|~
|
21
|
+
# A translation table encapsulates a set of rules for translating with
|
22
|
+
# the \Sanzang system. These are essentially read-only objects meant for
|
23
|
+
# storing well-defined translation table data.
|
43
24
|
#
|
44
25
|
class TranslationTable
|
45
26
|
|
46
|
-
#
|
47
|
-
#
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
#
|
52
|
-
#
|
27
|
+
# A table is created from a formatted string of translation rules. The
|
28
|
+
# string is in the format of delimited text. The text format can be
|
29
|
+
# summarized as follows:
|
30
|
+
#
|
31
|
+
# - Each line of text is a record for a translation rule.
|
32
|
+
# - Each record begins with "~|" and ends with "|~".
|
33
|
+
# - Fields in the record are separated by the "|" character.
|
34
|
+
# - The first field contains the term in the source language.
|
35
|
+
# - Subsequent fields are equivalent terms in destination languages.
|
36
|
+
# - The number of columns must be consistent for the entire table.
|
37
|
+
#
|
38
|
+
# The first element in a record is a term in the source language, and
|
39
|
+
# subsequent elements are equivalent terms in destination languages.
|
40
|
+
# The number of "columns" in a translation table must be consistent across
|
41
|
+
# the entire table.
|
53
42
|
#
|
54
43
|
def initialize(rules)
|
55
44
|
contents = rules.kind_of?(String) ? rules : rules.read
|
@@ -60,52 +49,48 @@ module Sanzang
|
|
60
49
|
separator = "|".encode(@encoding)
|
61
50
|
|
62
51
|
@records = contents.gsub("\r", "").split("\n").collect do |rec|
|
63
|
-
rec
|
52
|
+
rec.strip.gsub(left, "").gsub(right, "").split(separator)
|
64
53
|
end
|
65
54
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
raise "Column mismatch: Line #{i + 1}"
|
71
|
-
end
|
55
|
+
@width = records[0].length
|
56
|
+
0.upto(@records.length - 1) do |i|
|
57
|
+
if @records[i].length != @width
|
58
|
+
raise "Column mismatch: Line #{i + 1}"
|
72
59
|
end
|
73
|
-
else
|
74
|
-
@width = 0
|
75
60
|
end
|
76
61
|
|
77
|
-
@records.sort! {|x,y| y.length <=> x.length }
|
62
|
+
@records.sort! {|x,y| y[0].length <=> x[0].length }
|
78
63
|
end
|
79
64
|
|
80
|
-
# Retrieve a record by its numeric index.
|
81
|
-
# looking at the records attribute directly.
|
65
|
+
# Retrieve a record by its numeric index.
|
82
66
|
#
|
83
67
|
def [](index)
|
84
68
|
@records[index]
|
85
69
|
end
|
86
70
|
|
87
|
-
# Find
|
88
|
-
# parameter.
|
71
|
+
# Find a record by the source language term (first column).
|
89
72
|
#
|
90
73
|
def find(term)
|
91
74
|
@records.find {|rec| rec[0] == term }
|
92
75
|
end
|
93
76
|
|
94
|
-
# The number of records in the
|
77
|
+
# The number of records in the table
|
95
78
|
#
|
96
79
|
def length
|
97
80
|
@records.length
|
98
81
|
end
|
99
82
|
|
100
|
-
# The number of columns in the
|
83
|
+
# The number of columns in the table
|
101
84
|
#
|
102
|
-
|
85
|
+
def width
|
86
|
+
@records[0].length
|
87
|
+
end
|
103
88
|
|
104
|
-
# The records for the translation table, as an
|
89
|
+
# The records for the translation table, as an array
|
105
90
|
#
|
106
91
|
attr_reader :records
|
107
92
|
|
108
|
-
# The text encoding used for all translation table data
|
93
|
+
# The text encoding used for all translation table data
|
109
94
|
#
|
110
95
|
attr_reader :encoding
|
111
96
|
|
data/lib/sanzang/translator.rb
CHANGED
@@ -16,22 +16,13 @@
|
|
16
16
|
# You should have received a copy of the GNU General Public License along with
|
17
17
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
18
18
|
|
19
|
-
begin
|
20
|
-
require "parallel"
|
21
|
-
rescue LoadError
|
22
|
-
nil
|
23
|
-
end
|
24
|
-
|
25
19
|
module Sanzang
|
26
20
|
|
27
21
|
# Translator is the main class for performing text translations with Sanzang.
|
28
22
|
# A Translator utilizes a TranslationTable, which is passed to it at the time
|
29
23
|
# of creation. The Translator can then apply these translation rules,
|
30
24
|
# generate full translation listings, and perform translations by reading and
|
31
|
-
# writing to IO objects.
|
32
|
-
# utilize multiprocessing if the _Parallel_ module is available, and if the
|
33
|
-
# platform supports Kernel#fork. Methods are also available for querying the
|
34
|
-
# status of this functionality.
|
25
|
+
# writing to IO objects.
|
35
26
|
#
|
36
27
|
class Translator
|
37
28
|
|
@@ -43,28 +34,6 @@ module Sanzang
|
|
43
34
|
@table = translation_table
|
44
35
|
end
|
45
36
|
|
46
|
-
# Returns true if both the _Parallel_ module is available, and is also
|
47
|
-
# functioning on this particular implementation of Ruby. Currently the
|
48
|
-
# _mingw_ and _mswin_ ports of Ruby do not have Process#fork implemented.
|
49
|
-
#
|
50
|
-
def runs_parallel?
|
51
|
-
if not Process.respond_to?(:fork)
|
52
|
-
false
|
53
|
-
elsif defined?(Parallel) == "constant" and Parallel.class == Module
|
54
|
-
true
|
55
|
-
else
|
56
|
-
false
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
# Return the number of processors available on the current system. This
|
61
|
-
# will return the total number of logical processors, rather than physical
|
62
|
-
# processors.
|
63
|
-
#
|
64
|
-
def processor_count
|
65
|
-
runs_parallel? == true ? Parallel.processor_count : 1
|
66
|
-
end
|
67
|
-
|
68
37
|
# Return an Array of all translation rules used by a particular text.
|
69
38
|
# These records represent the vocabulary used by the text.
|
70
39
|
#
|
@@ -133,39 +102,6 @@ module Sanzang
|
|
133
102
|
output.close
|
134
103
|
end
|
135
104
|
|
136
|
-
# Translate a list of files to some output directory. If the _verbose_
|
137
|
-
# parameter is true, then print progress to STDERR. If the value of
|
138
|
-
# Translator#runs_parallel? is false, then the batch is processed
|
139
|
-
# sequentially, only utilizing one processor. However, if the value is
|
140
|
-
# true, then run the batch by utilizing the Parallel module for efficient
|
141
|
-
# multiprocessing.
|
142
|
-
#
|
143
|
-
def translate_batch(fpath_list, out_dir, verbose = true)
|
144
|
-
fpath_list.collect! {|f| f.chomp }
|
145
|
-
|
146
|
-
if not runs_parallel?
|
147
|
-
fpath_list.each do |in_fpath|
|
148
|
-
out_fpath = File.join(out_dir, File.basename(in_fpath))
|
149
|
-
translate_io(in_fpath, out_fpath)
|
150
|
-
if verbose
|
151
|
-
$stderr.write "[#{Process.pid}] #{File.expand_path(out_fpath)} \n"
|
152
|
-
$stderr.flush
|
153
|
-
end
|
154
|
-
out_fpath
|
155
|
-
end
|
156
|
-
else
|
157
|
-
Parallel.map(fpath_list) do |in_fpath|
|
158
|
-
out_fpath = File.join(out_dir, File.basename(in_fpath))
|
159
|
-
translate_io(in_fpath, out_fpath)
|
160
|
-
if verbose
|
161
|
-
$stderr.write "[#{Process.pid}] #{File.expand_path(out_fpath)} \n"
|
162
|
-
$stderr.flush
|
163
|
-
end
|
164
|
-
out_fpath
|
165
|
-
end
|
166
|
-
end
|
167
|
-
end
|
168
|
-
|
169
105
|
# The TranslationTable used by the Translator
|
170
106
|
#
|
171
107
|
attr_reader :table
|
data/lib/sanzang/version.rb
CHANGED
data/test/tc_reflow_encodings.rb
CHANGED
@@ -25,7 +25,7 @@ class TestReflowEncodings < Test::Unit::TestCase
|
|
25
25
|
text_s1.encode!(encoding)
|
26
26
|
text_s2.encode!(encoding)
|
27
27
|
formatter = Sanzang::TextFormatter.new
|
28
|
-
assert_equal(text_s2, formatter.
|
28
|
+
assert_equal(text_s2, formatter.reflow_cjk(text_s1))
|
29
29
|
end
|
30
30
|
|
31
31
|
# Han characters, simplified and without double vertical bar. The margin
|
@@ -38,7 +38,7 @@ class TestReflowEncodings < Test::Unit::TestCase
|
|
38
38
|
text_s1.encode!(encoding)
|
39
39
|
text_s2.encode!(encoding)
|
40
40
|
formatter = Sanzang::TextFormatter.new
|
41
|
-
assert_equal(text_s2, formatter.
|
41
|
+
assert_equal(text_s2, formatter.reflow_cjk(text_s1))
|
42
42
|
end
|
43
43
|
|
44
44
|
# UTF-8 (Traditional Chinese)
|
@@ -5,10 +5,6 @@ require "test/unit"
|
|
5
5
|
|
6
6
|
require_relative File.join("..", "lib", "sanzang")
|
7
7
|
|
8
|
-
# assert_nothing_raised
|
9
|
-
# assert_equal(x, y)
|
10
|
-
# assert(stmt, "Error message")
|
11
|
-
#
|
12
8
|
class TestSanzang < Test::Unit::TestCase
|
13
9
|
|
14
10
|
def table_string
|
@@ -45,7 +41,7 @@ class TestSanzang < Test::Unit::TestCase
|
|
45
41
|
def test_translation_table
|
46
42
|
table_path = File.join(File.dirname(__FILE__), "utf-8", "table.txt")
|
47
43
|
fin = File.open(table_path, "rb", encoding: "UTF-8")
|
48
|
-
table = Sanzang::TranslationTable.new(fin)
|
44
|
+
table = Sanzang::TranslationTable.new(fin.read)
|
49
45
|
fin.close
|
50
46
|
assert(table.width.class == Fixnum, "Table width undefined")
|
51
47
|
assert(table.length.class == Fixnum, "Table length undefined")
|
@@ -60,7 +56,7 @@ class TestSanzang < Test::Unit::TestCase
|
|
60
56
|
end
|
61
57
|
|
62
58
|
def test_reflow_cjk_string
|
63
|
-
text = Sanzang::TextFormatter.new.
|
59
|
+
text = Sanzang::TextFormatter.new.reflow_cjk(stage_1())
|
64
60
|
assert_equal(stage_2(), text)
|
65
61
|
end
|
66
62
|
|
@@ -74,22 +70,22 @@ class TestSanzang < Test::Unit::TestCase
|
|
74
70
|
table_path = File.join(File.dirname(__FILE__), "utf-8", "table.txt")
|
75
71
|
s2_path = File.join(File.dirname(__FILE__), "utf-8", "stage_2.txt")
|
76
72
|
s3_path = File.join(File.dirname(__FILE__), "utf-8", "stage_3.txt")
|
77
|
-
table = Sanzang::TranslationTable.new(table_path)
|
73
|
+
table = Sanzang::TranslationTable.new(IO.read(table_path))
|
78
74
|
translator = Sanzang::Translator.new(table)
|
79
75
|
translator.translate_io(s2_path, s3_path)
|
80
76
|
end
|
81
77
|
|
82
78
|
def test_translator_parallel
|
83
79
|
table = Sanzang::TranslationTable.new(table_string())
|
84
|
-
|
85
|
-
|
86
|
-
assert(
|
80
|
+
bt = Sanzang::BatchTranslator.new(table)
|
81
|
+
bt.forking?
|
82
|
+
assert(bt.processor_count > 0, "Processor count less than zero")
|
87
83
|
end
|
88
84
|
|
89
85
|
def test_translate_batch
|
90
86
|
table = Sanzang::TranslationTable.new(table_string())
|
91
|
-
|
92
|
-
|
87
|
+
bt = Sanzang::BatchTranslator.new(table)
|
88
|
+
bt.translate_to_dir(
|
93
89
|
Dir.glob(File.join(File.dirname(__FILE__), "utf-8", "file_*.txt")),
|
94
90
|
File.join(File.dirname(__FILE__), "utf-8", "batch"), false)
|
95
91
|
end
|