sanzang 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,168 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+ #--
4
+ # Copyright (C) 2012 Lapis Lazuli Texts
5
+ #
6
+ # This program is free software: you can redistribute it and/or modify it under
7
+ # the terms of the GNU General Public License as published by the Free Software
8
+ # Foundation, either version 3 of the License, or (at your option) any later
9
+ # version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful, but WITHOUT
12
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14
+ # details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License along with
17
+ # this program. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ require "optparse"
20
+
21
+ require_relative File.join("..", "translation_table")
22
+ require_relative File.join("..", "translator")
23
+ require_relative File.join("..", "version")
24
+
25
+ module Sanzang::Command
26
+
27
# The Sanzang::Command::Translate class provides a Unix-style command for
# translating text with a translation table. The table file named on the
# command line is loaded into a Sanzang::TranslationTable, which is handed to
# a Sanzang::Translator. The command then either translates a single stream
# (STDIN or "-i FILE" to STDOUT or "-o FILE"), or, in batch mode ("-B DIR"),
# reads a list of file paths from STDIN and translates each of them into the
# given directory.
#
class Translate

  # The name of the command, as it is shown in the usage message.
  #
  attr_reader :name

  # Create a new instance of the Translate class. All options start unset and
  # are filled in by the option parser when the command is run.
  #
  def initialize
    @name = "sanzang-translate"
    @encoding = nil
    @batch_dir = nil
    @infile = nil
    @outfile = nil
  end

  # Run the Translate command with the given arguments. The parameter _args_
  # would typically be an Array of Unix-style command parameters; recognized
  # options are removed from it during parsing. Calling this with the "-h" or
  # "--help" option will print full usage information necessary for running
  # this command.
  #
  # Returns an Integer exit status: 0 on success, 1 if the arguments are
  # wrong or any error occurs, or the status passed to Kernel#exit by one of
  # the informational options.
  #
  def run(args)
    parser = option_parser
    parser.parse!(args)

    # Exactly one positional argument (the table file) must remain.
    if args.length != 1
      puts parser
      return 1
    end

    set_data_encoding
    translator = load_translator(args[0])

    if @batch_dir
      run_batch(translator)
    else
      run_stream(translator)
    end

    0
  rescue SystemExit => err
    # Raised by the "exit 0" calls of the informational options.
    err.status
  rescue Exception => err
    # Deliberately broad: this is the command entry point, so every failure
    # is reported on STDERR and converted into a non-zero exit status.
    $stderr.puts err.backtrace
    $stderr.puts "ERROR: #{err.inspect}"
    1
  end

  private

  # Read the translation table file at _table_path_ using the selected data
  # encoding, and return a Translator built from it.
  #
  def load_translator(table_path)
    File.open(table_path, "rb", encoding: @encoding) do |table_file|
      Sanzang::Translator.new(Sanzang::TranslationTable.new(table_file))
    end
  end

  # Batch mode: read file paths from STDIN, translate each of them into the
  # batch directory, and print whatever Translator#translate_batch returns.
  #
  def run_batch(translator)
    $stderr.puts "Batch mode (#{translator.processor_count} processors)"
    warn 'Gem not available: "parallel"' unless translator.runs_parallel?
    puts translator.translate_batch($stdin.readlines, @batch_dir)
  end

  # Stream mode: translate the input file (or STDIN) to the output file (or
  # STDOUT). Files opened here are always closed again, even on error.
  #
  def run_stream(translator)
    fin = fout = nil
    fin = @infile ? File.open(@infile, "rb") : $stdin
    fin.binmode.set_encoding(@encoding)
    fout = @outfile ? File.open(@outfile, "wb") : $stdout
    fout.binmode.set_encoding(@encoding)
    translator.translate_io(fin, fout)
  ensure
    close_unless_standard(fin, $stdin)
    close_unless_standard(fout, $stdout)
  end

  # Close _io_ unless it was never opened (nil), is the given standard stream
  # (which this command does not own), or has already been closed.
  #
  def close_unless_standard(io, standard_stream)
    return if io.nil? || io.equal?(standard_stream)
    io.close unless io.closed?
  end

  # Choose the text data encoding if none was given with "-E". The external
  # default encoding is used, except that IBM437 is replaced by UTF-8.
  #
  def set_data_encoding
    return unless @encoding.nil?

    if Encoding.default_external == Encoding::IBM437
      $stderr.puts "Switching to UTF-8 for text data encoding."
      @encoding = Encoding::UTF_8
    else
      @encoding = Encoding.default_external
    end
  end

  # Build the usage banner shown by "-h" and on argument errors.
  #
  def usage_banner
    [
      "Usage: #{@name} [options] table\n",
      "Usage: #{@name} -B output_dir table < file_list\n",
      "\nTranslate text using simple table rules. Input text ",
      "is read from STDIN by\ndefault, and the output is ",
      "written to STDOUT by default. In batch mode, the \n",
      "program reads file paths from STDIN, and writes them ",
      "to an output directory.\n",
      "\nExamples:\n",
      " #{@name} -i text.txt -o text.sz.txt table.txt\n",
      # The batch directory is the argument of -B, so it precedes the table.
      " #{@name} -B output_dir table.txt < myfiles.txt\n",
      "\nOptions:\n"
    ].join
  end

  # Build the OptionParser for this command. Option handlers store their
  # values in instance variables; the informational options print to STDOUT
  # and then call Kernel#exit, which #run converts into a return status.
  #
  def option_parser
    OptionParser.new do |pr|
      pr.banner = usage_banner

      pr.on("-h", "--help", "show this help message and exit") do
        puts pr
        exit 0
      end
      pr.on("-B", "--batch-dir=DIR", "process from a queue into DIR") do |v|
        @batch_dir = v
      end
      pr.on("-E", "--encoding=ENC", "set data encoding to ENC") do |v|
        @encoding = Encoding.find(v)
      end
      pr.on("-L", "--list-encodings", "list possible encodings") do
        puts Encoding.list.map(&:to_s).sort
        exit 0
      end
      pr.on("-i", "--infile=FILE", "read input text from FILE") do |v|
        @infile = v
      end
      pr.on("-o", "--outfile=FILE", "write output text to FILE") do |v|
        @outfile = v
      end
      pr.on("-P", "--platform", "show platform information") do
        puts "Ruby version: #{RUBY_VERSION}"
        puts "Ruby platform: #{RUBY_PLATFORM}"
        puts "External encoding: #{Encoding.default_external}"
        unless Encoding.default_internal.nil?
          puts "Internal encoding: #{Encoding.default_internal}"
        end
        exit 0
      end
      pr.on("-V", "--version", "show version number and exit") do
        puts "Sanzang version: #{Sanzang::VERSION}"
        exit 0
      end
    end
  end

end
168
+ end
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+ #--
4
+ # Copyright (C) 2012 Lapis Lazuli Texts
5
+ #
6
+ # This program is free software: you can redistribute it and/or modify it under
7
+ # the terms of the GNU General Public License as published by the Free Software
8
+ # Foundation, either version 3 of the License, or (at your option) any later
9
+ # version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful, but WITHOUT
12
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14
+ # details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License along with
17
+ # this program. If not, see <http://www.gnu.org/licenses/>.
18
+
19
module Sanzang

  # This class handles formatting of text data especially to prepare the text
  # for direct translation. This involves reformatting and reflowing text so
  # that words are not divided between lines, and so the output is well suited
  # for humans. For practical purposes of readability, lines of text to be
  # translated should be succinct and easily comprehensible. The TextFormatter
  # class includes methods for accomplishing this reformatting.
  #
  class TextFormatter

    # Given a CJK string of text, reformat the string for greater compatibility
    # with direct translation, and reflow the text based on its punctuation.
    #
    # The steps, in order, are:
    #
    # 1. Remove any CBETA-style margin at the beginning of each line, i.e.
    #    everything up to and including the last double-bar character
    #    ("║" U+2551) on the line.
    # 2. Append a space to each short line (1 to 15 characters after a leading
    #    space), which may indicate that the line is part of a poem and should
    #    be kept separate.
    # 3. Remove all carriage returns and newlines.
    # 4. Insert line breaks according to the remaining punctuation and spacing.
    # 5. Make sure the text ends with a newline, and restore CRLF line endings
    #    if the input contained a carriage return.
    #
    # The work is done in UTF-8 on a copy of _s_, and the result is converted
    # back to the original encoding of _s_. If _s_ is not frozen, it is then
    # updated in place and returned, so existing callers that rely on the
    # argument being modified keep working. If _s_ is frozen, it is left
    # untouched and a new String is returned. In both cases _s_ is not
    # modified at all when an encoding conversion fails.
    #
    # NOTE(review): the comments below speak of a "Hanzi space"; confirm that
    # the space characters inside these patterns are the intended ideographic
    # space (U+3000) in the real file and have not been normalized.
    #
    def reflow_cjk_text(s)
      source_encoding = s.encoding
      text = s.encode(Encoding::UTF_8)

      # Strip all CBETA-style margins
      text.gsub!(/^.*║/, "")

      # Starts with Hanzi space and short line: add Hanzi space at the end.
      # This is used for avoiding conflicts between poetry and prose.
      text.gsub!(/^( )(.{1,15})$/, "\\1\\2 ")

      # Collapse all vertical whitespace, remembering the line ending style.
      using_crlf = text.include?("\r")
      text.gsub!(/(\r|\n)/, "")

      # Ender followed by non-ender: newline in between.
      text.gsub!(/([:,;。?!」』.;:\?])([^:,;。?!」』.;:\?])/,
                 "\\1\n\\2")

      # Non-starter, non-ender, followed by a starter: newline in between.
      text.gsub!(/([^「『 \t:,;。?!」』.;:\?\n])([「『 \t])/,
                 "\\1\n\\2")

      text << "\n" unless text.end_with?("\n")

      text.gsub!("\n", "\r\n") if using_crlf
      text.encode!(source_encoding)

      # Only write back once every step has succeeded.
      s.frozen? ? text : s.replace(text)
    end

  end
end
@@ -0,0 +1,113 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+ #--
4
+ # Copyright (C) 2012 Lapis Lazuli Texts
5
+ #
6
+ # This program is free software: you can redistribute it and/or modify it under
7
+ # the terms of the GNU General Public License as published by the Free Software
8
+ # Foundation, either version 3 of the License, or (at your option) any later
9
+ # version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful, but WITHOUT
12
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14
+ # details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License along with
17
+ # this program. If not, see <http://www.gnu.org/licenses/>.
18
+ #
19
module Sanzang

  # TranslationTable encapsulates the set of rules used for translation by
  # Sanzang::Translator. These rules may be loaded from a string passed in to
  # the constructor, or loaded from an open IO object. The translation rules
  # will then go through basic parsing to ensure the table data is in the
  # correct format, and then the rules are reverse sorted by the length of the
  # source language column. Thereafter, these rules are accessible through the
  # ''records'' attribute, and metadata is available through other accessors
  # and methods. It is the responsibility of Sanzang::Translator object to
  # actually apply the rules of a TranslationTable to some text, as the table
  # merely encapsulates a set of translation rules.
  #
  # The format for translation table data can be summarized as the following:
  #
  # * Plain text with one line per record
  # * Records begin with "~|", end with "|~", and are delimited by "|".
  # * The number of columns in each record must be consistent.
  #
  # An example of this format is the following:
  #
  #   ~|zh-term1|en-term1|~
  #   ~|zh-term2|en-term2|~
  #   ~|zh-term3|en-term3|~
  #
  class TranslationTable

    # The number of columns in the translation table (the table width).
    #
    attr_reader :width

    # The records for the translation table, as an Array of Arrays of String,
    # ordered from the longest source term to the shortest.
    #
    attr_reader :records

    # The text encoding used for all translation table data.
    #
    attr_reader :encoding

    # Create a new TranslationTable object from a string or by reading an IO
    # object. If the _rules_ parameter is a kind of String, then the table data
    # is parsed from this string. Otherwise the parameter is treated as an open
    # IO object, and the table data is read from it.
    #
    # Every line becomes one record. All records must have the same number of
    # columns as the first one; otherwise a RuntimeError is raised whose
    # message names the offending line. After verification, the records are
    # sorted so that longer source terms (first column) come first, since this
    # is the order in which they will be applied. Records whose source terms
    # have equal length keep their order from the table data.
    #
    def initialize(rules)
      contents = rules.kind_of?(String) ? rules : rules.read
      @encoding = contents.encoding

      # All delimiters are converted to the encoding of the table data.
      left = "~|".encode(@encoding)
      right = "|~".encode(@encoding)
      separator = "|".encode(@encoding)
      carriage_return = "\r".encode(@encoding)
      line_feed = "\n".encode(@encoding)

      @records = contents.gsub(carriage_return, "").split(line_feed).map do |line|
        line.strip.gsub(left, "").gsub(right, "").split(separator)
      end

      @width = @records.empty? ? 0 : @records.first.length
      @records.each_with_index do |record, i|
        raise "Column mismatch: Line #{i + 1}" if record.length != @width
      end

      # Longest source term first. The index keeps the sort stable, and to_s
      # guards against records without any columns (a table of blank lines).
      @records = @records.each_with_index.sort_by do |record, i|
        [-record.first.to_s.length, i]
      end.map(&:first)
    end

    # Retrieve a record by its numeric index. This is just shorthand for
    # looking at the records attribute directly.
    #
    def [](index)
      @records[index]
    end

    # Find the record where the source language field is equal to the given
    # parameter. Returns nil if there is no such record.
    #
    def find(term)
      @records.find { |rec| rec[0] == term }
    end

    # The number of records in the translation table (the table length).
    #
    def length
      @records.length
    end

  end
end
@@ -0,0 +1,174 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+ #--
4
+ # Copyright (C) 2012 Lapis Lazuli Texts
5
+ #
6
+ # This program is free software: you can redistribute it and/or modify it under
7
+ # the terms of the GNU General Public License as published by the Free Software
8
+ # Foundation, either version 3 of the License, or (at your option) any later
9
+ # version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful, but WITHOUT
12
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14
+ # details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License along with
17
+ # this program. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ begin
20
+ require "parallel"
21
+ rescue LoadError
22
+ nil
23
+ end
24
+
25
module Sanzang

  # Translator is the main class for performing text translations with Sanzang.
  # A Translator utilizes a TranslationTable, which is passed to it at the time
  # of creation. The Translator can then apply these translation rules,
  # generate full translation listings, and perform translations by reading and
  # writing to IO objects. Finally, Translator supports a batch mode that can
  # utilize multiprocessing if the _Parallel_ module is available, and if the
  # platform supports Kernel#fork. Methods are also available for querying the
  # status of this functionality.
  #
  class Translator

    # The TranslationTable used by the Translator
    #
    attr_reader :table

    # Creates a new Translator object with the given TranslationTable. The
    # TranslationTable stores rules for translation, while the Translator is
    # the worker who applies these rules and can create translation listings.
    #
    def initialize(translation_table)
      @table = translation_table
    end

    # Returns true if both the _Parallel_ module is available, and is also
    # functioning on this particular implementation of Ruby, i.e. the Process
    # module responds to _fork_. Returns false otherwise.
    #
    def runs_parallel?
      return false unless Process.respond_to?(:fork)

      defined?(Parallel) == "constant" && Parallel.class == Module
    end

    # Return the processor count reported by the _Parallel_ module, or 1 if
    # Translator#runs_parallel? is false.
    #
    def processor_count
      runs_parallel? ? Parallel.processor_count : 1
    end

    # Return an Array of all translation rules used by a particular text, in
    # the order of the translation table. These records represent the
    # vocabulary used by the text.
    #
    def text_vocab(source_text)
      @table.records.select { |record| source_text.include?(record[0]) }
    end

    # Use the TranslationTable of the Translator to create translations for
    # each destination language column of the translation table. The result
    # is a simple Array of String objects: the first is _source_text_ itself,
    # and each following String corresponds to a destination language column
    # in the TranslationTable. Rules are applied one after another in table
    # order. Target terms are inserted literally, so backslashes in them have
    # no special meaning.
    #
    def translate(source_text)
      vocab_terms = text_vocab(source_text)
      translations = (1...@table.width).map do |column_i|
        vocab_terms.each_with_object(String.new(source_text)) do |term, text|
          # Block form: a replacement String would have sequences such as
          # "\0" or "\\" interpreted by gsub.
          text.gsub!(term[0]) { term[column_i] }
        end
      end
      [source_text] + translations
    end

    # Generate a translation listing text string, in which the output of
    # Translator#translate is collated and numbered for reference purposes.
    # Every line of the source text is followed by the same line of each
    # translation, labelled "[line.column]", and then by an empty line. CRLF
    # line endings are used if the source text contains a carriage return.
    # This is the normal text listing output of the Sanzang Translator.
    #
    def gen_listing(source_text)
      newline = source_text.include?("\r") ? "\r\n" : "\n"
      texts = translate(source_text).map { |text| text.split(newline) }
      listing = "".encode(source_text.encoding)

      texts[0].length.times do |line_i|
        @table.width.times do |col_i|
          listing << "[#{line_i + 1}.#{col_i + 1}] #{texts[col_i][line_i]}"
          listing << newline
        end
        listing << newline
      end
      listing
    end

    # Read a text from _input_ and write its translation listing to _output_.
    # The parameters _input_ and _output_ can be either String objects or IO
    # objects. If they are strings, then they are interpreted as being file
    # paths, and the files are opened with the encoding of the translation
    # table. If they are not strings, then the I/O operations are performed on
    # them directly.
    #
    # Both _input_ and _output_ are closed before this method returns, also
    # when reading, translating or writing raises an error. Returns nil.
    #
    def translate_io(input, output)
      if input.is_a?(String)
        input = File.open(input, "r", external_encoding: @table.encoding)
      end
      begin
        if output.is_a?(String)
          output = File.open(output, "w", external_encoding: @table.encoding)
        end
        begin
          output.write(gen_listing(input.read))
        ensure
          output.close
        end
      ensure
        input.close
      end
      nil
    end

    # Translate a list of files to some output directory. Each entry of
    # _fpath_list_ is a file path, optionally ending with a line terminator;
    # blank entries are skipped. The list itself is not modified. Every file
    # is translated to a file of the same base name inside _out_dir_. If the
    # _verbose_ parameter is true, then print progress to STDERR.
    #
    # If the value of Translator#runs_parallel? is false, then the batch is
    # processed sequentially. However, if the value is true, then run the
    # batch by utilizing the Parallel module for multiprocessing.
    #
    # Returns an Array with the paths of the output files, in the order of the
    # input list.
    #
    def translate_batch(fpath_list, out_dir, verbose = true)
      in_fpaths = fpath_list.map(&:chomp).reject(&:empty?)

      if runs_parallel?
        Parallel.map(in_fpaths) do |in_fpath|
          translate_file(in_fpath, out_dir, verbose)
        end
      else
        in_fpaths.map do |in_fpath|
          translate_file(in_fpath, out_dir, verbose)
        end
      end
    end

    private

    # Translate the single file _in_fpath_ into _out_dir_, optionally report
    # the written file on STDERR, and return the path of the output file.
    #
    def translate_file(in_fpath, out_dir, verbose)
      out_fpath = File.join(out_dir, File.basename(in_fpath))
      translate_io(in_fpath, out_fpath)
      if verbose
        $stderr.write "[#{Process.pid}] #{File.expand_path(out_fpath)} \n"
        $stderr.flush
      end
      out_fpath
    end

  end
end