sanzang 0.0.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -24,27 +24,23 @@ require_relative File.join("..", "version")
24
24
 
25
25
  module Sanzang::Command
26
26
 
27
- # The Sanzang::Command::Reflow class provides a Unix-style command for
28
- # text reformatting. This reformatting is typically for use prior to
29
- # processing the text with the Sanzang::Command::Translate. The reason for
30
- # this is to do initial text transformations to ensure (1) that terms will
31
- # be translated reliably, and (2) that the final output of the translation
32
- # will be readable by the user (i.e. lines not too long).
27
+ # This class provides a command for simple translation of one file or text.
28
+ # Input is read from and output is written to either stdin and stdout, or
29
+ # files. For mass translation of texts, see Sanzang::Command::Batch.
33
30
  #
34
31
  class Translate
35
32
 
36
33
  # Create a new instance of the Translate class.
37
34
  #
38
35
  def initialize
39
- @name = "sanzang-translate"
36
+ @name = "sanzang translate"
40
37
  @encoding = nil
41
- @batch_dir = nil
42
38
  @infile = nil
43
39
  @outfile = nil
44
40
  end
45
41
 
46
- # Run the Translate command with the given arguments. The parameter _args_
47
- # would typically be an Array of Unix-style command parameters. Calling
42
+ # Run the translate command with the given arguments. The parameter _args_
43
+ # would typically be an array of command options and parameters. Calling
48
44
  # this with the "-h" or "--help" option will print full usage information
49
45
  # necessary for running this command.
50
46
  #
@@ -53,7 +49,7 @@ module Sanzang::Command
53
49
  parser.parse!(args)
54
50
 
55
51
  if args.length != 1
56
- puts parser
52
+ $stderr.puts parser
57
53
  return 1
58
54
  end
59
55
 
@@ -61,30 +57,22 @@ module Sanzang::Command
61
57
 
62
58
  translator = nil
63
59
  File.open(args[0], "rb", encoding: @encoding) do |table_file|
64
- table = Sanzang::TranslationTable.new(table_file)
60
+ table = Sanzang::TranslationTable.new(table_file.read)
65
61
  translator = Sanzang::Translator.new(table)
66
62
  end
67
63
 
68
- if @batch_dir != nil
69
- $stderr.puts "Batch mode (#{translator.processor_count} processors)"
70
- if not translator.runs_parallel?
71
- warn 'Gem not available: "parallel"'
64
+ begin
65
+ fin = @infile ? File.open(@infile, "rb") : $stdin
66
+ fin.binmode.set_encoding(@encoding)
67
+ fout = @outfile ? File.open(@outfile, "wb") : $stdout
68
+ fout.binmode.set_encoding(@encoding)
69
+ translator.translate_io(fin, fout)
70
+ ensure
71
+ if defined?(fin) and fin != $stdin
72
+ fin.close if not fin.closed?
72
73
  end
73
- puts translator.translate_batch($stdin.readlines, @batch_dir)
74
- else
75
- begin
76
- fin = @infile ? File.open(@infile, "rb") : $stdin
77
- fin.binmode.set_encoding(@encoding)
78
- fout = @outfile ? File.open(@outfile, "wb") : $stdout
79
- fout.binmode.set_encoding(@encoding)
80
- translator.translate_io(fin, fout)
81
- ensure
82
- if defined?(fin) and fin != $stdin
83
- fin.close if not fin.closed?
84
- end
85
- if defined?(fout) and fin != $stdout
86
- fout.close if not fout.closed?
87
- end
74
+ if defined?(fout) and fin != $stdout
75
+ fout.close if not fout.closed?
88
76
  end
89
77
  end
90
78
 
@@ -93,12 +81,14 @@ module Sanzang::Command
93
81
  return err.status
94
82
  rescue Exception => err
95
83
  $stderr.puts err.backtrace
96
- $stderr.puts "ERROR: #{err.inspect}"
84
+ $stderr.puts "\nERROR: #{err.inspect}\n\n"
97
85
  return 1
98
86
  end
99
87
 
100
88
  private
101
89
 
90
+ # Initialize the encoding for text data if it is not already set
91
+ #
102
92
  def set_data_encoding
103
93
  if @encoding == nil
104
94
  if Encoding.default_external == Encoding::IBM437
@@ -110,58 +100,45 @@ module Sanzang::Command
110
100
  end
111
101
  end
112
102
 
103
+ # An OptionParser for the command
104
+ #
113
105
  def option_parser
114
- OptionParser.new do |pr|
115
- pr.banner = "Usage: #{@name} [options] table\n"
116
- pr.banner << "Usage: #{@name} -B output_dir table < file_list\n"
117
-
118
- pr.banner << "\nTranslate text using simple table rules. Input text "
119
- pr.banner << "is read from STDIN by\ndefault, and the output is "
120
- pr.banner << "written to STDOUT by default. In batch mode, the \n"
121
- pr.banner << "program reads file paths from STDIN, and writes them "
122
- pr.banner << "to an output directory.\n"
123
-
124
- pr.banner << "\nExamples:\n"
125
- pr.banner << " #{@name} -i text.txt -o text.sz.txt table.txt\n"
126
- pr.banner << " #{@name} -B output_dir table.txt < myfiles.txt\n"
127
- pr.banner << "\nOptions:\n"
128
-
129
- pr.on("-h", "--help", "show this help message and exit") do |v|
130
- puts pr
106
+ OptionParser.new do |op|
107
+ op.banner = "Usage: #{@name} [options] table\n"
108
+
109
+ op.banner << "\nTranslate text using simple table rules. Input text "
110
+ op.banner << "is read from STDIN by\ndefault, and the output is "
111
+ op.banner << "written to STDOUT by default.\n"
112
+
113
+ op.banner << "\nExample:\n"
114
+ op.banner << " #{@name} -i text.txt -o text.sz.txt table.txt\n"
115
+ op.banner << "\nOptions:\n"
116
+
117
+ op.on("-h", "--help", "show this help message and exit") do |v|
118
+ puts op
131
119
  exit 0
132
120
  end
133
- pr.on("-B", "--batch-dir=DIR", "process from a queue into DIR") do |v|
134
- @batch_dir = v
135
- end
136
- pr.on("-E", "--encoding=ENC", "set data encoding to ENC") do |v|
121
+ op.on("-E", "--encoding=ENC", "set data encoding to ENC") do |v|
137
122
  @encoding = Encoding.find(v)
138
123
  end
139
- pr.on("-L", "--list-encodings", "list possible encodings") do |v|
140
- puts(Encoding.list.collect {|e| e.to_s }.sort)
124
+ op.on("-L", "--list-encodings", "list possible encodings") do |v|
125
+ encodings = Encoding.list.sort do |x,y|
126
+ x.to_s.upcase <=> y.to_s.upcase
127
+ end
128
+ puts encodings
141
129
  exit 0
142
130
  end
143
- pr.on("-i", "--infile=FILE", "read input text from FILE") do |v|
131
+ op.on("-i", "--infile=FILE", "read input text from FILE") do |v|
144
132
  @infile = v
145
133
  end
146
- pr.on("-o", "--outfile=FILE", "write output text to FILE") do |v|
134
+ op.on("-o", "--outfile=FILE", "write output text to FILE") do |v|
147
135
  @outfile = v
148
136
  end
149
- pr.on("-P", "--platform", "show platform information") do |v|
150
- puts "Ruby version: #{RUBY_VERSION}"
151
- puts "Ruby platform: #{RUBY_PLATFORM}"
152
- puts "External encoding: #{Encoding.default_external}"
153
- if Encoding.default_internal != nil
154
- puts "Internal encoding: #{Encoding.default_internal}"
155
- end
156
- exit 0
157
- end
158
- pr.on("-V", "--version", "show version number and exit") do |v|
159
- puts "Sanzang version: #{Sanzang::VERSION}"
160
- exit 0
161
- end
162
137
  end
163
138
  end
164
139
 
140
+ # Name of the command
141
+ #
165
142
  attr_reader :name
166
143
 
167
144
  end
@@ -36,7 +36,7 @@ module Sanzang
36
36
  # kept separate. Following this, all newlines are removed, and the text is
37
37
  # then reformatted according to the remaining punctuation and spacing.
38
38
  #
39
- def reflow_cjk_text(s)
39
+ def reflow_cjk(s)
40
40
  source_encoding = s.encoding
41
41
  s.encode!(Encoding::UTF_8)
42
42
 
@@ -15,41 +15,30 @@
15
15
  #
16
16
  # You should have received a copy of the GNU General Public License along with
17
17
  # this program. If not, see <http://www.gnu.org/licenses/>.
18
- #
18
+
19
19
  module Sanzang
20
20
 
21
- # TranslationTable encapsulates the set of rules used for translation by
22
- # Sanzang::Translator. These rules may be loaded from a string passed in to
23
- # the constructor, or loaded from an open IO object. The translation rules
24
- # will then go through basic parsing to ensure the table data is in the
25
- # correct format, and then the rules are reverse sorted by the length of the
26
- # source language column. Thereafter, these rules are accessible through the
27
- # ''records'' attribute, and metadata is available through other accessors
28
- # and methods. It is the responsibility of Sanzang::Translator object to
29
- # actually apply the rules of a TranslationTable to some text, as the table
30
- # merely encapsulates a set of translation rules.
31
- #
32
- # The format for translation table data can be summarized as the following:
33
- #
34
- # * Plain text with one line per record
35
- # * Records begin with "~|", end with "|~", and are delimited by "|".
36
- # * The number of columns in each record must be consistent.
37
- #
38
- # An example of this format is the following:
39
- #
40
- # ~|zh-term1|en-term1|~
41
- # ~|zh-term2|en-term2|~
42
- # ~|zh-term3|en-term3|~
21
+ # A translation table encapsulates a set of rules for translating with
22
+ # the \Sanzang system. These are essentially read-only objects meant for
23
+ # storing well-defined translation table data.
43
24
  #
44
25
  class TranslationTable
45
26
 
46
- # Create a new TranslationTable object from a string or by reading an IO
47
- # object. If the table parameter is a kind of string, then attempt to parse
48
- # the table data from this string. Otherwise treat the parameter as an open
49
- # IO object, and attempt to read the string data from that. After loading
50
- # and verifying the contents of the translation table, all the records are
51
- # reverse sorted by length, since this is the order in which they will be
52
- # applied.
27
+ # A table is created from a formatted string of translation rules. The
28
+ # string is in the format of delimited text. The text format can be
29
+ # summarized as follows:
30
+ #
31
+ # - Each line of text is a record for a translation rule.
32
+ # - Each record begins with "~|" and ends with "|~".
33
+ # - Fields in the record are separated by the "|" character.
34
+ # - The first field contains the term in the source language.
35
+ # - Subsequent fields are equivalent terms in destination languages.
36
+ # - The number of columns must be consistent for the entire table.
37
+ #
38
+ # The first element in a record is a term in the source language, and
39
+ # subsequent elements are equivalent terms in destination languages.
40
+ # The number of "columns" in a translation table must be consistent across
41
+ # the entire table.
53
42
  #
54
43
  def initialize(rules)
55
44
  contents = rules.kind_of?(String) ? rules : rules.read
@@ -60,52 +49,48 @@ module Sanzang
60
49
  separator = "|".encode(@encoding)
61
50
 
62
51
  @records = contents.gsub("\r", "").split("\n").collect do |rec|
63
- rec = rec.strip.gsub(left, "").gsub(right, "").split(separator)
52
+ rec.strip.gsub(left, "").gsub(right, "").split(separator)
64
53
  end
65
54
 
66
- if @records.length > 0
67
- @width = records[0].length
68
- 0.upto(@records.length - 1) do |i|
69
- if @records[i].length != @width
70
- raise "Column mismatch: Line #{i + 1}"
71
- end
55
+ @width = records[0].length
56
+ 0.upto(@records.length - 1) do |i|
57
+ if @records[i].length != @width
58
+ raise "Column mismatch: Line #{i + 1}"
72
59
  end
73
- else
74
- @width = 0
75
60
  end
76
61
 
77
- @records.sort! {|x,y| y.length <=> x.length }
62
+ @records.sort! {|x,y| y[0].length <=> x[0].length }
78
63
  end
79
64
 
80
- # Retrieve a record by its numeric index. This is just shorthand for
81
- # looking at the records attribute directly.
65
+ # Retrieve a record by its numeric index.
82
66
  #
83
67
  def [](index)
84
68
  @records[index]
85
69
  end
86
70
 
87
- # Find the record where the source language field is equal to the given
88
- # parameter.
71
+ # Find a record by the source language term (first column).
89
72
  #
90
73
  def find(term)
91
74
  @records.find {|rec| rec[0] == term }
92
75
  end
93
76
 
94
- # The number of records in the translation table (the table length).
77
+ # The number of records in the table
95
78
  #
96
79
  def length
97
80
  @records.length
98
81
  end
99
82
 
100
- # The number of columns in the translation table (the table width).
83
+ # The number of columns in the table
101
84
  #
102
- attr_reader :width
85
+ def width
86
+ @records[0].length
87
+ end
103
88
 
104
- # The records for the translation table, as an Array.
89
+ # The records for the translation table, as an array
105
90
  #
106
91
  attr_reader :records
107
92
 
108
- # The text encoding used for all translation table data.
93
+ # The text encoding used for all translation table data
109
94
  #
110
95
  attr_reader :encoding
111
96
 
@@ -16,22 +16,13 @@
16
16
  # You should have received a copy of the GNU General Public License along with
17
17
  # this program. If not, see <http://www.gnu.org/licenses/>.
18
18
 
19
- begin
20
- require "parallel"
21
- rescue LoadError
22
- nil
23
- end
24
-
25
19
  module Sanzang
26
20
 
27
21
  # Translator is the main class for performing text translations with Sanzang.
28
22
  # A Translator utilizes a TranslationTable, which is passed to it at the time
29
23
  # of creation. The Translator can then apply these translation rules,
30
24
  # generate full translation listings, and perform translations by reading and
31
- # writing to IO objects. Finally, Translator supports a batch mode that can
32
- # utilize multiprocessing if the _Parallel_ module is available, and if the
33
- # platform supports Kernel#fork. Methods are also available for querying the
34
- # status of this functionality.
25
+ # writing to IO objects.
35
26
  #
36
27
  class Translator
37
28
 
@@ -43,28 +34,6 @@ module Sanzang
43
34
  @table = translation_table
44
35
  end
45
36
 
46
- # Returns true if both the _Parallel_ module is available, and is also
47
- # functioning on this particular implementation of Ruby. Currently the
48
- # _mingw_ and _mswin_ ports of Ruby do not have Process#fork implemented.
49
- #
50
- def runs_parallel?
51
- if not Process.respond_to?(:fork)
52
- false
53
- elsif defined?(Parallel) == "constant" and Parallel.class == Module
54
- true
55
- else
56
- false
57
- end
58
- end
59
-
60
- # Return the number of processors available on the current system. This
61
- # will return the total number of logical processors, rather than physical
62
- # processors.
63
- #
64
- def processor_count
65
- runs_parallel? == true ? Parallel.processor_count : 1
66
- end
67
-
68
37
  # Return an Array of all translation rules used by a particular text.
69
38
  # These records represent the vocabulary used by the text.
70
39
  #
@@ -133,39 +102,6 @@ module Sanzang
133
102
  output.close
134
103
  end
135
104
 
136
- # Translate a list of files to some output directory. If the _verbose_
137
- # parameter is true, then print progress to STDERR. If the value of
138
- # Translator#runs_parallel? is false, then the batch is processed
139
- # sequentially, only utilizing one processor. However, if the value is
140
- # true, then run the batch by utilizing the Parallel module for efficient
141
- # multiprocessing.
142
- #
143
- def translate_batch(fpath_list, out_dir, verbose = true)
144
- fpath_list.collect! {|f| f.chomp }
145
-
146
- if not runs_parallel?
147
- fpath_list.each do |in_fpath|
148
- out_fpath = File.join(out_dir, File.basename(in_fpath))
149
- translate_io(in_fpath, out_fpath)
150
- if verbose
151
- $stderr.write "[#{Process.pid}] #{File.expand_path(out_fpath)} \n"
152
- $stderr.flush
153
- end
154
- out_fpath
155
- end
156
- else
157
- Parallel.map(fpath_list) do |in_fpath|
158
- out_fpath = File.join(out_dir, File.basename(in_fpath))
159
- translate_io(in_fpath, out_fpath)
160
- if verbose
161
- $stderr.write "[#{Process.pid}] #{File.expand_path(out_fpath)} \n"
162
- $stderr.flush
163
- end
164
- out_fpath
165
- end
166
- end
167
- end
168
-
169
105
  # The TranslationTable used by the Translator
170
106
  #
171
107
  attr_reader :table
@@ -18,7 +18,8 @@
18
18
 
19
19
  module Sanzang
20
20
 
21
- # The current version number of Sanzang.
22
- VERSION = "0.0.3"
21
+ # Current version number of Sanzang
22
+ #
23
+ VERSION = "1.0.0"
23
24
 
24
25
  end
@@ -25,7 +25,7 @@ class TestReflowEncodings < Test::Unit::TestCase
25
25
  text_s1.encode!(encoding)
26
26
  text_s2.encode!(encoding)
27
27
  formatter = Sanzang::TextFormatter.new
28
- assert_equal(text_s2, formatter.reflow_cjk_text(text_s1))
28
+ assert_equal(text_s2, formatter.reflow_cjk(text_s1))
29
29
  end
30
30
 
31
31
  # Han characters, simplified and without double vertical bar. The margin
@@ -38,7 +38,7 @@ class TestReflowEncodings < Test::Unit::TestCase
38
38
  text_s1.encode!(encoding)
39
39
  text_s2.encode!(encoding)
40
40
  formatter = Sanzang::TextFormatter.new
41
- assert_equal(text_s2, formatter.reflow_cjk_text(text_s1))
41
+ assert_equal(text_s2, formatter.reflow_cjk(text_s1))
42
42
  end
43
43
 
44
44
  # UTF-8 (Traditional Chinese)
@@ -5,10 +5,6 @@ require "test/unit"
5
5
 
6
6
  require_relative File.join("..", "lib", "sanzang")
7
7
 
8
- # assert_nothing_raised
9
- # assert_equal(x, y)
10
- # assert(stmt, "Error message")
11
- #
12
8
  class TestSanzang < Test::Unit::TestCase
13
9
 
14
10
  def table_string
@@ -45,7 +41,7 @@ class TestSanzang < Test::Unit::TestCase
45
41
  def test_translation_table
46
42
  table_path = File.join(File.dirname(__FILE__), "utf-8", "table.txt")
47
43
  fin = File.open(table_path, "rb", encoding: "UTF-8")
48
- table = Sanzang::TranslationTable.new(fin)
44
+ table = Sanzang::TranslationTable.new(fin.read)
49
45
  fin.close
50
46
  assert(table.width.class == Fixnum, "Table width undefined")
51
47
  assert(table.length.class == Fixnum, "Table length undefined")
@@ -60,7 +56,7 @@ class TestSanzang < Test::Unit::TestCase
60
56
  end
61
57
 
62
58
  def test_reflow_cjk_string
63
- text = Sanzang::TextFormatter.new.reflow_cjk_text(stage_1())
59
+ text = Sanzang::TextFormatter.new.reflow_cjk(stage_1())
64
60
  assert_equal(stage_2(), text)
65
61
  end
66
62
 
@@ -74,22 +70,22 @@ class TestSanzang < Test::Unit::TestCase
74
70
  table_path = File.join(File.dirname(__FILE__), "utf-8", "table.txt")
75
71
  s2_path = File.join(File.dirname(__FILE__), "utf-8", "stage_2.txt")
76
72
  s3_path = File.join(File.dirname(__FILE__), "utf-8", "stage_3.txt")
77
- table = Sanzang::TranslationTable.new(table_path)
73
+ table = Sanzang::TranslationTable.new(IO.read(table_path))
78
74
  translator = Sanzang::Translator.new(table)
79
75
  translator.translate_io(s2_path, s3_path)
80
76
  end
81
77
 
82
78
  def test_translator_parallel
83
79
  table = Sanzang::TranslationTable.new(table_string())
84
- translator = Sanzang::Translator.new(table)
85
- translator.runs_parallel?
86
- assert(translator.processor_count > 0, "Processor count less than zero")
80
+ bt = Sanzang::BatchTranslator.new(table)
81
+ bt.forking?
82
+ assert(bt.processor_count > 0, "Processor count less than zero")
87
83
  end
88
84
 
89
85
  def test_translate_batch
90
86
  table = Sanzang::TranslationTable.new(table_string())
91
- translator = Sanzang::Translator.new(table)
92
- translator.translate_batch(
87
+ bt = Sanzang::BatchTranslator.new(table)
88
+ bt.translate_to_dir(
93
89
  Dir.glob(File.join(File.dirname(__FILE__), "utf-8", "file_*.txt")),
94
90
  File.join(File.dirname(__FILE__), "utf-8", "batch"), false)
95
91
  end