sanzang 0.0.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,27 +24,23 @@ require_relative File.join("..", "version")
24
24
 
25
25
  module Sanzang::Command
26
26
 
27
- # The Sanzang::Command::Reflow class provides a Unix-style command for
28
- # text reformatting. This reformatting is typically for use prior to
29
- # processing the text with the Sanzang::Command::Translate. The reason for
30
- # this is to do initial text transformations to ensure (1) that terms will
31
- # be translated reliably, and (2) that the final output of the translation
32
- # will be readable by the user (i.e. lines not too long).
27
+ # This class provides a command for simple translation of one file or text.
28
+ # Input is read from stdin or a file, and output is written to stdout or a
29
+ # file. For mass translation of texts, see Sanzang::Command::Batch.
33
30
  #
34
31
  class Translate
35
32
 
36
33
  # Create a new instance of the Translate class.
37
34
  #
38
35
  def initialize
39
- @name = "sanzang-translate"
36
+ @name = "sanzang translate"
40
37
  @encoding = nil
41
- @batch_dir = nil
42
38
  @infile = nil
43
39
  @outfile = nil
44
40
  end
45
41
 
46
- # Run the Translate command with the given arguments. The parameter _args_
47
- # would typically be an Array of Unix-style command parameters. Calling
42
+ # Run the translate command with the given arguments. The parameter _args_
43
+ # would typically be an array of command options and parameters. Calling
48
44
  # this with the "-h" or "--help" option will print full usage information
49
45
  # necessary for running this command.
50
46
  #
@@ -53,7 +49,7 @@ module Sanzang::Command
53
49
  parser.parse!(args)
54
50
 
55
51
  if args.length != 1
56
- puts parser
52
+ $stderr.puts parser
57
53
  return 1
58
54
  end
59
55
 
@@ -61,30 +57,22 @@ module Sanzang::Command
61
57
 
62
58
  translator = nil
63
59
  File.open(args[0], "rb", encoding: @encoding) do |table_file|
64
- table = Sanzang::TranslationTable.new(table_file)
60
+ table = Sanzang::TranslationTable.new(table_file.read)
65
61
  translator = Sanzang::Translator.new(table)
66
62
  end
67
63
 
68
- if @batch_dir != nil
69
- $stderr.puts "Batch mode (#{translator.processor_count} processors)"
70
- if not translator.runs_parallel?
71
- warn 'Gem not available: "parallel"'
64
+ begin
65
+ fin = @infile ? File.open(@infile, "rb") : $stdin
66
+ fin.binmode.set_encoding(@encoding)
67
+ fout = @outfile ? File.open(@outfile, "wb") : $stdout
68
+ fout.binmode.set_encoding(@encoding)
69
+ translator.translate_io(fin, fout)
70
+ ensure
71
+ if defined?(fin) and fin != $stdin
72
+ fin.close if not fin.closed?
72
73
  end
73
- puts translator.translate_batch($stdin.readlines, @batch_dir)
74
- else
75
- begin
76
- fin = @infile ? File.open(@infile, "rb") : $stdin
77
- fin.binmode.set_encoding(@encoding)
78
- fout = @outfile ? File.open(@outfile, "wb") : $stdout
79
- fout.binmode.set_encoding(@encoding)
80
- translator.translate_io(fin, fout)
81
- ensure
82
- if defined?(fin) and fin != $stdin
83
- fin.close if not fin.closed?
84
- end
85
- if defined?(fout) and fin != $stdout
86
- fout.close if not fout.closed?
87
- end
74
+ if defined?(fout) and fin != $stdout
75
+ fout.close if not fout.closed?
88
76
  end
89
77
  end
90
78
 
@@ -93,12 +81,14 @@ module Sanzang::Command
93
81
  return err.status
94
82
  rescue Exception => err
95
83
  $stderr.puts err.backtrace
96
- $stderr.puts "ERROR: #{err.inspect}"
84
+ $stderr.puts "\nERROR: #{err.inspect}\n\n"
97
85
  return 1
98
86
  end
99
87
 
100
88
  private
101
89
 
90
+ # Initialize the encoding for text data if it is not already set
91
+ #
102
92
  def set_data_encoding
103
93
  if @encoding == nil
104
94
  if Encoding.default_external == Encoding::IBM437
@@ -110,58 +100,45 @@ module Sanzang::Command
110
100
  end
111
101
  end
112
102
 
103
+ # An OptionParser for the command
104
+ #
113
105
  def option_parser
114
- OptionParser.new do |pr|
115
- pr.banner = "Usage: #{@name} [options] table\n"
116
- pr.banner << "Usage: #{@name} -B output_dir table < file_list\n"
117
-
118
- pr.banner << "\nTranslate text using simple table rules. Input text "
119
- pr.banner << "is read from STDIN by\ndefault, and the output is "
120
- pr.banner << "written to STDOUT by default. In batch mode, the \n"
121
- pr.banner << "program reads file paths from STDIN, and writes them "
122
- pr.banner << "to an output directory.\n"
123
-
124
- pr.banner << "\nExamples:\n"
125
- pr.banner << " #{@name} -i text.txt -o text.sz.txt table.txt\n"
126
- pr.banner << " #{@name} -B output_dir table.txt < myfiles.txt\n"
127
- pr.banner << "\nOptions:\n"
128
-
129
- pr.on("-h", "--help", "show this help message and exit") do |v|
130
- puts pr
106
+ OptionParser.new do |op|
107
+ op.banner = "Usage: #{@name} [options] table\n"
108
+
109
+ op.banner << "\nTranslate text using simple table rules. Input text "
110
+ op.banner << "is read from STDIN by\ndefault, and the output is "
111
+ op.banner << "written to STDOUT by default.\n"
112
+
113
+ op.banner << "\nExample:\n"
114
+ op.banner << " #{@name} -i text.txt -o text.sz.txt table.txt\n"
115
+ op.banner << "\nOptions:\n"
116
+
117
+ op.on("-h", "--help", "show this help message and exit") do |v|
118
+ puts op
131
119
  exit 0
132
120
  end
133
- pr.on("-B", "--batch-dir=DIR", "process from a queue into DIR") do |v|
134
- @batch_dir = v
135
- end
136
- pr.on("-E", "--encoding=ENC", "set data encoding to ENC") do |v|
121
+ op.on("-E", "--encoding=ENC", "set data encoding to ENC") do |v|
137
122
  @encoding = Encoding.find(v)
138
123
  end
139
- pr.on("-L", "--list-encodings", "list possible encodings") do |v|
140
- puts(Encoding.list.collect {|e| e.to_s }.sort)
124
+ op.on("-L", "--list-encodings", "list possible encodings") do |v|
125
+ encodings = Encoding.list.sort do |x,y|
126
+ x.to_s.upcase <=> y.to_s.upcase
127
+ end
128
+ puts encodings
141
129
  exit 0
142
130
  end
143
- pr.on("-i", "--infile=FILE", "read input text from FILE") do |v|
131
+ op.on("-i", "--infile=FILE", "read input text from FILE") do |v|
144
132
  @infile = v
145
133
  end
146
- pr.on("-o", "--outfile=FILE", "write output text to FILE") do |v|
134
+ op.on("-o", "--outfile=FILE", "write output text to FILE") do |v|
147
135
  @outfile = v
148
136
  end
149
- pr.on("-P", "--platform", "show platform information") do |v|
150
- puts "Ruby version: #{RUBY_VERSION}"
151
- puts "Ruby platform: #{RUBY_PLATFORM}"
152
- puts "External encoding: #{Encoding.default_external}"
153
- if Encoding.default_internal != nil
154
- puts "Internal encoding: #{Encoding.default_internal}"
155
- end
156
- exit 0
157
- end
158
- pr.on("-V", "--version", "show version number and exit") do |v|
159
- puts "Sanzang version: #{Sanzang::VERSION}"
160
- exit 0
161
- end
162
137
  end
163
138
  end
164
139
 
140
+ # Name of the command
141
+ #
165
142
  attr_reader :name
166
143
 
167
144
  end
@@ -36,7 +36,7 @@ module Sanzang
36
36
  # kept separate. Following this, all newlines are removed, and the text is
37
37
  # then reformatted according to the remaining punctuation and spacing.
38
38
  #
39
- def reflow_cjk_text(s)
39
+ def reflow_cjk(s)
40
40
  source_encoding = s.encoding
41
41
  s.encode!(Encoding::UTF_8)
42
42
 
@@ -15,41 +15,30 @@
15
15
  #
16
16
  # You should have received a copy of the GNU General Public License along with
17
17
  # this program. If not, see <http://www.gnu.org/licenses/>.
18
- #
18
+
19
19
  module Sanzang
20
20
 
21
- # TranslationTable encapsulates the set of rules used for translation by
22
- # Sanzang::Translator. These rules may be loaded from a string passed in to
23
- # the constructor, or loaded from an open IO object. The translation rules
24
- # will then go through basic parsing to ensure the table data is in the
25
- # correct format, and then the rules are reverse sorted by the length of the
26
- # source language column. Thereafter, these rules are accessible through the
27
- # ''records'' attribute, and metadata is available through other accessors
28
- # and methods. It is the responsibility of Sanzang::Translator object to
29
- # actually apply the rules of a TranslationTable to some text, as the table
30
- # merely encapsulates a set of translation rules.
31
- #
32
- # The format for translation table data can be summarized as the following:
33
- #
34
- # * Plain text with one line per record
35
- # * Records begin with "~|", end with "|~", and are delimited by "|".
36
- # * The number of columns in each record must be consistent.
37
- #
38
- # An example of this format is the following:
39
- #
40
- # ~|zh-term1|en-term1|~
41
- # ~|zh-term2|en-term2|~
42
- # ~|zh-term3|en-term3|~
21
+ # A translation table encapsulates a set of rules for translating with
22
+ # the \Sanzang system. These are essentially read-only objects meant for
23
+ # storing well-defined translation table data.
43
24
  #
44
25
  class TranslationTable
45
26
 
46
- # Create a new TranslationTable object from a string or by reading an IO
47
- # object. If the table parameter is a kind of string, then attempt to parse
48
- # the table data from this string. Otherwise treat the parameter as an open
49
- # IO object, and attempt to read the string data from that. After loading
50
- # and verifying the contents of the translation table, all the records are
51
- # reverse sorted by length, since this is the order in which they will be
52
- # applied.
27
+ # A table is created from a formatted string of translation rules. The
28
+ # string is in the format of delimited text. The text format can be
29
+ # summarized as follows:
30
+ #
31
+ # - Each line of text is a record for a translation rule.
32
+ # - Each record begins with "~|" and ends with "|~".
33
+ # - Fields in the record are separated by the "|" character.
34
+ # - The first field contains the term in the source language.
35
+ # - Subsequent fields are equivalent terms in destination languages.
36
+ # - The number of columns must be consistent for the entire table.
37
+ #
38
+ # The first element in a record is a term in the source language, and
39
+ # subsequent elements are equivalent terms in destination languages.
40
+ # The number of "columns" in a translation table must be consistent across
41
+ # the entire table.
53
42
  #
54
43
  def initialize(rules)
55
44
  contents = rules.kind_of?(String) ? rules : rules.read
@@ -60,52 +49,48 @@ module Sanzang
60
49
  separator = "|".encode(@encoding)
61
50
 
62
51
  @records = contents.gsub("\r", "").split("\n").collect do |rec|
63
- rec = rec.strip.gsub(left, "").gsub(right, "").split(separator)
52
+ rec.strip.gsub(left, "").gsub(right, "").split(separator)
64
53
  end
65
54
 
66
- if @records.length > 0
67
- @width = records[0].length
68
- 0.upto(@records.length - 1) do |i|
69
- if @records[i].length != @width
70
- raise "Column mismatch: Line #{i + 1}"
71
- end
55
+ @width = records[0].length
56
+ 0.upto(@records.length - 1) do |i|
57
+ if @records[i].length != @width
58
+ raise "Column mismatch: Line #{i + 1}"
72
59
  end
73
- else
74
- @width = 0
75
60
  end
76
61
 
77
- @records.sort! {|x,y| y.length <=> x.length }
62
+ @records.sort! {|x,y| y[0].length <=> x[0].length }
78
63
  end
79
64
 
80
- # Retrieve a record by its numeric index. This is just shorthand for
81
- # looking at the records attribute directly.
65
+ # Retrieve a record by its numeric index.
82
66
  #
83
67
  def [](index)
84
68
  @records[index]
85
69
  end
86
70
 
87
- # Find the record where the source language field is equal to the given
88
- # parameter.
71
+ # Find a record by the source language term (first column).
89
72
  #
90
73
  def find(term)
91
74
  @records.find {|rec| rec[0] == term }
92
75
  end
93
76
 
94
- # The number of records in the translation table (the table length).
77
+ # The number of records in the table
95
78
  #
96
79
  def length
97
80
  @records.length
98
81
  end
99
82
 
100
- # The number of columns in the translation table (the table width).
83
+ # The number of columns in the table
101
84
  #
102
- attr_reader :width
85
+ def width
86
+ @records[0].length
87
+ end
103
88
 
104
- # The records for the translation table, as an Array.
89
+ # The records for the translation table, as an array
105
90
  #
106
91
  attr_reader :records
107
92
 
108
- # The text encoding used for all translation table data.
93
+ # The text encoding used for all translation table data
109
94
  #
110
95
  attr_reader :encoding
111
96
 
@@ -16,22 +16,13 @@
16
16
  # You should have received a copy of the GNU General Public License along with
17
17
  # this program. If not, see <http://www.gnu.org/licenses/>.
18
18
 
19
- begin
20
- require "parallel"
21
- rescue LoadError
22
- nil
23
- end
24
-
25
19
  module Sanzang
26
20
 
27
21
  # Translator is the main class for performing text translations with Sanzang.
28
22
  # A Translator utilizes a TranslationTable, which is passed to it at the time
29
23
  # of creation. The Translator can then apply these translation rules,
30
24
  # generate full translation listings, and perform translations by reading and
31
- # writing to IO objects. Finally, Translator supports a batch mode that can
32
- # utilize multiprocessing if the _Parallel_ module is available, and if the
33
- # platform supports Kernel#fork. Methods are also available for querying the
34
- # status of this functionality.
25
+ # writing to IO objects.
35
26
  #
36
27
  class Translator
37
28
 
@@ -43,28 +34,6 @@ module Sanzang
43
34
  @table = translation_table
44
35
  end
45
36
 
46
- # Returns true if both the _Parallel_ module is available, and is also
47
- # functioning on this particular implementation of Ruby. Currently the
48
- # _mingw_ and _mswin_ ports of Ruby do not have Process#fork implemented.
49
- #
50
- def runs_parallel?
51
- if not Process.respond_to?(:fork)
52
- false
53
- elsif defined?(Parallel) == "constant" and Parallel.class == Module
54
- true
55
- else
56
- false
57
- end
58
- end
59
-
60
- # Return the number of processors available on the current system. This
61
- # will return the total number of logical processors, rather than physical
62
- # processors.
63
- #
64
- def processor_count
65
- runs_parallel? == true ? Parallel.processor_count : 1
66
- end
67
-
68
37
  # Return an Array of all translation rules used by a particular text.
69
38
  # These records represent the vocabulary used by the text.
70
39
  #
@@ -133,39 +102,6 @@ module Sanzang
133
102
  output.close
134
103
  end
135
104
 
136
- # Translate a list of files to some output directory. If the _verbose_
137
- # parameter is true, then print progress to STDERR. If the value of
138
- # Translator#runs_parallel? is false, then the batch is processed
139
- # sequentially, only utilizing one processor. However, if the value is
140
- # true, then run the batch by utilizing the Parallel module for efficient
141
- # multiprocessing.
142
- #
143
- def translate_batch(fpath_list, out_dir, verbose = true)
144
- fpath_list.collect! {|f| f.chomp }
145
-
146
- if not runs_parallel?
147
- fpath_list.each do |in_fpath|
148
- out_fpath = File.join(out_dir, File.basename(in_fpath))
149
- translate_io(in_fpath, out_fpath)
150
- if verbose
151
- $stderr.write "[#{Process.pid}] #{File.expand_path(out_fpath)} \n"
152
- $stderr.flush
153
- end
154
- out_fpath
155
- end
156
- else
157
- Parallel.map(fpath_list) do |in_fpath|
158
- out_fpath = File.join(out_dir, File.basename(in_fpath))
159
- translate_io(in_fpath, out_fpath)
160
- if verbose
161
- $stderr.write "[#{Process.pid}] #{File.expand_path(out_fpath)} \n"
162
- $stderr.flush
163
- end
164
- out_fpath
165
- end
166
- end
167
- end
168
-
169
105
  # The TranslationTable used by the Translator
170
106
  #
171
107
  attr_reader :table
@@ -18,7 +18,8 @@
18
18
 
19
19
  module Sanzang
20
20
 
21
- # The current version number of Sanzang.
22
- VERSION = "0.0.3"
21
+ # Current version number of Sanzang
22
+ #
23
+ VERSION = "1.0.0"
23
24
 
24
25
  end
@@ -25,7 +25,7 @@ class TestReflowEncodings < Test::Unit::TestCase
25
25
  text_s1.encode!(encoding)
26
26
  text_s2.encode!(encoding)
27
27
  formatter = Sanzang::TextFormatter.new
28
- assert_equal(text_s2, formatter.reflow_cjk_text(text_s1))
28
+ assert_equal(text_s2, formatter.reflow_cjk(text_s1))
29
29
  end
30
30
 
31
31
  # Han characters, simplified and without double vertical bar. The margin
@@ -38,7 +38,7 @@ class TestReflowEncodings < Test::Unit::TestCase
38
38
  text_s1.encode!(encoding)
39
39
  text_s2.encode!(encoding)
40
40
  formatter = Sanzang::TextFormatter.new
41
- assert_equal(text_s2, formatter.reflow_cjk_text(text_s1))
41
+ assert_equal(text_s2, formatter.reflow_cjk(text_s1))
42
42
  end
43
43
 
44
44
  # UTF-8 (Traditional Chinese)
@@ -5,10 +5,6 @@ require "test/unit"
5
5
 
6
6
  require_relative File.join("..", "lib", "sanzang")
7
7
 
8
- # assert_nothing_raised
9
- # assert_equal(x, y)
10
- # assert(stmt, "Error message")
11
- #
12
8
  class TestSanzang < Test::Unit::TestCase
13
9
 
14
10
  def table_string
@@ -45,7 +41,7 @@ class TestSanzang < Test::Unit::TestCase
45
41
  def test_translation_table
46
42
  table_path = File.join(File.dirname(__FILE__), "utf-8", "table.txt")
47
43
  fin = File.open(table_path, "rb", encoding: "UTF-8")
48
- table = Sanzang::TranslationTable.new(fin)
44
+ table = Sanzang::TranslationTable.new(fin.read)
49
45
  fin.close
50
46
  assert(table.width.class == Fixnum, "Table width undefined")
51
47
  assert(table.length.class == Fixnum, "Table length undefined")
@@ -60,7 +56,7 @@ class TestSanzang < Test::Unit::TestCase
60
56
  end
61
57
 
62
58
  def test_reflow_cjk_string
63
- text = Sanzang::TextFormatter.new.reflow_cjk_text(stage_1())
59
+ text = Sanzang::TextFormatter.new.reflow_cjk(stage_1())
64
60
  assert_equal(stage_2(), text)
65
61
  end
66
62
 
@@ -74,22 +70,22 @@ class TestSanzang < Test::Unit::TestCase
74
70
  table_path = File.join(File.dirname(__FILE__), "utf-8", "table.txt")
75
71
  s2_path = File.join(File.dirname(__FILE__), "utf-8", "stage_2.txt")
76
72
  s3_path = File.join(File.dirname(__FILE__), "utf-8", "stage_3.txt")
77
- table = Sanzang::TranslationTable.new(table_path)
73
+ table = Sanzang::TranslationTable.new(IO.read(table_path))
78
74
  translator = Sanzang::Translator.new(table)
79
75
  translator.translate_io(s2_path, s3_path)
80
76
  end
81
77
 
82
78
  def test_translator_parallel
83
79
  table = Sanzang::TranslationTable.new(table_string())
84
- translator = Sanzang::Translator.new(table)
85
- translator.runs_parallel?
86
- assert(translator.processor_count > 0, "Processor count less than zero")
80
+ bt = Sanzang::BatchTranslator.new(table)
81
+ bt.forking?
82
+ assert(bt.processor_count > 0, "Processor count less than zero")
87
83
  end
88
84
 
89
85
  def test_translate_batch
90
86
  table = Sanzang::TranslationTable.new(table_string())
91
- translator = Sanzang::Translator.new(table)
92
- translator.translate_batch(
87
+ bt = Sanzang::BatchTranslator.new(table)
88
+ bt.translate_to_dir(
93
89
  Dir.glob(File.join(File.dirname(__FILE__), "utf-8", "file_*.txt")),
94
90
  File.join(File.dirname(__FILE__), "utf-8", "batch"), false)
95
91
  end