sanzang 0.0.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,4 +18,4 @@
18
18
 
19
19
  require_relative File.join("..", "lib", "sanzang")
20
20
 
21
- Kernel.exit(Sanzang::Command::Reflow.new.run(ARGV))
21
+ Kernel.exit(Sanzang::Command::SanzangCmd.new.run(ARGV))
@@ -1,19 +1,6 @@
1
1
  #!/usr/bin/env ruby -w
2
2
  # -*- encoding: UTF-8 -*-
3
-
4
- # == Description
5
- #
6
- # The Sanzang module contains a basic infrastructure for machine translation
7
- # using a simple direct translation method that does not attempt to change the
8
- # underlying grammar of the source text. The Sanzang module also contains
9
- # functionality for preparing source texts by reformatting them in a manner
10
- # that facilitates both machine translation and the readability of
11
- # the final translation listing that is generated. All program source code for
12
- # the Sanzang system is contained within the Sanzang module, with code for the
13
- # Sanzang commands being located in the Sanzang::Command module.
14
- #
15
- # == Copyright
16
- #
3
+ #--
17
4
  # Copyright (C) 2012 Lapis Lazuli Texts
18
5
  #
19
6
  # This program is free software: you can redistribute it and/or modify it under
@@ -28,38 +15,28 @@
28
15
  #
29
16
  # You should have received a copy of the GNU General Public License along with
30
17
  # this program. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ # All program source code for the translation system is contained under the
20
+ # Sanzang module, and code for the \Sanzang commands is located in the
21
+ # Sanzang::Command module.
31
22
  #
32
- module Sanzang; end
23
+ module Sanzang
24
+ end
33
25
 
34
26
  require_relative File.join("sanzang", "text_formatter")
35
27
  require_relative File.join("sanzang", "translation_table")
36
28
  require_relative File.join("sanzang", "translator")
29
+ require_relative File.join("sanzang", "batch_translator")
37
30
  require_relative File.join("sanzang", "version")
38
31
 
39
- # == Description
40
- #
41
32
  # The Sanzang::Command module contains Unix style commands utilizing the
42
33
  # Sanzang module. Each class is typically a different command, with usage
43
34
  # information given when running the command with the "-h" or "--help" options.
44
35
  #
45
- # == Copyright
46
- #
47
- # Copyright (C) 2012 Lapis Lazuli Texts
48
- #
49
- # This program is free software: you can redistribute it and/or modify it under
50
- # the terms of the GNU General Public License as published by the Free Software
51
- # Foundation, either version 3 of the License, or (at your option) any later
52
- # version.
53
- #
54
- # This program is distributed in the hope that it will be useful, but WITHOUT
55
- # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
56
- # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
57
- # details.
58
- #
59
- # You should have received a copy of the GNU General Public License along with
60
- # this program. If not, see <http://www.gnu.org/licenses/>.
61
- #
62
- module Sanzang::Command; end
36
+ module Sanzang::Command
37
+ end
63
38
 
39
+ require_relative File.join("sanzang", "command", "batch")
64
40
  require_relative File.join("sanzang", "command", "reflow")
41
+ require_relative File.join("sanzang", "command", "sanzang_cmd")
65
42
  require_relative File.join("sanzang", "command", "translate")
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+ #--
4
+ # Copyright (C) 2012 Lapis Lazuli Texts
5
+ #
6
+ # This program is free software: you can redistribute it and/or modify it under
7
+ # the terms of the GNU General Public License as published by the Free Software
8
+ # Foundation, either version 3 of the License, or (at your option) any later
9
+ # version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful, but WITHOUT
12
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14
+ # details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License along with
17
+ # this program. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ require "parallel"
20
+
21
+ require_relative "translator"
22
+
23
+ module Sanzang
24
+
25
# A Translator subclass that works through whole batches of files rather than
# a single text. The work is handed to Parallel.map, so batches may be spread
# over several processes when the running Ruby provides fork.
#
class BatchTranslator < Translator

  # True when Process responds to :fork on this Ruby, false otherwise.
  #
  def forking?
    Process.respond_to?(:fork)
  end

  # The processor count as reported by Parallel.processor_count.
  #
  def processor_count
    Parallel.processor_count
  end

  # Translate every [input_path, output_path] pair in _fpath_pairs_ by passing
  # the two paths to translate_io (not defined in this class; presumably
  # inherited from Translator). When _verbose_ is true, a line containing the
  # current process id and the expanded output path is written to STDERR after
  # each file. The _jobs_ value is passed to Parallel.map as :in_processes and
  # is forced to 0 when fork is not available. The result is whatever
  # Parallel.map returns for a block that evaluates to each output path.
  #
  def translate_batch(fpath_pairs, verbose = true, jobs = nil)
    jobs = 0 unless forking?
    Parallel.map(fpath_pairs, :in_processes => jobs) do |src_path, dest_path|
      translate_io(src_path, dest_path)
      if verbose
        $stderr.write "[#{Process.pid}] #{File.expand_path(dest_path)} \n"
        $stderr.flush
      end
      dest_path
    end
  end

  # Translate each file in _in_fpaths_ into _out_dir_. Every output file keeps
  # the base name of its input file. The _verbose_ and _jobs_ parameters are
  # passed straight through to translate_batch, whose result is returned.
  #
  def translate_to_dir(in_fpaths, out_dir, verbose = true, jobs = nil)
    path_pairs = in_fpaths.map do |src_path|
      [src_path, File.join(out_dir, File.basename(src_path))]
    end
    translate_batch(path_pairs, verbose, jobs)
  end

end
77
+ end
@@ -0,0 +1,131 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+ #--
4
+ # Copyright (C) 2012 Lapis Lazuli Texts
5
+ #
6
+ # This program is free software: you can redistribute it and/or modify it under
7
+ # the terms of the GNU General Public License as published by the Free Software
8
+ # Foundation, either version 3 of the License, or (at your option) any later
9
+ # version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful, but WITHOUT
12
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14
+ # details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License along with
17
+ # this program. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ require "optparse"
20
+
21
+ require_relative File.join("..", "translation_table")
22
+ require_relative File.join("..", "batch_translator")
23
+ require_relative File.join("..", "version")
24
+
25
+ module Sanzang::Command
26
+
27
# This class implements a command for batch translation of texts. The list of
# input files is read from $stdin, one path per line, and the output files are
# written to a single directory. Usage information can be accessed by passing
# in the "-h" or "--help" options.
#
class Batch

  # Name of the command. Declared ahead of the private section so that the
  # reader is public; an attr_reader placed after "private" is itself private.
  #
  attr_reader :name

  # Create a new instance of the batch command.
  #
  def initialize
    @name = "sanzang batch"
    @encoding = nil
    @outdir = nil
    @jobs = nil
  end

  # Run the batch command with the given arguments. The parameter _args_
  # would typically be an array of command options and parameters, and it is
  # consumed in place by the option parser. After option parsing exactly two
  # arguments must remain: the translation table path and the output
  # directory. Calling this method with the "-h" or "--help" option will
  # print full usage information. This method returns 0 on success, 1 on
  # failure, or the status of any exit requested by an option handler.
  #
  def run(args)
    parser = option_parser
    parser.parse!(args)

    if args.length != 2
      $stderr.puts parser
      return 1
    end

    set_data_encoding

    table_text = File.open(args[0], "rb", encoding: @encoding) do |table_file|
      table_file.read
    end
    table = Sanzang::TranslationTable.new(table_text)
    translator = Sanzang::BatchTranslator.new(table)

    puts translator.translate_to_dir(read_queue, args[1], true, @jobs)
    0
  rescue SystemExit => err
    # Option handlers such as --help call exit; report their status instead
    # of letting the exit propagate to the caller.
    err.status
  rescue Exception => err
    # Any other failure, including option parsing errors, is reported on
    # STDERR with its backtrace and converted into a failure status.
    $stderr.puts err.backtrace
    $stderr.puts "\nERROR: #{err.inspect}\n\n"
    1
  end

  private

  # Read the queue of input file paths from $stdin using the data encoding.
  # Each line is one path, so paths containing spaces are kept intact. Line
  # terminators are removed and empty lines are ignored.
  #
  def read_queue
    $stdin.binmode.set_encoding(@encoding)
    $stdin.read.lines.map { |line| line.chomp }.reject { |path| path.empty? }
  end

  # Set the encoding for text data if it is not already set. When the
  # external encoding is IBM437 the data encoding becomes UTF-8 and a notice
  # is printed to STDERR; otherwise the default external encoding is used.
  #
  def set_data_encoding
    return unless @encoding.nil?

    if Encoding.default_external == Encoding::IBM437
      $stderr.puts "Switching to UTF-8 for text data encoding."
      @encoding = Encoding::UTF_8
    else
      @encoding = Encoding.default_external
    end
  end

  # Return an OptionParser object for this command
  #
  def option_parser
    OptionParser.new do |op|
      op.banner = "Usage: #{@name} [options] table output_dir < queue\n"

      op.banner << "\nBatch translate files concurrently. A list of files "
      op.banner << "is read from STDIN, while\nprogress information is "
      op.banner << "printed to STDERR. The list of output files written is\n"
      op.banner << "printed to STDOUT at the end of the batch. The "
      op.banner << "output directory is specified as\na parameter.\n"

      op.banner << "\nOptions:\n"

      op.on("-h", "--help", "show this help message and exit") do
        puts op
        exit 0
      end
      op.on("-E", "--encoding=ENC", "set data encoding to ENC") do |enc_name|
        @encoding = Encoding.find(enc_name)
      end
      op.on("-L", "--list-encodings", "list possible encodings") do
        puts Encoding.list.sort_by { |enc| enc.to_s.upcase }
        exit 0
      end
      # DecimalInteger makes the parser reject non-numeric values instead of
      # silently turning them into 0 as String#to_i would.
      op.on("-j", "--jobs=N", OptionParser::DecimalInteger,
            "allow N concurrent processes") do |count|
        raise OptionParser::InvalidArgument, count.to_s if count < 0
        @jobs = count
      end
    end
  end

end
131
+ end
@@ -23,26 +23,26 @@ require_relative File.join("..", "version")
23
23
 
24
24
  module Sanzang::Command
25
25
 
26
- # The Sanzang::Command::Reflow class provides a Unix-style command for
27
- # text reformatting. This reformatting is typically for use prior to
28
- # processing the text with the Sanzang::Command::Translate. The reason for
29
- # this is to do initial text transformations to ensure (1) that terms will
30
- # be translated reliably, and (2) that the final output of the translation
31
- # will be readable by the user (i.e. lines not too long).
26
+ # This class provides a command for text reformatting for CJK languages. This
27
+ # reformatting is typically for use prior to processing the text with the
28
+ # translation commands. The reason for doing this is so that initial text
29
+ # transformations will be done to ensure (1) that terms will be translated
30
+ # reliably, and (2) that the final output of the translation will be readable
31
+ # by the user (i.e. lines not too long).
32
32
  #
33
33
  class Reflow
34
34
 
35
- # Create a new instance of the Reflow class.
35
+ # Create a new instance of the reflow command
36
36
  #
37
37
  def initialize
38
- @name = "sanzang-reflow"
38
+ @name = "sanzang reflow"
39
39
  @encoding = Encoding.default_external
40
40
  @infile = nil
41
41
  @outfile = nil
42
42
  end
43
43
 
44
- # Run the Reflow command with the given arguments. The parameter _args_
45
- # would typically be an Array of Unix-style command parameters. Calling
44
+ # Run the reflow command with the given arguments. The parameter _args_
45
+ # would typically be an array of command options and parameters. Calling
46
46
  # this with the "-h" or "--help" option will print full usage information
47
47
  # necessary for running this command.
48
48
  #
@@ -51,7 +51,7 @@ module Sanzang::Command
51
51
  parser.parse!(args)
52
52
 
53
53
  if args.length != 0
54
- puts(parser)
54
+ $stderr.puts(parser)
55
55
  return 1
56
56
  end
57
57
 
@@ -62,7 +62,7 @@ module Sanzang::Command
62
62
  fin.binmode.set_encoding(@encoding)
63
63
  fout = @outfile ? File.open(@outfile, "w") : $stdout
64
64
  fout.binmode.set_encoding(@encoding)
65
- fout.write(Sanzang::TextFormatter.new.reflow_cjk_text(fin.read))
65
+ fout.write(Sanzang::TextFormatter.new.reflow_cjk(fin.read))
66
66
  ensure
67
67
  if defined?(fin) and fin != $stdin
68
68
  fin.close if not fin.closed?
@@ -77,12 +77,14 @@ module Sanzang::Command
77
77
  return err.status
78
78
  rescue Exception => err
79
79
  $stderr.puts err.backtrace
80
- $stderr.puts "ERROR: #{err.inspect}"
80
+ $stderr.puts "\nERROR: #{err.inspect}\n\n"
81
81
  return 1
82
82
  end
83
83
 
84
84
  private
85
85
 
86
+ # Initialize the encoding for text data if it is not already set
87
+ #
86
88
  def set_data_encoding
87
89
  if @encoding == nil
88
90
  if Encoding.default_external == Encoding::IBM437
@@ -94,41 +96,42 @@ module Sanzang::Command
94
96
  end
95
97
  end
96
98
 
99
+ # An OptionParser for the command
100
+ #
97
101
  def option_parser
98
- OptionParser.new do |pr|
99
- pr.banner = "Usage: #{@name} [options]\n"
102
+ OptionParser.new do |op|
103
+ op.banner = "Usage: #{@name} [options]\n"
100
104
 
101
- pr.banner << "\nReformat text file contents into lines based on "
102
- pr.banner << "spacing, punctuation, etc.\n"
103
- pr.banner << "\nExamples:\n"
104
- pr.banner << " #{@name} -i in/mytext.txt -o out/mytext.txt\n"
105
- pr.banner << "\nOptions:\n"
105
+ op.banner << "\nReformat text file contents into lines based on "
106
+ op.banner << "spacing, punctuation, etc.\n"
107
+ op.banner << "\nExamples:\n"
108
+ op.banner << " #{@name} -i in/mytext.txt -o out/mytext.txt\n"
109
+ op.banner << "\nOptions:\n"
106
110
 
107
- pr.on("-h", "--help", "show this help message and exit") do |v|
108
- puts pr
111
+ op.on("-h", "--help", "show this help message and exit") do |v|
112
+ puts op
109
113
  exit 0
110
114
  end
111
- pr.on("-E", "--encoding=ENC", "set data encoding to ENC") do |v|
115
+ op.on("-E", "--encoding=ENC", "set data encoding to ENC") do |v|
112
116
  @encoding = Encoding.find(v)
113
117
  end
114
- pr.on("-L", "--list-encodings", "list possible encodings") do |v|
115
- puts(Encoding.list.collect {|e| e.to_s }.sort)
118
+ op.on("-L", "--list-encodings", "list possible encodings") do |v|
119
+ encodings = Encoding.list.sort do |x,y|
120
+ x.to_s.upcase <=> y.to_s.upcase
121
+ end
122
+ puts encodings
116
123
  exit 0
117
124
  end
118
- pr.on("-i", "--infile=FILE", "read input text from FILE") do |v|
125
+ op.on("-i", "--infile=FILE", "read input text from FILE") do |v|
119
126
  @infile = v
120
127
  end
121
- pr.on("-o", "--outfile=FILE", "write output text to FILE") do |v|
128
+ op.on("-o", "--outfile=FILE", "write output text to FILE") do |v|
122
129
  @outfile = v
123
130
  end
124
- pr.on("-V", "--version", "show version number and exit") do |v|
125
- puts "Sanzang version: #{Sanzang::VERSION}"
126
- exit 0
127
- end
128
131
  end
129
132
  end
130
133
 
131
- # The standard name for the command.
134
+ # The name of the command
132
135
  #
133
136
  attr_reader :name
134
137
 
@@ -0,0 +1,132 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+ #--
4
+ # Copyright (C) 2012 Lapis Lazuli Texts
5
+ #
6
+ # This program is free software: you can redistribute it and/or modify it under
7
+ # the terms of the GNU General Public License as published by the Free Software
8
+ # Foundation, either version 3 of the License, or (at your option) any later
9
+ # version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful, but WITHOUT
12
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14
+ # details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License along with
17
+ # this program. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ require "optparse"
20
+ require "parallel"
21
+
22
+ require_relative "reflow"
23
+ require_relative "translate"
24
+ require_relative "batch"
25
+
26
+ require_relative File.join("..", "version")
27
+
28
+ module Sanzang::Command
29
+
30
# This class provides a frontend for all Sanzang operations and subcommands.
#
class SanzangCmd

  # Name of the command. Declared ahead of the private section so that the
  # reader is public; an attr_reader placed after "private" is itself private.
  #
  attr_reader :name

  # Create a new instance of the sanzang command
  #
  def initialize
    @name = "sanzang"
    # Subcommand names paired with the classes implementing them. The order
    # matters: the first name matching a given prefix wins.
    @commands = [
      ["batch", Sanzang::Command::Batch],
      ["reflow", Sanzang::Command::Reflow],
      ["translate", Sanzang::Command::Translate]
    ]
  end

  # Run the sanzang command with the given arguments. If the first argument
  # is the name of a sanzang subcommand or a non-empty beginning of one, then
  # that subcommand is run with the remaining arguments and its result is
  # returned. Otherwise the arguments are parsed as options of the sanzang
  # command itself, such as showing usage and platform information. When no
  # option causes an exit, usage is printed to STDERR and 1 is returned.
  #
  def run(args)
    parser = option_parser

    if args.empty?
      $stderr.puts parser
      return 1
    end

    subcommand = find_command(args[0])
    return subcommand.new.run(args[1..-1]) if subcommand

    parser.parse!(args)

    $stderr.puts parser
    1
  rescue SystemExit => err
    # Option handlers such as --help call exit; report their status instead
    # of letting the exit propagate to the caller.
    err.status
  rescue Exception => err
    $stderr.puts err.backtrace
    $stderr.puts "ERROR: #{err.inspect}"
    1
  end

  # A string giving a listing of platform information
  #
  def platform_info
    [
      "Ruby platform: #{RUBY_PLATFORM}\n",
      "Ruby version: #{RUBY_VERSION}\n",
      "External encoding: #{Encoding.default_external}\n",
      "Internal encoding: #{Encoding.default_internal || 'none'}\n",
      "Fork implemented: #{Process.respond_to?(:fork)}\n",
      "Parallel version: #{Parallel::VERSION}\n",
      "Processors found: #{Parallel.processor_count}\n",
      "Sanzang version: #{Sanzang::VERSION}\n"
    ].join
  end

  # This is a string giving a brief one-line summary of version information
  #
  def version_info
    "sanzang #{Sanzang::VERSION} [ruby_#{RUBY_VERSION}] [#{RUBY_PLATFORM}]"
  end

  private

  # Return the class of the first subcommand whose name starts with _word_,
  # or nil when there is none. An empty word is never treated as a prefix,
  # since every name starts with the empty string and it would otherwise
  # select the first subcommand.
  #
  def find_command(word)
    return nil if word.empty?

    entry = @commands.find { |key, _cmd| key.start_with?(word) }
    entry && entry[1]
  end

  # An OptionParser object for parsing command options and parameters
  #
  def option_parser
    OptionParser.new do |op|
      op.banner = "Usage: #{@name} [options]\n"
      op.banner << "Usage: #{@name} <command> [options] [args]\n\n"

      op.banner << "Use \"--help\" with commands for usage information.\n"

      op.banner << "\nSanzang commands:\n"
      op.banner << " batch translate many files in parallel\n"
      op.banner << " reflow format CJK text for translation\n"
      op.banner << " translate standard single text translation\n"
      op.banner << "\nOptions:\n"

      op.on("-h", "--help", "show this help message and exit") do
        puts op
        exit 0
      end
      op.on("-P", "--platform", "show platform information and exit") do
        puts platform_info
        exit 0
      end
      op.on("-V", "--version", "show version number and exit") do
        puts version_info
        exit 0
      end
    end
  end

end
132
+ end