sanzang 0.0.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -18,4 +18,4 @@
18
18
 
19
19
  require_relative File.join("..", "lib", "sanzang")
20
20
 
21
- Kernel.exit(Sanzang::Command::Reflow.new.run(ARGV))
21
+ Kernel.exit(Sanzang::Command::SanzangCmd.new.run(ARGV))
@@ -1,19 +1,6 @@
1
1
  #!/usr/bin/env ruby -w
2
2
  # -*- encoding: UTF-8 -*-
3
-
4
- # == Description
5
- #
6
- # The Sanzang module contains a basic infrastructure for machine translation
7
- # using a simple direct translation method that does not attempt to change the
8
- # underlying grammar of the source text. The Sanzang module also contains
9
- # functionality for preparing source texts by reformatting them in a manner
10
- # that will facilitates both machine translation as well as the readability of
11
- # the final translation listing that is generated. All program source code for
12
- # the Sanzang system is contained within the Sanzang module, with code for the
13
- # Sanzang commands being located in the Sanzang::Command module.
14
- #
15
- # == Copyright
16
- #
3
+ #--
17
4
  # Copyright (C) 2012 Lapis Lazuli Texts
18
5
  #
19
6
  # This program is free software: you can redistribute it and/or modify it under
@@ -28,38 +15,28 @@
28
15
  #
29
16
  # You should have received a copy of the GNU General Public License along with
30
17
  # this program. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ # All program source code for the translation system is contained under the
20
+ # Sanzang module, and code for the \Sanzang commands is located in the
21
+ # Sanzang::Command module.
31
22
  #
32
- module Sanzang; end
23
+ module Sanzang
24
+ end
33
25
 
34
26
  require_relative File.join("sanzang", "text_formatter")
35
27
  require_relative File.join("sanzang", "translation_table")
36
28
  require_relative File.join("sanzang", "translator")
29
+ require_relative File.join("sanzang", "batch_translator")
37
30
  require_relative File.join("sanzang", "version")
38
31
 
39
- # == Description
40
- #
41
32
  # The Sanzang::Command module contains Unix style commands utilizing the
42
33
  # Sanzang module. Each class is typically a different command, with usage
43
34
  # information given when running the command with the "-h" or "--help" options.
44
35
  #
45
- # == Copyright
46
- #
47
- # Copyright (C) 2012 Lapis Lazuli Texts
48
- #
49
- # This program is free software: you can redistribute it and/or modify it under
50
- # the terms of the GNU General Public License as published by the Free Software
51
- # Foundation, either version 3 of the License, or (at your option) any later
52
- # version.
53
- #
54
- # This program is distributed in the hope that it will be useful, but WITHOUT
55
- # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
56
- # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
57
- # details.
58
- #
59
- # You should have received a copy of the GNU General Public License along with
60
- # this program. If not, see <http://www.gnu.org/licenses/>.
61
- #
62
- module Sanzang::Command; end
36
+ module Sanzang::Command
37
+ end
63
38
 
39
+ require_relative File.join("sanzang", "command", "batch")
64
40
  require_relative File.join("sanzang", "command", "reflow")
41
+ require_relative File.join("sanzang", "command", "sanzang_cmd")
65
42
  require_relative File.join("sanzang", "command", "translate")
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+ #--
4
+ # Copyright (C) 2012 Lapis Lazuli Texts
5
+ #
6
+ # This program is free software: you can redistribute it and/or modify it under
7
+ # the terms of the GNU General Public License as published by the Free Software
8
+ # Foundation, either version 3 of the License, or (at your option) any later
9
+ # version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful, but WITHOUT
12
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14
+ # details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License along with
17
+ # this program. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ require "parallel"
20
+
21
+ require_relative "translator"
22
+
23
+ module Sanzang
24
+
25
+ # BatchTranslator can handle batches of files for translation, and may also
26
+ # be able to translate them in parallel using multiprocessing, if your Ruby
27
+ # virtual machine supports it. This class inherits from Translator.
28
+ #
29
+ class BatchTranslator < Translator
30
+
31
+ # Evaluates to true if this Ruby can execute the fork(2) system call.
32
+ #
33
+ def forking?
34
+ Process.respond_to?(:fork)
35
+ end
36
+
37
+ # The number of logical processors detected on the current system.
38
+ #
39
+ def processor_count
40
+ Parallel.processor_count
41
+ end
42
+
43
+ # Translate a batch of files. The main parameter is an array, each element
44
+ # of which should be a two-element array with the first element being
45
+ # the input file path, and the second element being the output file path.
46
+ # If the _verbose_ parameter is true, then print progress to STDERR. The
47
+ # return value is an array containing all the output file paths.
48
+ #
49
+ def translate_batch(fpath_pairs, verbose = true, jobs = nil)
50
+ if not forking?
51
+ jobs = 0
52
+ end
53
+ Parallel.map(fpath_pairs, :in_processes => jobs) do |f1,f2|
54
+ translate_io(f1, f2)
55
+ if verbose
56
+ $stderr.write "[#{Process.pid}] #{File.expand_path(f2)} \n"
57
+ $stderr.flush
58
+ end
59
+ f2
60
+ end
61
+ end
62
+
63
# Translate a list of files to some output directory. Each output file is
# written to _out_dir_ under the base name of its input file. The _verbose_
# and _jobs_ parameters are passed through unchanged to translate_batch,
# and the value returned by translate_batch is returned.
#
# NOTE(review): only the base name of each input path is kept, so two inputs
# from different directories that share a file name are mapped to the same
# output path -- confirm that callers never pass such a list.
#
def translate_to_dir(in_fpaths, out_dir, verbose = true, jobs = nil)
  pairs = in_fpaths.map do |in_path|
    [in_path, File.join(out_dir, File.basename(in_path))]
  end
  translate_batch(pairs, verbose, jobs)
end
75
+
76
+ end
77
+ end
@@ -0,0 +1,131 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+ #--
4
+ # Copyright (C) 2012 Lapis Lazuli Texts
5
+ #
6
+ # This program is free software: you can redistribute it and/or modify it under
7
+ # the terms of the GNU General Public License as published by the Free Software
8
+ # Foundation, either version 3 of the License, or (at your option) any later
9
+ # version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful, but WITHOUT
12
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14
+ # details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License along with
17
+ # this program. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ require "optparse"
20
+
21
+ require_relative File.join("..", "translation_table")
22
+ require_relative File.join("..", "batch_translator")
23
+ require_relative File.join("..", "version")
24
+
25
+ module Sanzang::Command
26
+
27
+ # This class implements a command for batch translation of texts. The command
28
+ # presumes that the list of input files will be read from $stdin, while the
29
+ # output files will be written to a single directory. Usage information can
30
+ # be accessed by passing in the "-h" or "--help" options.
31
+ #
32
+ class Batch
33
+
34
+ # Create a new instance of the batch command.
35
+ #
36
+ def initialize
37
+ @name = "sanzang batch"
38
+ @encoding = nil
39
+ @outdir = nil
40
+ @jobs = nil
41
+ end
42
+
43
+ # Run the batch command with the given arguments. The parameter _args_
44
+ # would typically be an array of command options and parameters. Calling
45
+ # this method with the "-h" or "--help" option will print full usage
46
+ # information necessary for running the command. This method will return
47
+ # either 0 (success) or 1 (failure).
48
+ #
49
+ def run(args)
50
+ parser = option_parser
51
+ parser.parse!(args)
52
+
53
+ if args.length != 2
54
+ $stderr.puts parser
55
+ return 1
56
+ end
57
+
58
+ set_data_encoding
59
+
60
+ translator = nil
61
+ File.open(args[0], "rb", encoding: @encoding) do |table_file|
62
+ table = Sanzang::TranslationTable.new(table_file.read)
63
+ translator = Sanzang::BatchTranslator.new(table)
64
+ end
65
+
66
+ $stdin.binmode.set_encoding(@encoding)
67
+ puts translator.translate_to_dir($stdin.read.split, args[1], true, @jobs)
68
+ return 0
69
+ rescue SystemExit => err
70
+ return err.status
71
+ rescue Exception => err
72
+ $stderr.puts err.backtrace
73
+ $stderr.puts "\nERROR: #{err.inspect}\n\n"
74
+ return 1
75
+ end
76
+
77
+ private
78
+
79
+ # Set the encoding for text data if it is not already set
80
+ #
81
+ def set_data_encoding
82
+ if @encoding == nil
83
+ if Encoding.default_external == Encoding::IBM437
84
+ $stderr.puts "Switching to UTF-8 for text data encoding."
85
+ @encoding = Encoding::UTF_8
86
+ else
87
+ @encoding = Encoding.default_external
88
+ end
89
+ end
90
+ end
91
+
92
# Return an OptionParser object for this command.
#
# Handlers:
# * "-h" prints the usage text to STDOUT and exits with status 0.
# * "-E" looks the name up with Encoding.find and stores it in @encoding.
# * "-L" prints all encodings, sorted case-insensitively, and exits with 0.
# * "-j" stores the number of concurrent processes in @jobs. The value is
#   validated as a decimal integer, so a non-numeric argument raises
#   OptionParser::InvalidArgument instead of being silently turned into 0
#   by String#to_i. Negative values are rejected the same way.
#
def option_parser
  OptionParser.new do |op|
    op.banner = "Usage: #{@name} [options] table output_dir < queue\n"

    op.banner << "\nBatch translate files concurrently. A list of files "
    op.banner << "is read from STDIN, while\nprogress information is "
    op.banner << "printed to STDERR. The list of output files written is\n"
    op.banner << "printed to STDOUT at the end of the batch. The "
    op.banner << "output directory is specified as\na parameter.\n"

    op.banner << "\nOptions:\n"

    op.on("-h", "--help", "show this help message and exit") do
      puts op
      exit 0
    end
    op.on("-E", "--encoding=ENC", "set data encoding to ENC") do |enc_name|
      @encoding = Encoding.find(enc_name)
    end
    op.on("-L", "--list-encodings", "list possible encodings") do
      puts Encoding.list.sort_by { |enc| enc.to_s.upcase }
      exit 0
    end
    # DecimalInteger (rather than Integer) keeps "010" meaning ten, exactly
    # as String#to_i treated it, while still rejecting non-numeric input.
    op.on("-j", "--jobs=N", OptionParser::DecimalInteger,
          "allow N concurrent processes") do |count|
      raise OptionParser::InvalidArgument, count.to_s if count < 0
      @jobs = count
    end
  end
end
125
+
126
+ # Name of the command
127
+ #
128
+ attr_reader :name
129
+
130
+ end
131
+ end
@@ -23,26 +23,26 @@ require_relative File.join("..", "version")
23
23
 
24
24
  module Sanzang::Command
25
25
 
26
- # The Sanzang::Command::Reflow class provides a Unix-style command for
27
- # text reformatting. This reformatting is typically for use prior to
28
- # processing the text with the Sanzang::Command::Translate. The reason for
29
- # this is to do initial text transformations to ensure (1) that terms will
30
- # be translated reliably, and (2) that the final output of the translation
31
- # will be readable by the user (i.e. lines not too long).
26
+ # This class provides a command for text reformatting for CJK languages. This
27
+ # reformatting is typically for use prior to processing the text with the
28
+ # translation commands. The reason for doing this is so that initial text
29
+ # transformations will be done to ensure (1) that terms will be translated
30
+ # reliably, and (2) that the final output of the translation will be readable
31
+ # by the user (i.e. lines not too long).
32
32
  #
33
33
  class Reflow
34
34
 
35
- # Create a new instance of the Reflow class.
35
+ # Create a new instance of the reflow command
36
36
  #
37
37
  def initialize
38
- @name = "sanzang-reflow"
38
+ @name = "sanzang reflow"
39
39
  @encoding = Encoding.default_external
40
40
  @infile = nil
41
41
  @outfile = nil
42
42
  end
43
43
 
44
- # Run the Reflow command with the given arguments. The parameter _args_
45
- # would typically be an Array of Unix-style command parameters. Calling
44
+ # Run the reflow command with the given arguments. The parameter _args_
45
+ # would typically be an array of command options and parameters. Calling
46
46
  # this with the "-h" or "--help" option will print full usage information
47
47
  # necessary for running this command.
48
48
  #
@@ -51,7 +51,7 @@ module Sanzang::Command
51
51
  parser.parse!(args)
52
52
 
53
53
  if args.length != 0
54
- puts(parser)
54
+ $stderr.puts(parser)
55
55
  return 1
56
56
  end
57
57
 
@@ -62,7 +62,7 @@ module Sanzang::Command
62
62
  fin.binmode.set_encoding(@encoding)
63
63
  fout = @outfile ? File.open(@outfile, "w") : $stdout
64
64
  fout.binmode.set_encoding(@encoding)
65
- fout.write(Sanzang::TextFormatter.new.reflow_cjk_text(fin.read))
65
+ fout.write(Sanzang::TextFormatter.new.reflow_cjk(fin.read))
66
66
  ensure
67
67
  if defined?(fin) and fin != $stdin
68
68
  fin.close if not fin.closed?
@@ -77,12 +77,14 @@ module Sanzang::Command
77
77
  return err.status
78
78
  rescue Exception => err
79
79
  $stderr.puts err.backtrace
80
- $stderr.puts "ERROR: #{err.inspect}"
80
+ $stderr.puts "\nERROR: #{err.inspect}\n\n"
81
81
  return 1
82
82
  end
83
83
 
84
84
  private
85
85
 
86
+ # Initialize the encoding for text data if it is not already set
87
+ #
86
88
  def set_data_encoding
87
89
  if @encoding == nil
88
90
  if Encoding.default_external == Encoding::IBM437
@@ -94,41 +96,42 @@ module Sanzang::Command
94
96
  end
95
97
  end
96
98
 
99
+ # An OptionParser for the command
100
+ #
97
101
  def option_parser
98
- OptionParser.new do |pr|
99
- pr.banner = "Usage: #{@name} [options]\n"
102
+ OptionParser.new do |op|
103
+ op.banner = "Usage: #{@name} [options]\n"
100
104
 
101
- pr.banner << "\nReformat text file contents into lines based on "
102
- pr.banner << "spacing, punctuation, etc.\n"
103
- pr.banner << "\nExamples:\n"
104
- pr.banner << " #{@name} -i in/mytext.txt -o out/mytext.txt\n"
105
- pr.banner << "\nOptions:\n"
105
+ op.banner << "\nReformat text file contents into lines based on "
106
+ op.banner << "spacing, punctuation, etc.\n"
107
+ op.banner << "\nExamples:\n"
108
+ op.banner << " #{@name} -i in/mytext.txt -o out/mytext.txt\n"
109
+ op.banner << "\nOptions:\n"
106
110
 
107
- pr.on("-h", "--help", "show this help message and exit") do |v|
108
- puts pr
111
+ op.on("-h", "--help", "show this help message and exit") do |v|
112
+ puts op
109
113
  exit 0
110
114
  end
111
- pr.on("-E", "--encoding=ENC", "set data encoding to ENC") do |v|
115
+ op.on("-E", "--encoding=ENC", "set data encoding to ENC") do |v|
112
116
  @encoding = Encoding.find(v)
113
117
  end
114
- pr.on("-L", "--list-encodings", "list possible encodings") do |v|
115
- puts(Encoding.list.collect {|e| e.to_s }.sort)
118
+ op.on("-L", "--list-encodings", "list possible encodings") do |v|
119
+ encodings = Encoding.list.sort do |x,y|
120
+ x.to_s.upcase <=> y.to_s.upcase
121
+ end
122
+ puts encodings
116
123
  exit 0
117
124
  end
118
- pr.on("-i", "--infile=FILE", "read input text from FILE") do |v|
125
+ op.on("-i", "--infile=FILE", "read input text from FILE") do |v|
119
126
  @infile = v
120
127
  end
121
- pr.on("-o", "--outfile=FILE", "write output text to FILE") do |v|
128
+ op.on("-o", "--outfile=FILE", "write output text to FILE") do |v|
122
129
  @outfile = v
123
130
  end
124
- pr.on("-V", "--version", "show version number and exit") do |v|
125
- puts "Sanzang version: #{Sanzang::VERSION}"
126
- exit 0
127
- end
128
131
  end
129
132
  end
130
133
 
131
- # The standard name for the command.
134
+ # The name of the command
132
135
  #
133
136
  attr_reader :name
134
137
 
@@ -0,0 +1,132 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+ #--
4
+ # Copyright (C) 2012 Lapis Lazuli Texts
5
+ #
6
+ # This program is free software: you can redistribute it and/or modify it under
7
+ # the terms of the GNU General Public License as published by the Free Software
8
+ # Foundation, either version 3 of the License, or (at your option) any later
9
+ # version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful, but WITHOUT
12
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14
+ # details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License along with
17
+ # this program. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ require "optparse"
20
+ require "parallel"
21
+
22
+ require_relative "reflow"
23
+ require_relative "translate"
24
+ require_relative "batch"
25
+
26
+ require_relative File.join("..", "version")
27
+
28
+ module Sanzang::Command
29
+
30
+ # This class provides a frontend for all Sanzang operations and subcommands.
31
+ #
32
+ class SanzangCmd
33
+
34
+ # Create a new instance of the sanzang command
35
+ #
36
+ def initialize
37
+ @name = "sanzang"
38
+ @commands = [
39
+ ["batch", Sanzang::Command::Batch],
40
+ ["reflow", Sanzang::Command::Reflow],
41
+ ["translate", Sanzang::Command::Translate]
42
+ ]
43
+ end
44
+
45
# Run the sanzang command with the given arguments. If the first argument
# is the name of a sanzang subcommand, or a non-empty beginning of one, then
# the first matching subcommand in @commands is instantiated and run with
# the remaining arguments, and its result is returned. Otherwise the
# arguments are parsed as options of the sanzang command itself; the option
# handlers that print information exit on their own, so reaching the end of
# the method means nothing useful was requested: usage is printed to STDERR
# and 1 is returned.
#
# An empty string is never treated as a subcommand prefix. Every string
# starts with "", so without this check an empty first argument would run
# the first registered subcommand.
#
# Returns the subcommand's result, the status of a SystemExit raised while
# running, or 1 on any other failure.
#
def run(args)
  parser = option_parser

  if args.empty?
    $stderr.puts parser
    return 1
  end

  prefix = args[0]
  unless prefix.empty?
    _, cmd = @commands.find { |key, _| key.start_with?(prefix) }
    return cmd.new.run(args[1..-1]) if cmd
  end

  parser.parse!(args)

  $stderr.puts parser
  return 1
rescue SystemExit => err
  # Raised by the "exit 0" calls in the option handlers.
  return err.status
rescue Exception => err
  # Deliberately broad: every failure is reported and turned into status 1.
  $stderr.puts err.backtrace
  $stderr.puts "ERROR: #{err.inspect}"
  return 1
end
75
+
76
+ # A string giving a listing of platform information
77
+ #
78
+ def platform_info
79
+ info = "Ruby platform: #{RUBY_PLATFORM}\n"
80
+ info << "Ruby version: #{RUBY_VERSION}\n"
81
+ info << "External encoding: #{Encoding.default_external}\n"
82
+ info << "Internal encoding: #{Encoding.default_internal or 'none'}\n"
83
+ info << "Fork implemented: #{Process.respond_to?(:fork)}\n"
84
+ info << "Parallel version: #{Parallel::VERSION}\n"
85
+ info << "Processors found: #{Parallel.processor_count}\n"
86
+ info << "Sanzang version: #{Sanzang::VERSION}\n"
87
+ end
88
+
89
+ # This is a string giving a brief one-line summary of version information
90
+ #
91
+ def version_info
92
+ "sanzang #{Sanzang::VERSION} [ruby_#{RUBY_VERSION}] [#{RUBY_PLATFORM}]"
93
+ end
94
+
95
+ private
96
+
97
+ # An OptionParser object for parsing command options and parameters
98
+ #
99
+ def option_parser
100
+ OptionParser.new do |op|
101
+ op.banner = "Usage: #{@name} [options]\n"
102
+ op.banner << "Usage: #{@name} <command> [options] [args]\n\n"
103
+
104
+ op.banner << "Use \"--help\" with commands for usage information.\n"
105
+
106
+ op.banner << "\nSanzang commands:\n"
107
+ op.banner << " batch translate many files in parallel\n"
108
+ op.banner << " reflow format CJK text for translation\n"
109
+ op.banner << " translate standard single text translation\n"
110
+ op.banner << "\nOptions:\n"
111
+
112
+ op.on("-h", "--help", "show this help message and exit") do |v|
113
+ puts op
114
+ exit 0
115
+ end
116
+ op.on("-P", "--platform", "show platform information and exit") do |v|
117
+ puts platform_info
118
+ exit 0
119
+ end
120
+ op.on("-V", "--version", "show version number and exit") do |v|
121
+ puts version_info
122
+ exit 0
123
+ end
124
+ end
125
+ end
126
+
127
+ # Name of the command
128
+ #
129
+ attr_reader :name
130
+
131
+ end
132
+ end