sanzang 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/HACKING +54 -0
- data/LICENSE +628 -0
- data/README +280 -0
- data/bin/sanzang-reflow +21 -0
- data/bin/sanzang-translate +21 -0
- data/lib/sanzang.rb +65 -0
- data/lib/sanzang/command/reflow.rb +136 -0
- data/lib/sanzang/command/translate.rb +168 -0
- data/lib/sanzang/text_formatter.rb +71 -0
- data/lib/sanzang/translation_table.rb +113 -0
- data/lib/sanzang/translator.rb +174 -0
- data/lib/sanzang/version.rb +24 -0
- data/test/tc_commands.rb +17 -0
- data/test/tc_reflow_encodings.rb +98 -0
- data/test/tc_simple_translation.rb +97 -0
- data/test/utf-8/batch/file_1.txt +8 -0
- data/test/utf-8/batch/file_2.txt +8 -0
- data/test/utf-8/batch/file_3.txt +8 -0
- data/test/utf-8/batch/file_4.txt +8 -0
- data/test/utf-8/file_1.txt +2 -0
- data/test/utf-8/file_2.txt +2 -0
- data/test/utf-8/file_3.txt +2 -0
- data/test/utf-8/file_4.txt +2 -0
- data/test/utf-8/stage_1.txt +1 -0
- data/test/utf-8/stage_2.txt +2 -0
- data/test/utf-8/stage_3.txt +4 -0
- data/test/utf-8/table.txt +8 -0
- metadata +102 -0
@@ -0,0 +1,168 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: UTF-8 -*-
|
3
|
+
#--
|
4
|
+
# Copyright (C) 2012 Lapis Lazuli Texts
|
5
|
+
#
|
6
|
+
# This program is free software: you can redistribute it and/or modify it under
|
7
|
+
# the terms of the GNU General Public License as published by the Free Software
|
8
|
+
# Foundation, either version 3 of the License, or (at your option) any later
|
9
|
+
# version.
|
10
|
+
#
|
11
|
+
# This program is distributed in the hope that it will be useful, but WITHOUT
|
12
|
+
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
13
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
|
14
|
+
# details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License along with
|
17
|
+
# this program. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
|
19
|
+
require "optparse"
|
20
|
+
|
21
|
+
require_relative File.join("..", "translation_table")
|
22
|
+
require_relative File.join("..", "translator")
|
23
|
+
require_relative File.join("..", "version")
|
24
|
+
|
25
|
+
module Sanzang::Command

  # The Sanzang::Command::Translate class provides a Unix-style command for
  # translating text with a translation table. The table file named on the
  # command line is loaded into a Sanzang::TranslationTable, and a
  # Sanzang::Translator built from that table writes the translation listing.
  # Input is taken from a file or STDIN and written to a file or STDOUT, or,
  # in batch mode, a list of file paths is read from STDIN and each file is
  # translated into an output directory.
  #
  class Translate

    # The name of the command, as shown in the usage banner. This reader is
    # declared in the public section; declared below "private" it would not
    # be callable from outside the class.
    #
    attr_reader :name

    # Create a new instance of the Translate class.
    #
    def initialize
      @name = "sanzang-translate"
      @encoding = nil
      @batch_dir = nil
      @infile = nil
      @outfile = nil
    end

    # Run the Translate command with the given arguments. The parameter _args_
    # would typically be an Array of Unix-style command parameters. Calling
    # this with the "-h" or "--help" option will print full usage information
    # necessary for running this command.
    #
    # Returns 0 on success and 1 if the arguments are unusable or an error is
    # raised. If an option handler calls exit, the exit status is returned
    # instead of terminating the process.
    #
    def run(args)
      parser = option_parser
      parser.parse!(args)

      # Exactly one positional argument (the table file) must remain.
      if args.length != 1
        puts parser
        return 1
      end

      set_data_encoding

      translator = nil
      File.open(args[0], "rb", encoding: @encoding) do |table_file|
        table = Sanzang::TranslationTable.new(table_file)
        translator = Sanzang::Translator.new(table)
      end

      if @batch_dir
        $stderr.puts "Batch mode (#{translator.processor_count} processors)"
        warn 'Gem not available: "parallel"' unless translator.runs_parallel?
        puts translator.translate_batch($stdin.readlines, @batch_dir)
      else
        # Both locals start as nil so the ensure clause can tell which
        # streams were actually opened before any failure.
        fin = nil
        fout = nil
        begin
          fin = @infile ? File.open(@infile, "rb") : $stdin
          fin.binmode.set_encoding(@encoding)
          fout = @outfile ? File.open(@outfile, "wb") : $stdout
          fout.binmode.set_encoding(@encoding)
          translator.translate_io(fin, fout)
        ensure
          # Close only the files opened here, never the standard streams,
          # and only if they have not been closed already.
          fin.close if fin && fin != $stdin && !fin.closed?
          fout.close if fout && fout != $stdout && !fout.closed?
        end
      end

      return 0
    rescue SystemExit => err
      # Option handlers such as --help call exit; report their status.
      return err.status
    rescue Exception => err
      # NOTE(review): this rescues Exception rather than StandardError, so
      # interrupts are reported here as well; presumably intended for a
      # top-level command, but confirm.
      $stderr.puts err.backtrace
      $stderr.puts "ERROR: #{err.inspect}"
      return 1
    end

    private

    # Choose the text data encoding if none was given with -E. The default
    # external encoding is used, except that IBM437 is replaced by UTF-8.
    #
    def set_data_encoding
      return unless @encoding.nil?

      if Encoding.default_external == Encoding::IBM437
        $stderr.puts "Switching to UTF-8 for text data encoding."
        @encoding = Encoding::UTF_8
      else
        @encoding = Encoding.default_external
      end
    end

    # Build the OptionParser for this command. Option handlers store their
    # values in instance variables; the informational options print their
    # output and then call exit.
    #
    def option_parser
      OptionParser.new do |pr|
        pr.banner = "Usage: #{@name} [options] table\n"
        pr.banner << "Usage: #{@name} -B output_dir table < file_list\n"

        pr.banner << "\nTranslate text using simple table rules. Input text "
        pr.banner << "is read from STDIN by\ndefault, and the output is "
        pr.banner << "written to STDOUT by default. In batch mode, the \n"
        pr.banner << "program reads file paths from STDIN, and writes them "
        pr.banner << "to an output directory.\n"

        pr.banner << "\nExamples:\n"
        pr.banner << " #{@name} -i text.txt -o text.sz.txt table.txt\n"
        # -B takes the output directory as its argument, so the directory
        # comes before the table, matching the usage line above.
        pr.banner << " #{@name} -B output_dir table.txt < myfiles.txt\n"
        pr.banner << "\nOptions:\n"

        pr.on("-h", "--help", "show this help message and exit") do
          puts pr
          exit 0
        end
        pr.on("-B", "--batch-dir=DIR", "process from a queue into DIR") do |v|
          @batch_dir = v
        end
        pr.on("-E", "--encoding=ENC", "set data encoding to ENC") do |v|
          @encoding = Encoding.find(v)
        end
        pr.on("-L", "--list-encodings", "list possible encodings") do
          puts Encoding.list.map(&:to_s).sort
          exit 0
        end
        pr.on("-i", "--infile=FILE", "read input text from FILE") do |v|
          @infile = v
        end
        pr.on("-o", "--outfile=FILE", "write output text to FILE") do |v|
          @outfile = v
        end
        pr.on("-P", "--platform", "show platform information") do
          puts "Ruby version: #{RUBY_VERSION}"
          puts "Ruby platform: #{RUBY_PLATFORM}"
          puts "External encoding: #{Encoding::default_external}"
          unless Encoding::default_internal.nil?
            puts "Internal encoding: #{Encoding::default_internal}"
          end
          exit 0
        end
        pr.on("-V", "--version", "show version number and exit") do
          puts "Sanzang version: #{Sanzang::VERSION}"
          exit 0
        end
      end
    end

  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: UTF-8 -*-
|
3
|
+
#--
|
4
|
+
# Copyright (C) 2012 Lapis Lazuli Texts
|
5
|
+
#
|
6
|
+
# This program is free software: you can redistribute it and/or modify it under
|
7
|
+
# the terms of the GNU General Public License as published by the Free Software
|
8
|
+
# Foundation, either version 3 of the License, or (at your option) any later
|
9
|
+
# version.
|
10
|
+
#
|
11
|
+
# This program is distributed in the hope that it will be useful, but WITHOUT
|
12
|
+
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
13
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
|
14
|
+
# details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License along with
|
17
|
+
# this program. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
|
19
|
+
module Sanzang

  # TextFormatter reformats and reflows text so that it is ready for direct
  # translation: words should not be divided between lines, and each output
  # line should be short enough to be read comfortably alongside its
  # translation.
  #
  class TextFormatter

    # Reflow the CJK string _s_ according to its punctuation. The string is
    # modified in place and is also the return value. It is converted to
    # UTF-8 while being processed and converted back to its original encoding
    # at the end.
    #
    # The steps are: strip CBETA-style margins (everything up to the
    # double-bar character "║" U+2551 on each line), mark short lines that
    # begin with a space by appending a space, remove all line breaks, and
    # then insert new line breaks based on punctuation and spacing. If the
    # input contained a carriage return, the output uses CRLF line endings.
    #
    def reflow_cjk_text(s)
      original_encoding = s.encoding
      s.encode!(Encoding::UTF_8)

      # Remove the margin at the start of every line.
      s.gsub!(/^.*║/, "")

      # A line of at most 15 characters after a leading space gets a trailing
      # space, which keeps such lines (possibly verse) apart from prose once
      # the line breaks are removed.
      s.gsub!(/^( )(.{1,15})$/, "\\1\\2 ")

      # Note the line-ending convention before discarding every line break.
      crlf = s.include?("\r")
      s.gsub!(/(\r|\n)/, "")

      # Break after closing punctuation when the next character is not itself
      # closing punctuation.
      s.gsub!(/([:,;。?!」』.;:\?])([^:,;。?!」』.;:\?])/, "\\1\n\\2")

      # Break before an opening quote, space or tab, unless the preceding
      # character is an opener, closing punctuation, or a line break.
      s.gsub!(/([^「『 \t:,;。?!」』.;:\?\n])([「『 \t])/, "\\1\n\\2")

      # The result always ends with a line break.
      s << "\n" unless s[-1] == "\n"

      s.gsub!("\n", "\r\n") if crlf
      s.encode!(original_encoding)
    end

  end
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: UTF-8 -*-
|
3
|
+
#--
|
4
|
+
# Copyright (C) 2012 Lapis Lazuli Texts
|
5
|
+
#
|
6
|
+
# This program is free software: you can redistribute it and/or modify it under
|
7
|
+
# the terms of the GNU General Public License as published by the Free Software
|
8
|
+
# Foundation, either version 3 of the License, or (at your option) any later
|
9
|
+
# version.
|
10
|
+
#
|
11
|
+
# This program is distributed in the hope that it will be useful, but WITHOUT
|
12
|
+
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
13
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
|
14
|
+
# details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License along with
|
17
|
+
# this program. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
#
|
19
|
+
module Sanzang

  # TranslationTable encapsulates the set of rules used for translation by
  # Sanzang::Translator. These rules may be loaded from a string passed in to
  # the constructor, or loaded from an open IO object. The translation rules
  # go through basic parsing to ensure the table data is in the correct
  # format, and then the rules are reverse sorted by the length of the source
  # language column. Thereafter, these rules are accessible through the
  # ''records'' attribute, and metadata is available through other accessors
  # and methods. It is the responsibility of a Sanzang::Translator object to
  # actually apply the rules of a TranslationTable to some text, as the table
  # merely encapsulates a set of translation rules.
  #
  # The format for translation table data can be summarized as the following:
  #
  # * Plain text with one line per record
  # * Records begin with "~|", end with "|~", and are delimited by "|".
  # * The number of columns in each record must be consistent.
  #
  # An example of this format is the following:
  #
  #   ~|zh-term1|en-term1|~
  #   ~|zh-term2|en-term2|~
  #   ~|zh-term3|en-term3|~
  #
  class TranslationTable

    # Create a new TranslationTable object from a string or by reading an IO
    # object. If _rules_ is a kind of String, the table data is parsed from
    # it directly; otherwise _rules_ is treated as an open IO object and the
    # data is obtained by calling its read method.
    #
    # Every record must have the same number of columns as the first record;
    # otherwise a RuntimeError naming the offending line is raised. After
    # validation the records are ordered so that the longest source terms
    # (first column) come first, since this is the order in which they will
    # be applied. Records with source terms of equal length keep their
    # original relative order.
    #
    def initialize(rules)
      contents = rules.kind_of?(String) ? rules : rules.read
      @encoding = contents.encoding

      # The markers are converted to the encoding of the table data so that
      # they can be matched against it.
      left = "~|".encode(@encoding)
      right = "|~".encode(@encoding)
      separator = "|".encode(@encoding)

      @records = contents.gsub("\r", "").split("\n").collect do |rec|
        rec.strip.gsub(left, "").gsub(right, "").split(separator)
      end

      @width = @records.empty? ? 0 : @records[0].length
      @records.each_with_index do |rec, i|
        raise "Column mismatch: Line #{i + 1}" if rec.length != @width
      end

      # Sort on the length of the source term, not on the number of columns
      # (which is the same for every record). The original index is the
      # tie-breaker, which makes the ordering deterministic; to_s covers
      # records that have no columns at all.
      @records = @records.each_with_index.sort_by do |rec, i|
        [-rec[0].to_s.length, i]
      end.map(&:first)
    end

    # Retrieve a record by its numeric index. This is just shorthand for
    # looking at the records attribute directly.
    #
    def [](index)
      @records[index]
    end

    # Find the record where the source language field is equal to the given
    # parameter. Returns nil if there is no such record.
    #
    def find(term)
      @records.find { |rec| rec[0] == term }
    end

    # The number of records in the translation table (the table length).
    #
    def length
      @records.length
    end

    # The number of columns in the translation table (the table width).
    #
    attr_reader :width

    # The records for the translation table, as an Array.
    #
    attr_reader :records

    # The text encoding used for all translation table data.
    #
    attr_reader :encoding

  end
end
|
@@ -0,0 +1,174 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: UTF-8 -*-
|
3
|
+
#--
|
4
|
+
# Copyright (C) 2012 Lapis Lazuli Texts
|
5
|
+
#
|
6
|
+
# This program is free software: you can redistribute it and/or modify it under
|
7
|
+
# the terms of the GNU General Public License as published by the Free Software
|
8
|
+
# Foundation, either version 3 of the License, or (at your option) any later
|
9
|
+
# version.
|
10
|
+
#
|
11
|
+
# This program is distributed in the hope that it will be useful, but WITHOUT
|
12
|
+
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
13
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
|
14
|
+
# details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License along with
|
17
|
+
# this program. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
|
19
|
+
begin
|
20
|
+
require "parallel"
|
21
|
+
rescue LoadError
|
22
|
+
nil
|
23
|
+
end
|
24
|
+
|
25
|
+
module Sanzang

  # Translator is the main class for performing text translations with Sanzang.
  # A Translator utilizes a TranslationTable, which is passed to it at the time
  # of creation. The Translator can then apply these translation rules,
  # generate full translation listings, and perform translations by reading and
  # writing to IO objects. Finally, Translator supports a batch mode that can
  # utilize multiprocessing if the _Parallel_ module is available, and if the
  # platform supports Kernel#fork. Methods are also available for querying the
  # status of this functionality.
  #
  class Translator

    # Creates a new Translator object with the given TranslationTable. The
    # TranslationTable stores rules for translation, while the Translator is
    # the worker who applies these rules and can create translation listings.
    #
    def initialize(translation_table)
      @table = translation_table
    end

    # Returns true if both the _Parallel_ module is available, and is also
    # functioning on this particular implementation of Ruby. Currently the
    # _mingw_ and _mswin_ ports of Ruby do not have Process#fork implemented.
    #
    def runs_parallel?
      return false unless Process.respond_to?(:fork)

      defined?(Parallel) == "constant" && Parallel.class == Module
    end

    # Return the number of processors available on the current system, as
    # reported by Parallel.processor_count. If Translator#runs_parallel? is
    # false, the result is 1.
    #
    def processor_count
      runs_parallel? ? Parallel.processor_count : 1
    end

    # Return an Array of all translation rules used by a particular text.
    # These records represent the vocabulary used by the text: every record
    # of the table whose source term occurs in _source_text_, in table order.
    #
    def text_vocab(source_text)
      @table.records.select { |record| source_text.include?(record[0]) }
    end

    # Use the TranslationTable of the Translator to create translations for
    # each destination language column of the translation table. The result
    # is a simple Array of String objects: the first element is the source
    # text itself, and each following element is the translation for one
    # destination language column in the TranslationTable.
    #
    def translate(source_text)
      vocab_terms = text_vocab(source_text)
      translations = (1...@table.width).collect do |column_i|
        vocab_terms.inject(source_text.dup) do |text, term|
          # The block form inserts the translation literally. With a String
          # replacement, backslash sequences such as "\0" in the table data
          # would be interpreted as references to the matched text.
          text.gsub(term[0]) { term[column_i] }
        end
      end
      [source_text] + translations
    end

    # Generate a translation listing text string, in which the output of
    # Translator#translate is collated and numbered for reference purposes.
    # Each line is labelled "[line.column]", and the lines belonging to one
    # source line are followed by a blank line. CRLF line endings are used
    # if the source text contains a carriage return. This is the normal text
    # listing output of the Sanzang Translator.
    #
    def gen_listing(source_text)
      newline = source_text.include?("\r") ? "\r\n" : "\n"
      texts = translate(source_text).collect { |t| t.split(newline) }
      listing = "".encode(source_text.encoding)

      texts[0].each_index do |line_i|
        @table.width.times do |col_i|
          listing << "[#{line_i + 1}.#{col_i + 1}] #{texts[col_i][line_i]}"
          listing << newline
        end
        listing << newline
      end
      listing
    end

    # Read a text from _input_ and write its translation listing to _output_.
    # The parameters _input_ and _output_ can be either String objects or IO
    # objects. If they are strings, then they are interpreted as being file
    # paths and are opened with the encoding of the translation table. If
    # they are not strings, then the I/O operations are performed on them
    # directly.
    #
    # Both streams are closed before this method returns, including streams
    # passed in by the caller, and also when an error is raised part way
    # through. Returns nil.
    #
    def translate_io(input, output)
      if input.kind_of?(String)
        input = File.open(input, "r", external_encoding: @table.encoding)
      end
      if output.kind_of?(String)
        output = File.open(output, "w", external_encoding: @table.encoding)
      end
      output.write(gen_listing(input.read))
      nil
    ensure
      # A parameter that is still a path (its file was never opened) or that
      # has no close method is skipped.
      [input, output].each do |io|
        next unless io.respond_to?(:close)
        next if io.respond_to?(:closed?) && io.closed?
        io.close
      end
    end

    # Translate a list of files to some output directory. Each entry of
    # _fpath_list_ is a file path, optionally ending with a line break; blank
    # entries are ignored and the list itself is not modified. Each output
    # file is written to _out_dir_ under the base name of its input file.
    # If the _verbose_ parameter is true, then print progress to STDERR.
    #
    # If the value of Translator#runs_parallel? is false, then the batch is
    # processed sequentially, only utilizing one processor. However, if the
    # value is true, then run the batch by utilizing the Parallel module for
    # efficient multiprocessing. In both cases the return value is an Array
    # of the output file paths.
    #
    def translate_batch(fpath_list, out_dir, verbose = true)
      in_fpaths = fpath_list.collect { |f| f.chomp }.reject { |f| f.empty? }

      if runs_parallel?
        Parallel.map(in_fpaths) do |in_fpath|
          translate_file(in_fpath, out_dir, verbose)
        end
      else
        in_fpaths.collect do |in_fpath|
          translate_file(in_fpath, out_dir, verbose)
        end
      end
    end

    # The TranslationTable used by the Translator
    #
    attr_reader :table

    private

    # Translate the single file _in_fpath_ into _out_dir_, optionally report
    # the output path on STDERR, and return the output path.
    #
    def translate_file(in_fpath, out_dir, verbose)
      out_fpath = File.join(out_dir, File.basename(in_fpath))
      translate_io(in_fpath, out_fpath)
      if verbose
        $stderr.write "[#{Process.pid}] #{File.expand_path(out_fpath)} \n"
        $stderr.flush
      end
      out_fpath
    end

  end
end
|