sanzang 1.1.2 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/NEWS.rdoc +6 -0
- data/README.rdoc +1 -1
- data/lib/sanzang.rb +3 -3
- data/lib/sanzang/batch_translator.rb +0 -5
- data/lib/sanzang/command/batch.rb +2 -6
- data/lib/sanzang/command/reflow.rb +2 -6
- data/lib/sanzang/command/sanzang_cmd.rb +6 -11
- data/lib/sanzang/command/translate.rb +1 -6
- data/lib/sanzang/{text_formatter.rb → formatting.rb} +9 -11
- data/lib/sanzang/translation_table.rb +54 -4
- data/lib/sanzang/translator.rb +4 -7
- data/lib/sanzang/version.rb +1 -1
- data/test/tc_reflow_encodings.rb +2 -4
- data/test/tc_simple_translation.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a5bd5aafaee153f9753cdeedf952c8581eaf770b
|
4
|
+
data.tar.gz: 111fc689cb0f19ca0dd20356f50d7483e6a51420
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 17a663b61ef1523e24d76d7d37a65c56ff8d73c2bbb43e3aa226437dbc4b224a96b7ebaa02370bd8054490c2d57885d24f6be5844f6767fa59565705c3c37fa4
|
7
|
+
data.tar.gz: a453e356504a8c334cefcf42455769bcb79bba82af4a48138780a3bbd3a086c474c3fa37659c3a7c33ca38f30c78e5f8237a9caf2f8cc316d95374b71bf31920
|
data/NEWS.rdoc
CHANGED
@@ -2,6 +2,12 @@
|
|
2
2
|
|
3
3
|
== Release History
|
4
4
|
|
5
|
+
=== v1.2.0
|
6
|
+
* Updated vocab building code for faster term matching
|
7
|
+
* Refactored the TextFormatter class into a Formatting module
|
8
|
+
* Added methods for merging translation tables into one another
|
9
|
+
* Consolidated Sanzang module "requires" into a central location
|
10
|
+
|
5
11
|
=== v1.1.2
|
6
12
|
* Cleaned up table loading code to be faster and simpler
|
7
13
|
* Added RDoc option to set documentation encoding to UTF-8 (RDoc 3.x)
|
data/README.rdoc
CHANGED
@@ -40,6 +40,6 @@ command to verify your installation and print version information.
|
|
40
40
|
|
41
41
|
This command should show a summary of your \Sanzang version and environment.
|
42
42
|
|
43
|
-
sanzang 1.
|
43
|
+
sanzang 1.2.0 ruby-2.1.1 x86_64-linux (UTF-8)
|
44
44
|
|
45
45
|
You now have \Sanzang installed on your computer.
|
data/lib/sanzang.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# coding: UTF-8
|
2
2
|
#--
|
3
|
-
# Copyright (C) 2012-
|
3
|
+
# Copyright (C) 2012-2014 Lapis Lazuli Texts
|
4
4
|
#
|
5
5
|
# This program is free software: you can redistribute it and/or modify it under
|
6
6
|
# the terms of the GNU General Public License as published by the Free Software
|
@@ -22,11 +22,11 @@
|
|
22
22
|
module Sanzang
|
23
23
|
end
|
24
24
|
|
25
|
-
require_relative File.join("sanzang", "batch_translator")
|
26
25
|
require_relative File.join("sanzang", "platform")
|
27
|
-
require_relative File.join("sanzang", "
|
26
|
+
require_relative File.join("sanzang", "formatting")
|
28
27
|
require_relative File.join("sanzang", "translation_table")
|
29
28
|
require_relative File.join("sanzang", "translator")
|
29
|
+
require_relative File.join("sanzang", "batch_translator")
|
30
30
|
require_relative File.join("sanzang", "version")
|
31
31
|
|
32
32
|
# The Sanzang::Command module contains Unix style commands utilizing the
|
@@ -15,11 +15,6 @@
|
|
15
15
|
# You should have received a copy of the GNU General Public License along with
|
16
16
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
17
17
|
|
18
|
-
require "parallel"
|
19
|
-
|
20
|
-
require_relative "platform"
|
21
|
-
require_relative "translator"
|
22
|
-
|
23
18
|
module Sanzang
|
24
19
|
|
25
20
|
# BatchTranslator can handle batches of files for translation, and may also
|
@@ -15,12 +15,8 @@
|
|
15
15
|
# You should have received a copy of the GNU General Public License along with
|
16
16
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
17
17
|
|
18
|
-
require
|
19
|
-
|
20
|
-
require_relative File.join("..", "platform")
|
21
|
-
require_relative File.join("..", "translation_table")
|
22
|
-
require_relative File.join("..", "batch_translator")
|
23
|
-
require_relative File.join("..", "version")
|
18
|
+
require 'optparse'
|
19
|
+
require 'parallel'
|
24
20
|
|
25
21
|
module Sanzang::Command
|
26
22
|
|
@@ -15,11 +15,7 @@
|
|
15
15
|
# You should have received a copy of the GNU General Public License along with
|
16
16
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
17
17
|
|
18
|
-
require
|
19
|
-
|
20
|
-
require_relative File.join("..", "platform")
|
21
|
-
require_relative File.join("..", "text_formatter")
|
22
|
-
require_relative File.join("..", "version")
|
18
|
+
require 'optparse'
|
23
19
|
|
24
20
|
module Sanzang::Command
|
25
21
|
|
@@ -61,7 +57,7 @@ module Sanzang::Command
|
|
61
57
|
fin.binmode.set_encoding(@encoding)
|
62
58
|
fout = @outfile ? File.open(@outfile, "w") : $stdout
|
63
59
|
fout.binmode.set_encoding(@encoding)
|
64
|
-
fout.write(Sanzang::
|
60
|
+
fout.write(Sanzang::Formatting.reflow_cjk(fin.read))
|
65
61
|
ensure
|
66
62
|
if defined?(fin) and fin.class == File
|
67
63
|
fin.close if not fin.closed?
|
@@ -15,15 +15,7 @@
|
|
15
15
|
# You should have received a copy of the GNU General Public License along with
|
16
16
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
17
17
|
|
18
|
-
require
|
19
|
-
require "parallel"
|
20
|
-
|
21
|
-
require_relative "reflow"
|
22
|
-
require_relative "translate"
|
23
|
-
require_relative "batch"
|
24
|
-
|
25
|
-
require_relative File.join("..", "platform")
|
26
|
-
require_relative File.join("..", "version")
|
18
|
+
require 'optparse'
|
27
19
|
|
28
20
|
module Sanzang::Command
|
29
21
|
|
@@ -80,6 +72,8 @@ module Sanzang::Command
|
|
80
72
|
# A string giving a listing of platform information
|
81
73
|
#
|
82
74
|
def platform_info
|
75
|
+
require 'parallel'
|
76
|
+
|
83
77
|
info = "host_arch = #{Sanzang::Platform.machine_arch}\n"
|
84
78
|
info << "host_os = #{Sanzang::Platform.os_name}\n"
|
85
79
|
info << "host_processors = #{Sanzang::Platform.processor_count}\n"
|
@@ -96,8 +90,9 @@ module Sanzang::Command
|
|
96
90
|
# This is a string giving a brief one-line summary of version information
|
97
91
|
#
|
98
92
|
def version_info
|
99
|
-
"sanzang #{Sanzang::VERSION}
|
100
|
-
+ "
|
93
|
+
"sanzang #{Sanzang::VERSION} ruby-#{RUBY_VERSION} #{RUBY_PLATFORM} " \
|
94
|
+
+ "(#{Sanzang::Platform.data_encoding})"
|
95
|
+
# "sanzang #{Sanzang::VERSION} #{Sanzang::Platform.data_encoding}"
|
101
96
|
end
|
102
97
|
|
103
98
|
# Name of the command
|
@@ -15,12 +15,7 @@
|
|
15
15
|
# You should have received a copy of the GNU General Public License along with
|
16
16
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
17
17
|
|
18
|
-
require
|
19
|
-
|
20
|
-
require_relative File.join("..", "platform")
|
21
|
-
require_relative File.join("..", "translation_table")
|
22
|
-
require_relative File.join("..", "translator")
|
23
|
-
require_relative File.join("..", "version")
|
18
|
+
require 'optparse'
|
24
19
|
|
25
20
|
module Sanzang::Command
|
26
21
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# coding: UTF-8
|
2
2
|
#--
|
3
|
-
# Copyright (C) 2012-
|
3
|
+
# Copyright (C) 2012-2014 Lapis Lazuli Texts
|
4
4
|
#
|
5
5
|
# This program is free software: you can redistribute it and/or modify it under
|
6
6
|
# the terms of the GNU General Public License as published by the Free Software
|
@@ -15,16 +15,14 @@
|
|
15
15
|
# You should have received a copy of the GNU General Public License along with
|
16
16
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
17
17
|
|
18
|
-
module
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
#
|
27
|
-
class TextFormatter
|
18
|
+
# This module handles formatting of text data especially to prepare the text
|
19
|
+
# for direct translation. This involves reformatting and reflowing text so
|
20
|
+
# that words are not divided between lines, and so the output is well suited
|
21
|
+
# for humans. For practical purposes of readability, lines of text to be
|
22
|
+
# translated should be succinct and easily comprehensible.
|
23
|
+
#
|
24
|
+
module Sanzang::Formatting
|
25
|
+
class << self
|
28
26
|
|
29
27
|
# Given a CJK string of text, reformat the string for greater compatibility
|
30
28
|
# with direct translation, and reflow the text based on its punctuation.
|
@@ -17,9 +17,8 @@
|
|
17
17
|
|
18
18
|
module Sanzang
|
19
19
|
|
20
|
-
# A translation table encapsulates a set of rules for translating with
|
21
|
-
#
|
22
|
-
# storing well-defined translation table data.
|
20
|
+
# A translation table encapsulates a set of rules for translating with the
|
21
|
+
# \Sanzang system.
|
23
22
|
#
|
24
23
|
class TranslationTable
|
25
24
|
|
@@ -39,6 +38,8 @@ module Sanzang
|
|
39
38
|
# - Subsequent fields are equivalent terms in destination languages.
|
40
39
|
# - The number of columns must be consistent for the entire table.
|
41
40
|
#
|
41
|
+
# The rules passed in here may either be a file descriptor or a string.
|
42
|
+
#
|
42
43
|
def initialize(rules)
|
43
44
|
contents = rules.kind_of?(String) ? rules : rules.read
|
44
45
|
@source_encoding = contents.encoding
|
@@ -51,8 +52,9 @@ module Sanzang
|
|
51
52
|
end
|
52
53
|
|
53
54
|
@records = contents.strip.split("\n").collect {|r| r.strip.split("|") }
|
55
|
+
@sorted = false
|
54
56
|
check_dims
|
55
|
-
|
57
|
+
#sort!
|
56
58
|
end
|
57
59
|
|
58
60
|
# Retrieve a record by its numeric index.
|
@@ -76,10 +78,18 @@ module Sanzang
|
|
76
78
|
end
|
77
79
|
end
|
78
80
|
|
81
|
+
# Check if the table records are sorted
|
82
|
+
#
|
83
|
+
def sorted?
|
84
|
+
@sorted
|
85
|
+
end
|
86
|
+
|
79
87
|
# Reverse sort all records by length
|
80
88
|
#
|
81
89
|
def sort!
|
82
90
|
@records.sort! {|x,y| y[0].size <=> x[0].size }
|
91
|
+
@sorted = true
|
92
|
+
nil
|
83
93
|
end
|
84
94
|
|
85
95
|
# The text encoding used internally for all translation table data
|
@@ -94,6 +104,46 @@ module Sanzang
|
|
94
104
|
@records.find {|rec| rec[0] == term }
|
95
105
|
end
|
96
106
|
|
107
|
+
# Convert to a hash. The original records are the values.
|
108
|
+
#
|
109
|
+
# For example: "A" => ["A", "B", "C"]
|
110
|
+
#
|
111
|
+
def to_h
|
112
|
+
h = Hash.new
|
113
|
+
@records.each {|rec| h[rec[0]] = rec if not h[rec[0]] }
|
114
|
+
h
|
115
|
+
end
|
116
|
+
|
117
|
+
# Only include unique source values. The resulting table is unsorted.
|
118
|
+
#
|
119
|
+
def uniq!
|
120
|
+
@records = to_h.values
|
121
|
+
@sorted = false
|
122
|
+
nil
|
123
|
+
end
|
124
|
+
|
125
|
+
# Merge another table into this one. If the same source term exists in
|
126
|
+
# both tables, then the record from the other table will be used instead.
|
127
|
+
# Note: after a merge, the resulting table is unsorted.
|
128
|
+
#
|
129
|
+
def merge!(tab2)
|
130
|
+
if tab2.width != width
|
131
|
+
raise "Table widths must match when merging tables"
|
132
|
+
end
|
133
|
+
h1 = to_h
|
134
|
+
tab2.records.each do |rec|
|
135
|
+
h1[rec[0]] = rec
|
136
|
+
end
|
137
|
+
@records = h1.values
|
138
|
+
@sorted = false
|
139
|
+
end
|
140
|
+
|
141
|
+
# Return a CSV formatted string
|
142
|
+
#
|
143
|
+
def to_csv
|
144
|
+
@records.map {|r| r.join("|") }.join("\n")
|
145
|
+
end
|
146
|
+
|
97
147
|
# The number of records in the table
|
98
148
|
#
|
99
149
|
def length
|
data/lib/sanzang/translator.rb
CHANGED
@@ -37,13 +37,10 @@ module Sanzang
|
|
37
37
|
# These records represent the vocabulary used by the text.
|
38
38
|
#
|
39
39
|
def text_vocab(source_text)
|
40
|
-
|
41
|
-
@table.records.
|
42
|
-
|
43
|
-
new_table << record
|
44
|
-
end
|
40
|
+
text_copy = String.new(source_text)
|
41
|
+
@table.records.select do |r|
|
42
|
+
text_copy.include?(r[0]) ? text_copy.gsub!(r[0], "\x1F") : false
|
45
43
|
end
|
46
|
-
new_table
|
47
44
|
end
|
48
45
|
|
49
46
|
# Use the TranslationTable of the Translator to create translations for
|
@@ -80,7 +77,7 @@ module Sanzang
|
|
80
77
|
|
81
78
|
listing = ""
|
82
79
|
texts[0].length.times do |line_i|
|
83
|
-
|
80
|
+
texts.length.times do |col_i|
|
84
81
|
listing << "[#{pos + line_i}.#{col_i + 1}] #{texts[col_i][line_i]}" \
|
85
82
|
<< newline
|
86
83
|
end
|
data/lib/sanzang/version.rb
CHANGED
data/test/tc_reflow_encodings.rb
CHANGED
@@ -23,8 +23,7 @@ class TestReflowEncodings < Test::Unit::TestCase
|
|
23
23
|
text_s2 = " 大唐三藏法師玄奘奉\n 詔譯\n \n"
|
24
24
|
text_s1.encode!(encoding)
|
25
25
|
text_s2.encode!(encoding)
|
26
|
-
|
27
|
-
assert_equal(text_s2, formatter.reflow_cjk(text_s1))
|
26
|
+
assert_equal(text_s2, Sanzang::Formatting.reflow_cjk(text_s1))
|
28
27
|
end
|
29
28
|
|
30
29
|
# Han characters, simplified and without double vertical bar. The margin
|
@@ -36,8 +35,7 @@ class TestReflowEncodings < Test::Unit::TestCase
|
|
36
35
|
text_s2 = " 大唐三藏法师玄奘奉\n 诏译\n \n"
|
37
36
|
text_s1.encode!(encoding)
|
38
37
|
text_s2.encode!(encoding)
|
39
|
-
|
40
|
-
assert_equal(text_s2, formatter.reflow_cjk(text_s1))
|
38
|
+
assert_equal(text_s2, Sanzang::Formatting.reflow_cjk(text_s1))
|
41
39
|
end
|
42
40
|
|
43
41
|
# UTF-8 (Traditional Chinese)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sanzang
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.2
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lapis Lazuli Texts
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-02
|
11
|
+
date: 2014-03-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: parallel
|
@@ -52,8 +52,8 @@ files:
|
|
52
52
|
- lib/sanzang/command/reflow.rb
|
53
53
|
- lib/sanzang/command/sanzang_cmd.rb
|
54
54
|
- lib/sanzang/command/translate.rb
|
55
|
+
- lib/sanzang/formatting.rb
|
55
56
|
- lib/sanzang/platform.rb
|
56
|
-
- lib/sanzang/text_formatter.rb
|
57
57
|
- lib/sanzang/translation_table.rb
|
58
58
|
- lib/sanzang/translator.rb
|
59
59
|
- lib/sanzang/version.rb
|
@@ -90,7 +90,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
90
90
|
requirements:
|
91
91
|
- parallel ~> 0.8
|
92
92
|
rubyforge_project:
|
93
|
-
rubygems_version: 2.2.
|
93
|
+
rubygems_version: 2.2.2
|
94
94
|
signing_key:
|
95
95
|
specification_version: 4
|
96
96
|
summary: Machine translation from CJK languages
|