sanzang 1.1.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/NEWS.rdoc +6 -0
- data/README.rdoc +1 -1
- data/lib/sanzang.rb +3 -3
- data/lib/sanzang/batch_translator.rb +0 -5
- data/lib/sanzang/command/batch.rb +2 -6
- data/lib/sanzang/command/reflow.rb +2 -6
- data/lib/sanzang/command/sanzang_cmd.rb +6 -11
- data/lib/sanzang/command/translate.rb +1 -6
- data/lib/sanzang/{text_formatter.rb → formatting.rb} +9 -11
- data/lib/sanzang/translation_table.rb +54 -4
- data/lib/sanzang/translator.rb +4 -7
- data/lib/sanzang/version.rb +1 -1
- data/test/tc_reflow_encodings.rb +2 -4
- data/test/tc_simple_translation.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a5bd5aafaee153f9753cdeedf952c8581eaf770b
|
4
|
+
data.tar.gz: 111fc689cb0f19ca0dd20356f50d7483e6a51420
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 17a663b61ef1523e24d76d7d37a65c56ff8d73c2bbb43e3aa226437dbc4b224a96b7ebaa02370bd8054490c2d57885d24f6be5844f6767fa59565705c3c37fa4
|
7
|
+
data.tar.gz: a453e356504a8c334cefcf42455769bcb79bba82af4a48138780a3bbd3a086c474c3fa37659c3a7c33ca38f30c78e5f8237a9caf2f8cc316d95374b71bf31920
|
data/NEWS.rdoc
CHANGED
@@ -2,6 +2,12 @@
|
|
2
2
|
|
3
3
|
== Release History
|
4
4
|
|
5
|
+
=== v1.2.0
|
6
|
+
* Updated vocab building code for faster term matching
|
7
|
+
* Refactored the TextFormatter class into a Formatting module
|
8
|
+
* Added methods for merging translation tables into one another
|
9
|
+
* Consolidated Sanzang module "requires" into a central location
|
10
|
+
|
5
11
|
=== v1.1.2
|
6
12
|
* Cleaned up table loading code to be faster and simpler
|
7
13
|
* Added RDoc option to set documentation encoding to UTF-8 (RDoc 3.x)
|
data/README.rdoc
CHANGED
@@ -40,6 +40,6 @@ command to verify your installation and print version information.
|
|
40
40
|
|
41
41
|
This command should show a summary of your \Sanzang version and environment.
|
42
42
|
|
43
|
-
sanzang 1.
|
43
|
+
sanzang 1.2.0 ruby-2.1.1 x86_64-linux (UTF-8)
|
44
44
|
|
45
45
|
You now have \Sanzang installed on your computer.
|
data/lib/sanzang.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# coding: UTF-8
|
2
2
|
#--
|
3
|
-
# Copyright (C) 2012-
|
3
|
+
# Copyright (C) 2012-2014 Lapis Lazuli Texts
|
4
4
|
#
|
5
5
|
# This program is free software: you can redistribute it and/or modify it under
|
6
6
|
# the terms of the GNU General Public License as published by the Free Software
|
@@ -22,11 +22,11 @@
|
|
22
22
|
module Sanzang
|
23
23
|
end
|
24
24
|
|
25
|
-
require_relative File.join("sanzang", "batch_translator")
|
26
25
|
require_relative File.join("sanzang", "platform")
|
27
|
-
require_relative File.join("sanzang", "
|
26
|
+
require_relative File.join("sanzang", "formatting")
|
28
27
|
require_relative File.join("sanzang", "translation_table")
|
29
28
|
require_relative File.join("sanzang", "translator")
|
29
|
+
require_relative File.join("sanzang", "batch_translator")
|
30
30
|
require_relative File.join("sanzang", "version")
|
31
31
|
|
32
32
|
# The Sanzang::Command module contains Unix style commands utilizing the
|
@@ -15,11 +15,6 @@
|
|
15
15
|
# You should have received a copy of the GNU General Public License along with
|
16
16
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
17
17
|
|
18
|
-
require "parallel"
|
19
|
-
|
20
|
-
require_relative "platform"
|
21
|
-
require_relative "translator"
|
22
|
-
|
23
18
|
module Sanzang
|
24
19
|
|
25
20
|
# BatchTranslator can handle batches of files for translation, and may also
|
@@ -15,12 +15,8 @@
|
|
15
15
|
# You should have received a copy of the GNU General Public License along with
|
16
16
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
17
17
|
|
18
|
-
require
|
19
|
-
|
20
|
-
require_relative File.join("..", "platform")
|
21
|
-
require_relative File.join("..", "translation_table")
|
22
|
-
require_relative File.join("..", "batch_translator")
|
23
|
-
require_relative File.join("..", "version")
|
18
|
+
require 'optparse'
|
19
|
+
require 'parallel'
|
24
20
|
|
25
21
|
module Sanzang::Command
|
26
22
|
|
@@ -15,11 +15,7 @@
|
|
15
15
|
# You should have received a copy of the GNU General Public License along with
|
16
16
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
17
17
|
|
18
|
-
require
|
19
|
-
|
20
|
-
require_relative File.join("..", "platform")
|
21
|
-
require_relative File.join("..", "text_formatter")
|
22
|
-
require_relative File.join("..", "version")
|
18
|
+
require 'optparse'
|
23
19
|
|
24
20
|
module Sanzang::Command
|
25
21
|
|
@@ -61,7 +57,7 @@ module Sanzang::Command
|
|
61
57
|
fin.binmode.set_encoding(@encoding)
|
62
58
|
fout = @outfile ? File.open(@outfile, "w") : $stdout
|
63
59
|
fout.binmode.set_encoding(@encoding)
|
64
|
-
fout.write(Sanzang::
|
60
|
+
fout.write(Sanzang::Formatting.reflow_cjk(fin.read))
|
65
61
|
ensure
|
66
62
|
if defined?(fin) and fin.class == File
|
67
63
|
fin.close if not fin.closed?
|
@@ -15,15 +15,7 @@
|
|
15
15
|
# You should have received a copy of the GNU General Public License along with
|
16
16
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
17
17
|
|
18
|
-
require
|
19
|
-
require "parallel"
|
20
|
-
|
21
|
-
require_relative "reflow"
|
22
|
-
require_relative "translate"
|
23
|
-
require_relative "batch"
|
24
|
-
|
25
|
-
require_relative File.join("..", "platform")
|
26
|
-
require_relative File.join("..", "version")
|
18
|
+
require 'optparse'
|
27
19
|
|
28
20
|
module Sanzang::Command
|
29
21
|
|
@@ -80,6 +72,8 @@ module Sanzang::Command
|
|
80
72
|
# A string giving a listing of platform information
|
81
73
|
#
|
82
74
|
def platform_info
|
75
|
+
require 'parallel'
|
76
|
+
|
83
77
|
info = "host_arch = #{Sanzang::Platform.machine_arch}\n"
|
84
78
|
info << "host_os = #{Sanzang::Platform.os_name}\n"
|
85
79
|
info << "host_processors = #{Sanzang::Platform.processor_count}\n"
|
@@ -96,8 +90,9 @@ module Sanzang::Command
|
|
96
90
|
# This is a string giving a brief one-line summary of version information
|
97
91
|
#
|
98
92
|
def version_info
|
99
|
-
"sanzang #{Sanzang::VERSION}
|
100
|
-
+ "
|
93
|
+
"sanzang #{Sanzang::VERSION} ruby-#{RUBY_VERSION} #{RUBY_PLATFORM} " \
|
94
|
+
+ "(#{Sanzang::Platform.data_encoding})"
|
95
|
+
# "sanzang #{Sanzang::VERSION} #{Sanzang::Platform.data_encoding}"
|
101
96
|
end
|
102
97
|
|
103
98
|
# Name of the command
|
@@ -15,12 +15,7 @@
|
|
15
15
|
# You should have received a copy of the GNU General Public License along with
|
16
16
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
17
17
|
|
18
|
-
require
|
19
|
-
|
20
|
-
require_relative File.join("..", "platform")
|
21
|
-
require_relative File.join("..", "translation_table")
|
22
|
-
require_relative File.join("..", "translator")
|
23
|
-
require_relative File.join("..", "version")
|
18
|
+
require 'optparse'
|
24
19
|
|
25
20
|
module Sanzang::Command
|
26
21
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# coding: UTF-8
|
2
2
|
#--
|
3
|
-
# Copyright (C) 2012-
|
3
|
+
# Copyright (C) 2012-2014 Lapis Lazuli Texts
|
4
4
|
#
|
5
5
|
# This program is free software: you can redistribute it and/or modify it under
|
6
6
|
# the terms of the GNU General Public License as published by the Free Software
|
@@ -15,16 +15,14 @@
|
|
15
15
|
# You should have received a copy of the GNU General Public License along with
|
16
16
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
17
17
|
|
18
|
-
module
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
#
|
27
|
-
class TextFormatter
|
18
|
+
# This module handles formatting of text data especially to prepare the text
|
19
|
+
# for direct translation. This involves reformatting and reflowing text so
|
20
|
+
# that words are not divided between lines, and so the output is well suited
|
21
|
+
# for humans. For practical purposes of readability, lines of text to be
|
22
|
+
# translated should be succinct and easily comprehensible.
|
23
|
+
#
|
24
|
+
module Sanzang::Formatting
|
25
|
+
class << self
|
28
26
|
|
29
27
|
# Given a CJK string of text, reformat the string for greater compatibility
|
30
28
|
# with direct translation, and reflow the text based on its punctuation.
|
@@ -17,9 +17,8 @@
|
|
17
17
|
|
18
18
|
module Sanzang
|
19
19
|
|
20
|
-
# A translation table encapsulates a set of rules for translating with
|
21
|
-
#
|
22
|
-
# storing well-defined translation table data.
|
20
|
+
# A translation table encapsulates a set of rules for translating with the
|
21
|
+
# \Sanzang system.
|
23
22
|
#
|
24
23
|
class TranslationTable
|
25
24
|
|
@@ -39,6 +38,8 @@ module Sanzang
|
|
39
38
|
# - Subsequent fields are equivalent terms in destination languages.
|
40
39
|
# - The number of columns must be consistent for the entire table.
|
41
40
|
#
|
41
|
+
# The rules passed in here may either be a file descriptor or a string.
|
42
|
+
#
|
42
43
|
def initialize(rules)
|
43
44
|
contents = rules.kind_of?(String) ? rules : rules.read
|
44
45
|
@source_encoding = contents.encoding
|
@@ -51,8 +52,9 @@ module Sanzang
|
|
51
52
|
end
|
52
53
|
|
53
54
|
@records = contents.strip.split("\n").collect {|r| r.strip.split("|") }
|
55
|
+
@sorted = false
|
54
56
|
check_dims
|
55
|
-
|
57
|
+
#sort!
|
56
58
|
end
|
57
59
|
|
58
60
|
# Retrieve a record by its numeric index.
|
@@ -76,10 +78,18 @@ module Sanzang
|
|
76
78
|
end
|
77
79
|
end
|
78
80
|
|
81
|
+
# Check if the table records are sorted
|
82
|
+
#
|
83
|
+
def sorted?
|
84
|
+
@sorted
|
85
|
+
end
|
86
|
+
|
79
87
|
# Reverse sort all records by length
|
80
88
|
#
|
81
89
|
def sort!
|
82
90
|
@records.sort! {|x,y| y[0].size <=> x[0].size }
|
91
|
+
@sorted = true
|
92
|
+
nil
|
83
93
|
end
|
84
94
|
|
85
95
|
# The text encoding used internally for all translation table data
|
@@ -94,6 +104,46 @@ module Sanzang
|
|
94
104
|
@records.find {|rec| rec[0] == term }
|
95
105
|
end
|
96
106
|
|
107
|
+
# Convert to a hash. The original records are the values.
|
108
|
+
#
|
109
|
+
# For example: "A" => ["A", "B", "C"]
|
110
|
+
#
|
111
|
+
def to_h
|
112
|
+
h = Hash.new
|
113
|
+
@records.each {|rec| h[rec[0]] = rec if not h[rec[0]] }
|
114
|
+
h
|
115
|
+
end
|
116
|
+
|
117
|
+
# Only include unique source values. The resulting table is unsorted.
|
118
|
+
#
|
119
|
+
def uniq!
|
120
|
+
@records = to_h.values
|
121
|
+
@sorted = false
|
122
|
+
nil
|
123
|
+
end
|
124
|
+
|
125
|
+
# Merge another table into this one. If the same source term exists in
|
126
|
+
# both tables, then the record from the other table will be used instead.
|
127
|
+
# Note: after a merge, the resulting table is unsorted.
|
128
|
+
#
|
129
|
+
def merge!(tab2)
|
130
|
+
if tab2.width != width
|
131
|
+
raise "Table widths must match when merging tables"
|
132
|
+
end
|
133
|
+
h1 = to_h
|
134
|
+
tab2.records.each do |rec|
|
135
|
+
h1[rec[0]] = rec
|
136
|
+
end
|
137
|
+
@records = h1.values
|
138
|
+
@sorted = false
|
139
|
+
end
|
140
|
+
|
141
|
+
# Return a CSV formatted string
|
142
|
+
#
|
143
|
+
def to_csv
|
144
|
+
@records.map {|r| r.join("|") }.join("\n")
|
145
|
+
end
|
146
|
+
|
97
147
|
# The number of records in the table
|
98
148
|
#
|
99
149
|
def length
|
data/lib/sanzang/translator.rb
CHANGED
@@ -37,13 +37,10 @@ module Sanzang
|
|
37
37
|
# These records represent the vocabulary used by the text.
|
38
38
|
#
|
39
39
|
def text_vocab(source_text)
|
40
|
-
|
41
|
-
@table.records.
|
42
|
-
|
43
|
-
new_table << record
|
44
|
-
end
|
40
|
+
text_copy = String.new(source_text)
|
41
|
+
@table.records.select do |r|
|
42
|
+
text_copy.include?(r[0]) ? text_copy.gsub!(r[0], "\x1F") : false
|
45
43
|
end
|
46
|
-
new_table
|
47
44
|
end
|
48
45
|
|
49
46
|
# Use the TranslationTable of the Translator to create translations for
|
@@ -80,7 +77,7 @@ module Sanzang
|
|
80
77
|
|
81
78
|
listing = ""
|
82
79
|
texts[0].length.times do |line_i|
|
83
|
-
|
80
|
+
texts.length.times do |col_i|
|
84
81
|
listing << "[#{pos + line_i}.#{col_i + 1}] #{texts[col_i][line_i]}" \
|
85
82
|
<< newline
|
86
83
|
end
|
data/lib/sanzang/version.rb
CHANGED
data/test/tc_reflow_encodings.rb
CHANGED
@@ -23,8 +23,7 @@ class TestReflowEncodings < Test::Unit::TestCase
|
|
23
23
|
text_s2 = " 大唐三藏法師玄奘奉\n 詔譯\n \n"
|
24
24
|
text_s1.encode!(encoding)
|
25
25
|
text_s2.encode!(encoding)
|
26
|
-
|
27
|
-
assert_equal(text_s2, formatter.reflow_cjk(text_s1))
|
26
|
+
assert_equal(text_s2, Sanzang::Formatting.reflow_cjk(text_s1))
|
28
27
|
end
|
29
28
|
|
30
29
|
# Han characters, simplified and without double vertical bar. The margin
|
@@ -36,8 +35,7 @@ class TestReflowEncodings < Test::Unit::TestCase
|
|
36
35
|
text_s2 = " 大唐三藏法师玄奘奉\n 诏译\n \n"
|
37
36
|
text_s1.encode!(encoding)
|
38
37
|
text_s2.encode!(encoding)
|
39
|
-
|
40
|
-
assert_equal(text_s2, formatter.reflow_cjk(text_s1))
|
38
|
+
assert_equal(text_s2, Sanzang::Formatting.reflow_cjk(text_s1))
|
41
39
|
end
|
42
40
|
|
43
41
|
# UTF-8 (Traditional Chinese)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sanzang
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lapis Lazuli Texts
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-02
|
11
|
+
date: 2014-03-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: parallel
|
@@ -52,8 +52,8 @@ files:
|
|
52
52
|
- lib/sanzang/command/reflow.rb
|
53
53
|
- lib/sanzang/command/sanzang_cmd.rb
|
54
54
|
- lib/sanzang/command/translate.rb
|
55
|
+
- lib/sanzang/formatting.rb
|
55
56
|
- lib/sanzang/platform.rb
|
56
|
-
- lib/sanzang/text_formatter.rb
|
57
57
|
- lib/sanzang/translation_table.rb
|
58
58
|
- lib/sanzang/translator.rb
|
59
59
|
- lib/sanzang/version.rb
|
@@ -90,7 +90,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
90
90
|
requirements:
|
91
91
|
- parallel ~> 0.8
|
92
92
|
rubyforge_project:
|
93
|
-
rubygems_version: 2.2.
|
93
|
+
rubygems_version: 2.2.2
|
94
94
|
signing_key:
|
95
95
|
specification_version: 4
|
96
96
|
summary: Machine translation from CJK languages
|