sanzang 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/HACKING +54 -0
- data/LICENSE +628 -0
- data/README +280 -0
- data/bin/sanzang-reflow +21 -0
- data/bin/sanzang-translate +21 -0
- data/lib/sanzang.rb +65 -0
- data/lib/sanzang/command/reflow.rb +136 -0
- data/lib/sanzang/command/translate.rb +168 -0
- data/lib/sanzang/text_formatter.rb +71 -0
- data/lib/sanzang/translation_table.rb +113 -0
- data/lib/sanzang/translator.rb +174 -0
- data/lib/sanzang/version.rb +24 -0
- data/test/tc_commands.rb +17 -0
- data/test/tc_reflow_encodings.rb +98 -0
- data/test/tc_simple_translation.rb +97 -0
- data/test/utf-8/batch/file_1.txt +8 -0
- data/test/utf-8/batch/file_2.txt +8 -0
- data/test/utf-8/batch/file_3.txt +8 -0
- data/test/utf-8/batch/file_4.txt +8 -0
- data/test/utf-8/file_1.txt +2 -0
- data/test/utf-8/file_2.txt +2 -0
- data/test/utf-8/file_3.txt +2 -0
- data/test/utf-8/file_4.txt +2 -0
- data/test/utf-8/stage_1.txt +1 -0
- data/test/utf-8/stage_2.txt +2 -0
- data/test/utf-8/stage_3.txt +4 -0
- data/test/utf-8/table.txt +8 -0
- metadata +102 -0
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: UTF-8 -*-
|
3
|
+
#--
|
4
|
+
# Copyright (C) 2012 Lapis Lazuli Texts
|
5
|
+
#
|
6
|
+
# This program is free software: you can redistribute it and/or modify it under
|
7
|
+
# the terms of the GNU General Public License as published by the Free Software
|
8
|
+
# Foundation, either version 3 of the License, or (at your option) any later
|
9
|
+
# version.
|
10
|
+
#
|
11
|
+
# This program is distributed in the hope that it will be useful, but WITHOUT
|
12
|
+
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
13
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
|
14
|
+
# details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License along with
|
17
|
+
# this program. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
|
19
|
+
module Sanzang

  # The current version number of Sanzang. The string is frozen so that the
  # shared constant cannot be modified in place by code that reads it.
  VERSION = "0.0.1".freeze

end
|
data/test/tc_commands.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: UTF-8 -*-
|
3
|
+
|
4
|
+
require "test/unit"
|
5
|
+
|
6
|
+
require_relative File.join("..", "lib", "sanzang")
|
7
|
+
|
8
|
+
class TestCommands < Test::Unit::TestCase

  # Placeholder helper; the body is empty, so it ignores +args+ and returns
  # nil. NOTE(review): presumably meant to drive the "reflow" command --
  # confirm before relying on it.
  def run_reflow(args); end

  # Placeholder helper; the body is empty, so it ignores +args+ and returns
  # nil. NOTE(review): presumably meant to drive the "translate" command --
  # confirm before relying on it.
  def run_translate(args); end

end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: UTF-8 -*-
|
3
|
+
|
4
|
+
require "test/unit"
|
5
|
+
|
6
|
+
require_relative File.join("..", "lib", "sanzang")
|
7
|
+
|
8
|
+
# Test "reflow" operation with all major encodings for conversion and accuracy.
|
9
|
+
#
|
10
|
+
# Most encodings deemed as "important" here are Unicode encodings and those
|
11
|
+
# commonly used for Chinese. Some encodings do not function due to converters
|
12
|
+
# for these encodings being unimplemented in Ruby 1.9. Such encodings include
|
13
|
+
# the following:
|
14
|
+
#
|
15
|
+
# * EUC-TW (Traditional Chinese)
|
16
|
+
#
|
17
|
+
class TestReflowEncodings < Test::Unit::TestCase

  # Reflow check using traditional Han characters. The input carries a
  # CBETA-style margin, which the text formatter is expected to strip out.
  #
  def reflow_zh_hant(encoding)
    check_reflow(
      "T31n1586_p0060a19(00)║ 大唐三藏法師玄奘奉 詔譯",
      " 大唐三藏法師玄奘奉\n 詔譯\n \n",
      encoding)
  end

  # Reflow check using simplified Han characters. No margin is included
  # because GB2312 cannot represent the "double bar" (U+2551) character.
  #
  def reflow_zh_hans(encoding)
    check_reflow(
      " 大唐三藏法师玄奘奉 诏译",
      " 大唐三藏法师玄奘奉\n 诏译\n \n",
      encoding)
  end

  # UTF-8 (Traditional Chinese)
  #
  def test_reflow_hanzi_utf_8
    reflow_zh_hant("UTF-8")
  end

  # UTF-16LE (Traditional Chinese)
  #
  def test_reflow_hanzi_utf_16le
    reflow_zh_hant("UTF-16LE")
  end

  # UTF-16BE (Traditional Chinese)
  #
  def test_reflow_hanzi_utf_16be
    reflow_zh_hant("UTF-16BE")
  end

  # UTF-32LE (Traditional Chinese)
  #
  def test_reflow_hanzi_utf_32le
    reflow_zh_hant("UTF-32LE")
  end

  # UTF-32BE (Traditional Chinese)
  #
  def test_reflow_hanzi_utf_32be
    reflow_zh_hant("UTF-32BE")
  end

  # Big5 (Traditional Chinese)
  #
  def test_reflow_hanzi_big5
    reflow_zh_hant("Big5")
  end

  # GB2312 (Simplified Chinese)
  # Double vertical bar glyph (U+2551) is not present in GB2312
  #
  def test_reflow_hanzi_gb2312
    reflow_zh_hans("GB2312")
  end

  # GBK (Traditional Chinese)
  #
  def test_reflow_hanzi_gbk
    reflow_zh_hant("GBK")
  end

  # GB18030 (Traditional Chinese)
  #
  def test_reflow_hanzi_gb18030
    reflow_zh_hant("GB18030")
  end

  private

  # Transcodes both the raw text and the expected text to +encoding+, runs
  # the raw text through a new TextFormatter, and asserts that the reflowed
  # result equals the expected text.
  def check_reflow(raw, reflowed, encoding)
    input = raw.encode(encoding)
    expected = reflowed.encode(encoding)
    formatter = Sanzang::TextFormatter.new
    assert_equal(expected, formatter.reflow_cjk_text(input))
  end

end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: UTF-8 -*-
|
3
|
+
|
4
|
+
require "test/unit"
|
5
|
+
|
6
|
+
require_relative File.join("..", "lib", "sanzang")
|
7
|
+
|
8
|
+
# assert_nothing_raised
|
9
|
+
# assert_equal(x, y)
|
10
|
+
# assert(stmt, "Error message")
|
11
|
+
#
|
12
|
+
class TestSanzang < Test::Unit::TestCase

  # Translation table text used by the string-based tests: eight records,
  # one per line, each written as three "|"-separated fields wrapped in
  # "~|" and "|~".
  def table_string
    "~|三藏| sānzàng| tripiṭaka|~
~|法師| fǎshī| dharma-master|~
~|玄奘| xuánzàng| xuanzang|~
~|奉| fèng| reverently|~
~|唐| táng| tang|~
~|大| dà| great|~
~|詔| zhào| imperial-order|~
~|譯| yì| translate/interpret|~"
  end

  # Stage 1: a single raw input line that still carries its margin prefix
  # and ends with a CRLF.
  def stage_1
    "T31n1586_p0060a19(00)║ 大唐三藏法師玄奘奉 詔譯\r\n"
  end

  # Stage 2: the text expected from reflowing stage 1; also the input used
  # for the translation listing test.
  def stage_2
    " 大唐三藏法師玄奘奉\r\n 詔譯\r\n"
  end

  # Stage 3: the listing expected from translating stage 2 with the table
  # from #table_string. Each source line yields three numbered lines
  # followed by a blank line.
  def stage_3
    "[1.1] 大唐三藏法師玄奘奉\r\n" \
      << "[1.2] dà táng sānzàng fǎshī xuánzàng fèng\r\n" \
      << "[1.3] great tang tripiṭaka dharma-master xuanzang " \
      << "reverently\r\n" \
      << "\r\n" \
      << "[2.1] 詔譯\r\n" \
      << "[2.2] zhào yì\r\n" \
      << "[2.3] imperial-order translate/interpret\r\n" \
      << "\r\n"
  end

  # Loads the table fixture from an open file handle and checks the basic
  # shape of the resulting table: integer width and length, an array of
  # array records of equal length, and non-zero dimensions.
  def test_translation_table
    table_path = File.join(File.dirname(__FILE__), "utf-8", "table.txt")
    # Block form guarantees the handle is closed even if the constructor
    # raises while reading the file.
    table = File.open(table_path, "rb", encoding: "UTF-8") do |fin|
      Sanzang::TranslationTable.new(fin)
    end
    # Integer rather than Fixnum: Fixnum was removed in Ruby 3.2, while
    # Integer matches on every Ruby version from 1.9 onwards.
    assert_kind_of(Integer, table.width, "Table width undefined")
    assert_kind_of(Integer, table.length, "Table length undefined")
    assert_kind_of(Array, table.records, "Table contents not an array")
    rec0_length = table.records.first.length
    table.records.each do |rec|
      assert_kind_of(Array, rec, "Malformed table records")
      assert_equal(rec0_length, rec.length, "Inconsistent table records")
    end
    assert(table.width > 0, "Zero-width table")
    assert(table.length > 0, "Zero-length table")
  end

  # Reflowing the stage 1 text must produce exactly the stage 2 text.
  def test_reflow_cjk_string
    text = Sanzang::TextFormatter.new.reflow_cjk_text(stage_1)
    assert_equal(stage_2, text)
  end

  # Generating a listing for the stage 2 text with the in-memory table must
  # produce exactly the stage 3 text.
  def test_translate_string
    table = Sanzang::TranslationTable.new(table_string)
    text = Sanzang::Translator.new(table).gen_listing(stage_2)
    assert_equal(stage_3, text)
  end

  # Builds a table from the fixture path and passes the stage 2 and stage 3
  # fixture paths to translate_io. There is no assertion here, so the test
  # only fails if one of the calls raises.
  def test_translate_file
    fixture_dir = File.join(File.dirname(__FILE__), "utf-8")
    table_path = File.join(fixture_dir, "table.txt")
    s2_path = File.join(fixture_dir, "stage_2.txt")
    s3_path = File.join(fixture_dir, "stage_3.txt")
    table = Sanzang::TranslationTable.new(table_path)
    translator = Sanzang::Translator.new(table)
    translator.translate_io(s2_path, s3_path)
  end

  # Checks that the translator reports a positive processor count.
  def test_translator_parallel
    table = Sanzang::TranslationTable.new(table_string)
    translator = Sanzang::Translator.new(table)
    # The return value is intentionally unused; the call only has to
    # complete without raising.
    translator.runs_parallel?
    assert(translator.processor_count > 0, "Processor count is not positive")
  end

  # Passes every "file_*.txt" fixture and the "batch" directory to
  # translate_batch. There is no assertion here, so the test only fails if
  # the call raises.
  # NOTE(review): the meaning of the trailing +false+ argument is not
  # visible from this file -- confirm against Translator#translate_batch.
  def test_translate_batch
    fixture_dir = File.join(File.dirname(__FILE__), "utf-8")
    table = Sanzang::TranslationTable.new(table_string)
    translator = Sanzang::Translator.new(table)
    translator.translate_batch(
      Dir.glob(File.join(fixture_dir, "file_*.txt")),
      File.join(fixture_dir, "batch"), false)
  end

end
|
@@ -0,0 +1 @@
|
|
1
|
+
T31n1586_p0060a19(00)║ 大唐三藏法師玄奘奉 詔譯
|
metadata
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sanzang
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Lapis Lazuli Texts
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-10-02 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: parallel
|
16
|
+
requirement: &81673770 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 0.5.18
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *81673770
|
25
|
+
description: Sanzang is an application built for direct machine translation of natural
|
26
|
+
languages. This application is particularly suitable as a translation aid for
|
27
|
+
ancient Chinese texts. Sanzang uses simple direct translation rules organized into
|
28
|
+
translation tables, which are stored in a straightforward text format. Batch translations
|
29
|
+
utilize multiprocessing to translate files in parallel, naturally scaling to the
|
30
|
+
number of processors available. Sanzang is available under the GNU General Public
|
31
|
+
License, version 3.
|
32
|
+
email:
|
33
|
+
- lapislazulitexts@gmail.com
|
34
|
+
executables:
|
35
|
+
- sanzang-reflow
|
36
|
+
- sanzang-translate
|
37
|
+
extensions: []
|
38
|
+
extra_rdoc_files:
|
39
|
+
- HACKING
|
40
|
+
- LICENSE
|
41
|
+
- README
|
42
|
+
files:
|
43
|
+
- bin/sanzang-reflow
|
44
|
+
- bin/sanzang-translate
|
45
|
+
- test/tc_commands.rb
|
46
|
+
- test/utf-8/batch/file_4.txt
|
47
|
+
- test/utf-8/batch/file_3.txt
|
48
|
+
- test/utf-8/batch/file_1.txt
|
49
|
+
- test/utf-8/batch/file_2.txt
|
50
|
+
- test/utf-8/stage_3.txt
|
51
|
+
- test/utf-8/file_4.txt
|
52
|
+
- test/utf-8/file_3.txt
|
53
|
+
- test/utf-8/file_1.txt
|
54
|
+
- test/utf-8/stage_1.txt
|
55
|
+
- test/utf-8/table.txt
|
56
|
+
- test/utf-8/file_2.txt
|
57
|
+
- test/utf-8/stage_2.txt
|
58
|
+
- test/tc_reflow_encodings.rb
|
59
|
+
- test/tc_simple_translation.rb
|
60
|
+
- lib/sanzang.rb
|
61
|
+
- lib/sanzang/text_formatter.rb
|
62
|
+
- lib/sanzang/translation_table.rb
|
63
|
+
- lib/sanzang/version.rb
|
64
|
+
- lib/sanzang/command/reflow.rb
|
65
|
+
- lib/sanzang/command/translate.rb
|
66
|
+
- lib/sanzang/translator.rb
|
67
|
+
- HACKING
|
68
|
+
- LICENSE
|
69
|
+
- README
|
70
|
+
homepage: http://www.lapislazulitexts.com
|
71
|
+
licenses:
|
72
|
+
- GPL-3
|
73
|
+
post_install_message:
|
74
|
+
rdoc_options:
|
75
|
+
- --main
|
76
|
+
- README
|
77
|
+
- --title
|
78
|
+
- Sanzang
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
none: false
|
83
|
+
requirements:
|
84
|
+
- - ! '>='
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: 1.9.0
|
87
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
88
|
+
none: false
|
89
|
+
requirements:
|
90
|
+
- - ! '>='
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: '0'
|
93
|
+
requirements: []
|
94
|
+
rubyforge_project:
|
95
|
+
rubygems_version: 1.8.11
|
96
|
+
signing_key:
|
97
|
+
specification_version: 3
|
98
|
+
summary: Sanzang
|
99
|
+
test_files:
|
100
|
+
- test/tc_commands.rb
|
101
|
+
- test/tc_reflow_encodings.rb
|
102
|
+
- test/tc_simple_translation.rb
|