sanzang 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/HACKING +54 -0
- data/LICENSE +628 -0
- data/README +280 -0
- data/bin/sanzang-reflow +21 -0
- data/bin/sanzang-translate +21 -0
- data/lib/sanzang.rb +65 -0
- data/lib/sanzang/command/reflow.rb +136 -0
- data/lib/sanzang/command/translate.rb +168 -0
- data/lib/sanzang/text_formatter.rb +71 -0
- data/lib/sanzang/translation_table.rb +113 -0
- data/lib/sanzang/translator.rb +174 -0
- data/lib/sanzang/version.rb +24 -0
- data/test/tc_commands.rb +17 -0
- data/test/tc_reflow_encodings.rb +98 -0
- data/test/tc_simple_translation.rb +97 -0
- data/test/utf-8/batch/file_1.txt +8 -0
- data/test/utf-8/batch/file_2.txt +8 -0
- data/test/utf-8/batch/file_3.txt +8 -0
- data/test/utf-8/batch/file_4.txt +8 -0
- data/test/utf-8/file_1.txt +2 -0
- data/test/utf-8/file_2.txt +2 -0
- data/test/utf-8/file_3.txt +2 -0
- data/test/utf-8/file_4.txt +2 -0
- data/test/utf-8/stage_1.txt +1 -0
- data/test/utf-8/stage_2.txt +2 -0
- data/test/utf-8/stage_3.txt +4 -0
- data/test/utf-8/table.txt +8 -0
- metadata +102 -0
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: UTF-8 -*-
|
3
|
+
#--
|
4
|
+
# Copyright (C) 2012 Lapis Lazuli Texts
|
5
|
+
#
|
6
|
+
# This program is free software: you can redistribute it and/or modify it under
|
7
|
+
# the terms of the GNU General Public License as published by the Free Software
|
8
|
+
# Foundation, either version 3 of the License, or (at your option) any later
|
9
|
+
# version.
|
10
|
+
#
|
11
|
+
# This program is distributed in the hope that it will be useful, but WITHOUT
|
12
|
+
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
13
|
+
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
|
14
|
+
# details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License along with
|
17
|
+
# this program. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
|
19
|
+
module Sanzang

  # The current version number of Sanzang.
  #
  # Frozen so that the shared constant cannot be mutated in place by callers.
  VERSION = "0.0.1".freeze

end
|
data/test/tc_commands.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: UTF-8 -*-
|
3
|
+
|
4
|
+
require "test/unit"
|
5
|
+
|
6
|
+
require_relative File.join("..", "lib", "sanzang")
|
7
|
+
|
8
|
+
class TestCommands < Test::Unit::TestCase

  # Unimplemented stub: accepts +args+ and returns nil.
  # NOTE(review): presumably intended to drive the "sanzang-reflow" command;
  # confirm before relying on it.
  def run_reflow(args)
    nil
  end

  # Unimplemented stub: accepts +args+ and returns nil.
  # NOTE(review): presumably intended to drive the "sanzang-translate"
  # command; confirm before relying on it.
  def run_translate(args)
    nil
  end

end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: UTF-8 -*-
|
3
|
+
|
4
|
+
require "test/unit"
|
5
|
+
|
6
|
+
require_relative File.join("..", "lib", "sanzang")
|
7
|
+
|
8
|
+
# Test "reflow" operation with all major encodings for conversion and accuracy.
|
9
|
+
#
|
10
|
+
# Most encodings deemed as "important" here are Unicode encodings and those
|
11
|
+
# commonly used for Chinese. Some encodings do not function due to converters
|
12
|
+
# for these encodings being unimplemented in Ruby 1.9. Such encodings include
|
13
|
+
# the following:
|
14
|
+
#
|
15
|
+
# * EUC-TW (Traditional Chinese)
|
16
|
+
#
|
17
|
+
class TestReflowEncodings < Test::Unit::TestCase

  # Han characters, traditional, including a CBETA-style margin, which should
  # be automatically stripped out by the text formatter.
  #
  # encoding:: name of the encoding both the input and the expected output
  #            are transcoded to before reflowing.
  #
  def reflow_zh_hant(encoding)
    assert_reflow(
      "T31n1586_p0060a19(00)║ 大唐三藏法師玄奘奉 詔譯",
      " 大唐三藏法師玄奘奉\n 詔譯\n \n",
      encoding)
  end

  # Han characters, simplified and without double vertical bar. The margin
  # was dropped from the text due to GB2312 not supporting the "double bar"
  # (U+2551) character.
  #
  # encoding:: name of the encoding both the input and the expected output
  #            are transcoded to before reflowing.
  #
  def reflow_zh_hans(encoding)
    assert_reflow(
      " 大唐三藏法师玄奘奉 诏译",
      " 大唐三藏法师玄奘奉\n 诏译\n \n",
      encoding)
  end

  # UTF-8 (Traditional Chinese)
  #
  def test_reflow_hanzi_utf_8
    reflow_zh_hant("UTF-8")
  end

  # UTF-16LE (Traditional Chinese)
  #
  def test_reflow_hanzi_utf_16le
    reflow_zh_hant("UTF-16LE")
  end

  # UTF-16BE (Traditional Chinese)
  #
  def test_reflow_hanzi_utf_16be
    reflow_zh_hant("UTF-16BE")
  end

  # UTF-32LE (Traditional Chinese)
  #
  def test_reflow_hanzi_utf_32le
    reflow_zh_hant("UTF-32LE")
  end

  # UTF-32BE (Traditional Chinese)
  #
  def test_reflow_hanzi_utf_32be
    reflow_zh_hant("UTF-32BE")
  end

  # Big5 (Traditional Chinese)
  #
  def test_reflow_hanzi_big5
    reflow_zh_hant("Big5")
  end

  # GB2312 (Simplified Chinese)
  # Double vertical bar glyph (U+2551) is not present in GB2312
  #
  def test_reflow_hanzi_gb2312
    reflow_zh_hans("GB2312")
  end

  # GBK (Traditional Chinese)
  #
  def test_reflow_hanzi_gbk
    reflow_zh_hant("GBK")
  end

  # GB18030 (Traditional Chinese)
  #
  def test_reflow_hanzi_gb18030
    reflow_zh_hant("GB18030")
  end

  private

  # Transcodes +source+ and +expected+ to +encoding+, reflows the transcoded
  # source and asserts that the result equals the transcoded expectation.
  #
  # Uses the non-mutating String#encode so the string literals passed in are
  # never modified in place (safe with frozen string literals).
  #
  def assert_reflow(source, expected, encoding)
    text_s1 = source.encode(encoding)
    text_s2 = expected.encode(encoding)
    formatter = Sanzang::TextFormatter.new
    assert_equal(text_s2, formatter.reflow_cjk_text(text_s1))
  end

end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- encoding: UTF-8 -*-
|
3
|
+
|
4
|
+
require "test/unit"
|
5
|
+
|
6
|
+
require_relative File.join("..", "lib", "sanzang")
|
7
|
+
|
8
|
+
# assert_nothing_raised
|
9
|
+
# assert_equal(x, y)
|
10
|
+
# assert(stmt, "Error message")
|
11
|
+
#
|
12
|
+
class TestSanzang < Test::Unit::TestCase

  # Translation table source used by the string-based tests: one
  # "~|...|~" record per line, records separated by a newline.
  def table_string
    [
      "~|三藏| sānzàng| tripiṭaka|~",
      "~|法師| fǎshī| dharma-master|~",
      "~|玄奘| xuánzàng| xuanzang|~",
      "~|奉| fèng| reverently|~",
      "~|唐| táng| tang|~",
      "~|大| dà| great|~",
      "~|詔| zhào| imperial-order|~",
      "~|譯| yì| translate/interpret|~"
    ].join("\n")
  end

  # Raw input line, including the CBETA-style margin.
  def stage_1
    "T31n1586_p0060a19(00)║ 大唐三藏法師玄奘奉 詔譯\r\n"
  end

  # Expected result of reflowing stage_1.
  def stage_2
    " 大唐三藏法師玄奘奉\r\n 詔譯\r\n"
  end

  # Expected translation listing generated from stage_2.
  def stage_3
    [
      "[1.1] 大唐三藏法師玄奘奉\r\n",
      "[1.2] dà táng sānzàng fǎshī xuánzàng fèng\r\n",
      "[1.3] great tang tripiṭaka dharma-master xuanzang ",
      "reverently\r\n",
      "\r\n",
      "[2.1] 詔譯\r\n",
      "[2.2] zhào yì\r\n",
      "[2.3] imperial-order translate/interpret\r\n",
      "\r\n"
    ].join
  end

  # Loads the table fixture from an IO object and checks its basic shape:
  # integer width/length, an array of equally sized array records, and a
  # non-empty table.
  def test_translation_table
    # Block form guarantees the handle is closed even if the constructor raises.
    table = File.open(fixture_path("table.txt"), "rb", encoding: "UTF-8") do |fin|
      Sanzang::TranslationTable.new(fin)
    end
    # Integer rather than Fixnum: Fixnum was deprecated in Ruby 2.4 and
    # removed in 3.2, while Integer matches on every supported version.
    assert(table.width.is_a?(Integer), "Table width undefined")
    assert(table.length.is_a?(Integer), "Table length undefined")
    assert(table.records.class == Array, "Table contents not an array")
    rec0_length = table.records[0].length
    table.records.each do |rec|
      assert(rec.class == Array, "Malformed table records")
      assert(rec.length == rec0_length, "Inconsistent table records")
    end
    assert(table.width > 0, "Zero-width table")
    assert(table.length > 0, "Zero-length table")
  end

  # Reflowing stage_1 must yield stage_2.
  def test_reflow_cjk_string
    text = Sanzang::TextFormatter.new.reflow_cjk_text(stage_1)
    assert_equal(stage_2, text)
  end

  # Generating a listing from stage_2 with the in-memory table must yield
  # stage_3.
  def test_translate_string
    table = Sanzang::TranslationTable.new(table_string)
    text = Sanzang::Translator.new(table).gen_listing(stage_2)
    assert_equal(stage_3, text)
  end

  # Smoke test: runs translate_io with the stage_2 and stage_3 fixture paths.
  # NOTE(review): no assertion is made on the output, and stage_3.txt appears
  # to be passed as the output path — confirm this is not overwriting an
  # expected-result fixture.
  def test_translate_file
    table = Sanzang::TranslationTable.new(fixture_path("table.txt"))
    translator = Sanzang::Translator.new(table)
    translator.translate_io(fixture_path("stage_2.txt"),
                            fixture_path("stage_3.txt"))
  end

  # Checks that the parallelism query can be called and that a positive
  # processor count is reported.
  def test_translator_parallel
    table = Sanzang::TranslationTable.new(table_string)
    translator = Sanzang::Translator.new(table)
    # Return value is intentionally ignored; the call only has to not raise.
    translator.runs_parallel?
    assert(translator.processor_count > 0,
           "Processor count not greater than zero")
  end

  # Smoke test: batch-translates the file_*.txt fixtures into the batch
  # directory.
  # NOTE(review): the meaning of the trailing +false+ argument is not visible
  # here — confirm against Translator#translate_batch.
  def test_translate_batch
    table = Sanzang::TranslationTable.new(table_string)
    translator = Sanzang::Translator.new(table)
    translator.translate_batch(
      Dir.glob(fixture_path("file_*.txt")),
      fixture_path("batch"), false)
  end

  private

  # Builds a path below the "utf-8" fixture directory next to this file.
  def fixture_path(*parts)
    File.join(File.dirname(__FILE__), "utf-8", *parts)
  end

end
|
@@ -0,0 +1 @@
|
|
1
|
+
T31n1586_p0060a19(00)║ 大唐三藏法師玄奘奉 詔譯
|
metadata
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sanzang
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Lapis Lazuli Texts
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-10-02 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: parallel
|
16
|
+
requirement: &81673770 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 0.5.18
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *81673770
|
25
|
+
description: Sanzang is an application built for direct machine translation of natural
|
26
|
+
languages. This application is particularly suitable as a translation aid for
|
27
|
+
ancient Chinese texts. Sanzang uses simple direct translation rules organized into
|
28
|
+
translation tables, which are stored in a straightforward text format. Batch translations
|
29
|
+
utilize multiprocessing to translate files in parallel, naturally scaling to the
|
30
|
+
number of processors available. Sanzang is available under the GNU General Public
|
31
|
+
License, version 3.
|
32
|
+
email:
|
33
|
+
- lapislazulitexts@gmail.com
|
34
|
+
executables:
|
35
|
+
- sanzang-reflow
|
36
|
+
- sanzang-translate
|
37
|
+
extensions: []
|
38
|
+
extra_rdoc_files:
|
39
|
+
- HACKING
|
40
|
+
- LICENSE
|
41
|
+
- README
|
42
|
+
files:
|
43
|
+
- bin/sanzang-reflow
|
44
|
+
- bin/sanzang-translate
|
45
|
+
- test/tc_commands.rb
|
46
|
+
- test/utf-8/batch/file_4.txt
|
47
|
+
- test/utf-8/batch/file_3.txt
|
48
|
+
- test/utf-8/batch/file_1.txt
|
49
|
+
- test/utf-8/batch/file_2.txt
|
50
|
+
- test/utf-8/stage_3.txt
|
51
|
+
- test/utf-8/file_4.txt
|
52
|
+
- test/utf-8/file_3.txt
|
53
|
+
- test/utf-8/file_1.txt
|
54
|
+
- test/utf-8/stage_1.txt
|
55
|
+
- test/utf-8/table.txt
|
56
|
+
- test/utf-8/file_2.txt
|
57
|
+
- test/utf-8/stage_2.txt
|
58
|
+
- test/tc_reflow_encodings.rb
|
59
|
+
- test/tc_simple_translation.rb
|
60
|
+
- lib/sanzang.rb
|
61
|
+
- lib/sanzang/text_formatter.rb
|
62
|
+
- lib/sanzang/translation_table.rb
|
63
|
+
- lib/sanzang/version.rb
|
64
|
+
- lib/sanzang/command/reflow.rb
|
65
|
+
- lib/sanzang/command/translate.rb
|
66
|
+
- lib/sanzang/translator.rb
|
67
|
+
- HACKING
|
68
|
+
- LICENSE
|
69
|
+
- README
|
70
|
+
homepage: http://www.lapislazulitexts.com
|
71
|
+
licenses:
|
72
|
+
- GPL-3
|
73
|
+
post_install_message:
|
74
|
+
rdoc_options:
|
75
|
+
- --main
|
76
|
+
- README
|
77
|
+
- --title
|
78
|
+
- Sanzang
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
none: false
|
83
|
+
requirements:
|
84
|
+
- - ! '>='
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: 1.9.0
|
87
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
88
|
+
none: false
|
89
|
+
requirements:
|
90
|
+
- - ! '>='
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: '0'
|
93
|
+
requirements: []
|
94
|
+
rubyforge_project:
|
95
|
+
rubygems_version: 1.8.11
|
96
|
+
signing_key:
|
97
|
+
specification_version: 3
|
98
|
+
summary: Sanzang
|
99
|
+
test_files:
|
100
|
+
- test/tc_commands.rb
|
101
|
+
- test/tc_reflow_encodings.rb
|
102
|
+
- test/tc_simple_translation.rb
|