sanzang 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+ #--
4
+ # Copyright (C) 2012 Lapis Lazuli Texts
5
+ #
6
+ # This program is free software: you can redistribute it and/or modify it under
7
+ # the terms of the GNU General Public License as published by the Free Software
8
+ # Foundation, either version 3 of the License, or (at your option) any later
9
+ # version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful, but WITHOUT
12
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14
+ # details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License along with
17
+ # this program. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ module Sanzang
20
+
21
+ # The current version number of Sanzang.
22
+ VERSION = "0.0.1"
23
+
24
+ end
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+
4
+ require "test/unit"
5
+
6
+ require_relative File.join("..", "lib", "sanzang")
7
+
8
+ class TestCommands < Test::Unit::TestCase
9
+
10
+ def run_reflow(args)
11
+ end
12
+
13
+ def run_translate(args)
14
+ end
15
+
16
+
17
+ end
@@ -0,0 +1,98 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+
4
+ require "test/unit"
5
+
6
+ require_relative File.join("..", "lib", "sanzang")
7
+
8
+ # Test "reflow" operation with all major encodings for conversion and accuracy.
9
+ #
10
+ # Most encodings deemed as "important" here are Unicode encodings and those
11
+ # commonly used for Chinese. Some encodings do not function due to converters
12
+ # for these encodings being unimplemented in Ruby 1.9. Such encodings include
13
+ # the following:
14
+ #
15
+ # * EUC-TW (Traditional Chinese)
16
+ #
17
+ class TestReflowEncodings < Test::Unit::TestCase
18
+
19
+ # Han characters, traditional, including a CBETA-style margin, which should
20
+ # be automatically stripped out by the text formatter.
21
+ #
22
+ def reflow_zh_hant(encoding)
23
+ text_s1 = "T31n1586_p0060a19(00)║    大唐三藏法師玄奘奉 詔譯"
24
+ text_s2 = "    大唐三藏法師玄奘奉\n 詔譯\n \n"
25
+ text_s1.encode!(encoding)
26
+ text_s2.encode!(encoding)
27
+ formatter = Sanzang::TextFormatter.new
28
+ assert_equal(text_s2, formatter.reflow_cjk_text(text_s1))
29
+ end
30
+
31
+ # Han characters, simplified and without double vertical bar. The margin
32
+ # was dropped from the text due to GB2312 not supporting the "double bar"
33
+ # (U+2551) character.
34
+ #
35
+ def reflow_zh_hans(encoding)
36
+ text_s1 = "    大唐三藏法师玄奘奉 诏译"
37
+ text_s2 = "    大唐三藏法师玄奘奉\n 诏译\n \n"
38
+ text_s1.encode!(encoding)
39
+ text_s2.encode!(encoding)
40
+ formatter = Sanzang::TextFormatter.new
41
+ assert_equal(text_s2, formatter.reflow_cjk_text(text_s1))
42
+ end
43
+
44
+ # UTF-8 (Traditional Chinese)
45
+ #
46
+ def test_reflow_hanzi_utf_8
47
+ reflow_zh_hant("UTF-8")
48
+ end
49
+
50
+ # UTF-16LE (Traditional Chinese)
51
+ #
52
+ def test_reflow_hanzi_utf_16le
53
+ reflow_zh_hant("UTF-16LE")
54
+ end
55
+
56
+ # UTF-16BE (Traditional Chinese)
57
+ #
58
+ def test_reflow_hanzi_utf_16be
59
+ reflow_zh_hant("UTF-16BE")
60
+ end
61
+
62
+ # UTF-32LE (Traditional Chinese)
63
+ #
64
+ def test_reflow_hanzi_utf_32le
65
+ reflow_zh_hant("UTF-32LE")
66
+ end
67
+
68
+ # UTF-32BE (Traditional Chinese)
69
+ #
70
+ def test_reflow_hanzi_utf_32be
71
+ reflow_zh_hant("UTF-32BE")
72
+ end
73
+
74
+ # Big5 (Traditional Chinese)
75
+ #
76
+ def test_reflow_hanzi_big5
77
+ reflow_zh_hant("Big5")
78
+ end
79
+
80
+ # GB2312 (Simplified Chinese)
81
+ # Double vertical bar glyph (U+2551) is not present in GB2312
82
+ #
83
+ def test_reflow_hanzi_gb2312
84
+ reflow_zh_hans("GB2312")
85
+ end
86
+
87
+ # GBK (Traditional Chinese)
88
+ #
89
+ def test_reflow_hanzi_gbk
90
+ reflow_zh_hant("GBK")
91
+ end
92
+
93
+ # GB18030 (Traditional Chinese)
94
+ #
95
+ def test_reflow_hanzi_gb18030
96
+ reflow_zh_hant("GB18030")
97
+ end
98
+ end
@@ -0,0 +1,97 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+
4
+ require "test/unit"
5
+
6
+ require_relative File.join("..", "lib", "sanzang")
7
+
8
+ # assert_nothing_raised
9
+ # assert_equal(x, y)
10
+ # assert(stmt, "Error message")
11
+ #
12
+ class TestSanzang < Test::Unit::TestCase
13
+
14
+ def table_string
15
+ "~|三藏| sānzàng| tripiṭaka|~
16
+ ~|法師| fǎshī| dharma-master|~
17
+ ~|玄奘| xuánzàng| xuanzang|~
18
+ ~|奉| fèng| reverently|~
19
+ ~|唐| táng| tang|~
20
+ ~|大| dà| great|~
21
+ ~|詔| zhào| imperial-order|~
22
+ ~|譯| yì| translate/interpret|~"
23
+ end
24
+
25
+ def stage_1
26
+ "T31n1586_p0060a19(00)║    大唐三藏法師玄奘奉 詔譯\r\n"
27
+ end
28
+
29
+ def stage_2
30
+ "    大唐三藏法師玄奘奉\r\n 詔譯\r\n"
31
+ end
32
+
33
+ def stage_3
34
+ "[1.1]     大唐三藏法師玄奘奉\r\n" \
35
+ << "[1.2]      dà táng sānzàng fǎshī xuánzàng fèng\r\n" \
36
+ << "[1.3]      great tang tripiṭaka dharma-master xuanzang " \
37
+ << "reverently\r\n" \
38
+ << "\r\n" \
39
+ << "[2.1]  詔譯\r\n" \
40
+ << "[2.2]   zhào yì\r\n" \
41
+ << "[2.3]   imperial-order translate/interpret\r\n" \
42
+ << "\r\n"
43
+ end
44
+
45
+ def test_translation_table
46
+ table_path = File.join(File.dirname(__FILE__), "utf-8", "table.txt")
47
+ fin = File.open(table_path, "rb", encoding: "UTF-8")
48
+ table = Sanzang::TranslationTable.new(fin)
49
+ fin.close
50
+ assert(table.width.class == Fixnum, "Table width undefined")
51
+ assert(table.length.class == Fixnum, "Table length undefined")
52
+ assert(table.records.class == Array, "Table contents not an array")
53
+ rec0_length = table.records[0].length
54
+ table.records.each do |rec|
55
+ assert(rec.class == Array, "Malformed table records")
56
+ assert(rec.length == rec0_length, "Inconsistent table records")
57
+ end
58
+ assert(table.width > 0, "Zero-width table")
59
+ assert(table.length > 0, "Zero-length table")
60
+ end
61
+
62
+ def test_reflow_cjk_string
63
+ text = Sanzang::TextFormatter.new.reflow_cjk_text(stage_1())
64
+ assert_equal(stage_2(), text)
65
+ end
66
+
67
+ def test_translate_string
68
+ table = Sanzang::TranslationTable.new(table_string())
69
+ text = Sanzang::Translator.new(table).gen_listing(stage_2())
70
+ assert_equal(stage_3(), text)
71
+ end
72
+
73
+ def test_translate_file
74
+ table_path = File.join(File.dirname(__FILE__), "utf-8", "table.txt")
75
+ s2_path = File.join(File.dirname(__FILE__), "utf-8", "stage_2.txt")
76
+ s3_path = File.join(File.dirname(__FILE__), "utf-8", "stage_3.txt")
77
+ table = Sanzang::TranslationTable.new(table_path)
78
+ translator = Sanzang::Translator.new(table)
79
+ translator.translate_io(s2_path, s3_path)
80
+ end
81
+
82
+ def test_translator_parallel
83
+ table = Sanzang::TranslationTable.new(table_string())
84
+ translator = Sanzang::Translator.new(table)
85
+ translator.runs_parallel?
86
+ assert(translator.processor_count > 0, "Processor count less than zero")
87
+ end
88
+
89
+ def test_translate_batch
90
+ table = Sanzang::TranslationTable.new(table_string())
91
+ translator = Sanzang::Translator.new(table)
92
+ translator.translate_batch(
93
+ Dir.glob(File.join(File.dirname(__FILE__), "utf-8", "file_*.txt")),
94
+ File.join(File.dirname(__FILE__), "utf-8", "batch"), false)
95
+ end
96
+
97
+ end
@@ -0,0 +1,8 @@
1
+ [1.1]     大唐三藏法師玄奘奉
2
+ [1.2]      dà táng sānzàng fǎshī xuánzàng fèng
3
+ [1.3]      great tang tripiṭaka dharma-master xuanzang reverently
4
+
5
+ [2.1]  詔譯
6
+ [2.2]   zhào yì
7
+ [2.3]   imperial-order translate/interpret
8
+
@@ -0,0 +1,8 @@
1
+ [1.1]     大唐三藏法師玄奘奉
2
+ [1.2]      dà táng sānzàng fǎshī xuánzàng fèng
3
+ [1.3]      great tang tripiṭaka dharma-master xuanzang reverently
4
+
5
+ [2.1]  詔譯
6
+ [2.2]   zhào yì
7
+ [2.3]   imperial-order translate/interpret
8
+
@@ -0,0 +1,8 @@
1
+ [1.1]     大唐三藏法師玄奘奉
2
+ [1.2]      dà táng sānzàng fǎshī xuánzàng fèng
3
+ [1.3]      great tang tripiṭaka dharma-master xuanzang reverently
4
+
5
+ [2.1]  詔譯
6
+ [2.2]   zhào yì
7
+ [2.3]   imperial-order translate/interpret
8
+
@@ -0,0 +1,8 @@
1
+ [1.1]     大唐三藏法師玄奘奉
2
+ [1.2]      dà táng sānzàng fǎshī xuánzàng fèng
3
+ [1.3]      great tang tripiṭaka dharma-master xuanzang reverently
4
+
5
+ [2.1]  詔譯
6
+ [2.2]   zhào yì
7
+ [2.3]   imperial-order translate/interpret
8
+
@@ -0,0 +1,2 @@
1
+     大唐三藏法師玄奘奉
2
+  詔譯
@@ -0,0 +1,2 @@
1
+     大唐三藏法師玄奘奉
2
+  詔譯
@@ -0,0 +1,2 @@
1
+     大唐三藏法師玄奘奉
2
+  詔譯
@@ -0,0 +1,2 @@
1
+     大唐三藏法師玄奘奉
2
+  詔譯
@@ -0,0 +1 @@
1
+ T31n1586_p0060a19(00)║    大唐三藏法師玄奘奉 詔譯
@@ -0,0 +1,2 @@
1
+     大唐三藏法師玄奘奉
2
+  詔譯
@@ -0,0 +1,4 @@
1
+ [1.1]     大唐三藏法師玄奘奉
2
+
3
+ [2.1]  詔譯
4
+
@@ -0,0 +1,8 @@
1
+ 三藏| sānzàng| tripiṭaka
2
+ 法師| fǎshī| dharma-master
3
+ 玄奘| xuánzàng| xuanzang
4
+ 奉| fèng| reverently
5
+ 唐| táng| tang
6
+ 大| dà| great
7
+ 詔| zhào| imperial-order
8
+ 譯| yì| translate/interpret
metadata ADDED
@@ -0,0 +1,102 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sanzang
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Lapis Lazuli Texts
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-10-02 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: parallel
16
+ requirement: &81673770 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 0.5.18
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *81673770
25
+ description: Sanzang is an application built for direct machine translation of natural
26
+ languages. This application is particularly suitable as a translation aid for
27
+ ancient Chinese texts. Sanzang uses simple direct translation rules organized into
28
+ translation tables, which are stored in a straightforward text format. Batch translations
29
+ utilize multiprocessing to translate files in parallel, naturally scaling to the
30
+ number of processors available. Sanzang is available under the GNU General Public
31
+ License, version 3.
32
+ email:
33
+ - lapislazulitexts@gmail.com
34
+ executables:
35
+ - sanzang-reflow
36
+ - sanzang-translate
37
+ extensions: []
38
+ extra_rdoc_files:
39
+ - HACKING
40
+ - LICENSE
41
+ - README
42
+ files:
43
+ - bin/sanzang-reflow
44
+ - bin/sanzang-translate
45
+ - test/tc_commands.rb
46
+ - test/utf-8/batch/file_4.txt
47
+ - test/utf-8/batch/file_3.txt
48
+ - test/utf-8/batch/file_1.txt
49
+ - test/utf-8/batch/file_2.txt
50
+ - test/utf-8/stage_3.txt
51
+ - test/utf-8/file_4.txt
52
+ - test/utf-8/file_3.txt
53
+ - test/utf-8/file_1.txt
54
+ - test/utf-8/stage_1.txt
55
+ - test/utf-8/table.txt
56
+ - test/utf-8/file_2.txt
57
+ - test/utf-8/stage_2.txt
58
+ - test/tc_reflow_encodings.rb
59
+ - test/tc_simple_translation.rb
60
+ - lib/sanzang.rb
61
+ - lib/sanzang/text_formatter.rb
62
+ - lib/sanzang/translation_table.rb
63
+ - lib/sanzang/version.rb
64
+ - lib/sanzang/command/reflow.rb
65
+ - lib/sanzang/command/translate.rb
66
+ - lib/sanzang/translator.rb
67
+ - HACKING
68
+ - LICENSE
69
+ - README
70
+ homepage: http://www.lapislazulitexts.com
71
+ licenses:
72
+ - GPL-3
73
+ post_install_message:
74
+ rdoc_options:
75
+ - --main
76
+ - README
77
+ - --title
78
+ - Sanzang
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ none: false
83
+ requirements:
84
+ - - ! '>='
85
+ - !ruby/object:Gem::Version
86
+ version: 1.9.0
87
+ required_rubygems_version: !ruby/object:Gem::Requirement
88
+ none: false
89
+ requirements:
90
+ - - ! '>='
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ requirements: []
94
+ rubyforge_project:
95
+ rubygems_version: 1.8.11
96
+ signing_key:
97
+ specification_version: 3
98
+ summary: Sanzang
99
+ test_files:
100
+ - test/tc_commands.rb
101
+ - test/tc_reflow_encodings.rb
102
+ - test/tc_simple_translation.rb