sanzang 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+ #--
4
+ # Copyright (C) 2012 Lapis Lazuli Texts
5
+ #
6
+ # This program is free software: you can redistribute it and/or modify it under
7
+ # the terms of the GNU General Public License as published by the Free Software
8
+ # Foundation, either version 3 of the License, or (at your option) any later
9
+ # version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful, but WITHOUT
12
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13
+ # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14
+ # details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License along with
17
+ # this program. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ module Sanzang
20
+
21
+ # The current version number of Sanzang.
22
+ VERSION = "0.0.1"
23
+
24
+ end
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+
4
+ require "test/unit"
5
+
6
+ require_relative File.join("..", "lib", "sanzang")
7
+
8
+ class TestCommands < Test::Unit::TestCase
9
+
10
+ def run_reflow(args)
11
+ end
12
+
13
+ def run_translate(args)
14
+ end
15
+
16
+
17
+ end
@@ -0,0 +1,98 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+
4
+ require "test/unit"
5
+
6
+ require_relative File.join("..", "lib", "sanzang")
7
+
8
+ # Test "reflow" operation with all major encodings for conversion and accuracy.
9
+ #
10
+ # Most encodings deemed as "important" here are Unicode encodings and those
11
+ # commonly used for Chinese. Some encodings do not function due to converters
12
+ # for these encodings being unimplemented in Ruby 1.9. Such encodings include
13
+ # the following:
14
+ #
15
+ # * EUC-TW (Traditional Chinese)
16
+ #
17
+ class TestReflowEncodings < Test::Unit::TestCase
18
+
19
+ # Han characters, traditional, including a CBETA-style margin, which should
20
+ # be automatically stripped out by the text formatter.
21
+ #
22
+ def reflow_zh_hant(encoding)
23
+ text_s1 = "T31n1586_p0060a19(00)║    大唐三藏法師玄奘奉 詔譯"
24
+ text_s2 = "    大唐三藏法師玄奘奉\n 詔譯\n \n"
25
+ text_s1.encode!(encoding)
26
+ text_s2.encode!(encoding)
27
+ formatter = Sanzang::TextFormatter.new
28
+ assert_equal(text_s2, formatter.reflow_cjk_text(text_s1))
29
+ end
30
+
31
+ # Han characters, simplified and without double vertical bar. The margin
32
+ # was dropped from the text due to GB2312 not supporting the "double bar"
33
+ # (U+2551) character.
34
+ #
35
+ def reflow_zh_hans(encoding)
36
+ text_s1 = "    大唐三藏法师玄奘奉 诏译"
37
+ text_s2 = "    大唐三藏法师玄奘奉\n 诏译\n \n"
38
+ text_s1.encode!(encoding)
39
+ text_s2.encode!(encoding)
40
+ formatter = Sanzang::TextFormatter.new
41
+ assert_equal(text_s2, formatter.reflow_cjk_text(text_s1))
42
+ end
43
+
44
+ # UTF-8 (Traditional Chinese)
45
+ #
46
+ def test_reflow_hanzi_utf_8
47
+ reflow_zh_hant("UTF-8")
48
+ end
49
+
50
+ # UTF-16LE (Traditional Chinese)
51
+ #
52
+ def test_reflow_hanzi_utf_16le
53
+ reflow_zh_hant("UTF-16LE")
54
+ end
55
+
56
+ # UTF-16BE (Traditional Chinese)
57
+ #
58
+ def test_reflow_hanzi_utf_16be
59
+ reflow_zh_hant("UTF-16BE")
60
+ end
61
+
62
+ # UTF-32LE (Traditional Chinese)
63
+ #
64
+ def test_reflow_hanzi_utf_32le
65
+ reflow_zh_hant("UTF-32LE")
66
+ end
67
+
68
+ # UTF-32BE (Traditional Chinese)
69
+ #
70
+ def test_reflow_hanzi_utf_32be
71
+ reflow_zh_hant("UTF-32BE")
72
+ end
73
+
74
+ # Big5 (Traditional Chinese)
75
+ #
76
+ def test_reflow_hanzi_big5
77
+ reflow_zh_hant("Big5")
78
+ end
79
+
80
+ # GB2312 (Simplified Chinese)
81
+ # Double vertical bar glyph (U+2551) is not present in GB2312
82
+ #
83
+ def test_reflow_hanzi_gb2312
84
+ reflow_zh_hans("GB2312")
85
+ end
86
+
87
+ # GBK (Traditional Chinese)
88
+ #
89
+ def test_reflow_hanzi_gbk
90
+ reflow_zh_hant("GBK")
91
+ end
92
+
93
+ # GB18030 (Traditional Chinese)
94
+ #
95
+ def test_reflow_hanzi_gb18030
96
+ reflow_zh_hant("GB18030")
97
+ end
98
+ end
@@ -0,0 +1,97 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- encoding: UTF-8 -*-
3
+
4
+ require "test/unit"
5
+
6
+ require_relative File.join("..", "lib", "sanzang")
7
+
8
+ # assert_nothing_raised
9
+ # assert_equal(x, y)
10
+ # assert(stmt, "Error message")
11
+ #
12
+ class TestSanzang < Test::Unit::TestCase
13
+
14
+ def table_string
15
+ "~|三藏| sānzàng| tripiṭaka|~
16
+ ~|法師| fǎshī| dharma-master|~
17
+ ~|玄奘| xuánzàng| xuanzang|~
18
+ ~|奉| fèng| reverently|~
19
+ ~|唐| táng| tang|~
20
+ ~|大| dà| great|~
21
+ ~|詔| zhào| imperial-order|~
22
+ ~|譯| yì| translate/interpret|~"
23
+ end
24
+
25
+ def stage_1
26
+ "T31n1586_p0060a19(00)║    大唐三藏法師玄奘奉 詔譯\r\n"
27
+ end
28
+
29
+ def stage_2
30
+ "    大唐三藏法師玄奘奉\r\n 詔譯\r\n"
31
+ end
32
+
33
+ def stage_3
34
+ "[1.1]     大唐三藏法師玄奘奉\r\n" \
35
+ << "[1.2]      dà táng sānzàng fǎshī xuánzàng fèng\r\n" \
36
+ << "[1.3]      great tang tripiṭaka dharma-master xuanzang " \
37
+ << "reverently\r\n" \
38
+ << "\r\n" \
39
+ << "[2.1]  詔譯\r\n" \
40
+ << "[2.2]   zhào yì\r\n" \
41
+ << "[2.3]   imperial-order translate/interpret\r\n" \
42
+ << "\r\n"
43
+ end
44
+
45
+ def test_translation_table
46
+ table_path = File.join(File.dirname(__FILE__), "utf-8", "table.txt")
47
+ fin = File.open(table_path, "rb", encoding: "UTF-8")
48
+ table = Sanzang::TranslationTable.new(fin)
49
+ fin.close
50
+ assert(table.width.class == Fixnum, "Table width undefined")
51
+ assert(table.length.class == Fixnum, "Table length undefined")
52
+ assert(table.records.class == Array, "Table contents not an array")
53
+ rec0_length = table.records[0].length
54
+ table.records.each do |rec|
55
+ assert(rec.class == Array, "Malformed table records")
56
+ assert(rec.length == rec0_length, "Inconsistent table records")
57
+ end
58
+ assert(table.width > 0, "Zero-width table")
59
+ assert(table.length > 0, "Zero-length table")
60
+ end
61
+
62
+ def test_reflow_cjk_string
63
+ text = Sanzang::TextFormatter.new.reflow_cjk_text(stage_1())
64
+ assert_equal(stage_2(), text)
65
+ end
66
+
67
+ def test_translate_string
68
+ table = Sanzang::TranslationTable.new(table_string())
69
+ text = Sanzang::Translator.new(table).gen_listing(stage_2())
70
+ assert_equal(stage_3(), text)
71
+ end
72
+
73
+ def test_translate_file
74
+ table_path = File.join(File.dirname(__FILE__), "utf-8", "table.txt")
75
+ s2_path = File.join(File.dirname(__FILE__), "utf-8", "stage_2.txt")
76
+ s3_path = File.join(File.dirname(__FILE__), "utf-8", "stage_3.txt")
77
+ table = Sanzang::TranslationTable.new(table_path)
78
+ translator = Sanzang::Translator.new(table)
79
+ translator.translate_io(s2_path, s3_path)
80
+ end
81
+
82
+ def test_translator_parallel
83
+ table = Sanzang::TranslationTable.new(table_string())
84
+ translator = Sanzang::Translator.new(table)
85
+ translator.runs_parallel?
86
+ assert(translator.processor_count > 0, "Processor count less than zero")
87
+ end
88
+
89
+ def test_translate_batch
90
+ table = Sanzang::TranslationTable.new(table_string())
91
+ translator = Sanzang::Translator.new(table)
92
+ translator.translate_batch(
93
+ Dir.glob(File.join(File.dirname(__FILE__), "utf-8", "file_*.txt")),
94
+ File.join(File.dirname(__FILE__), "utf-8", "batch"), false)
95
+ end
96
+
97
+ end
@@ -0,0 +1,8 @@
1
+ [1.1]     大唐三藏法師玄奘奉
2
+ [1.2]      dà táng sānzàng fǎshī xuánzàng fèng
3
+ [1.3]      great tang tripiṭaka dharma-master xuanzang reverently
4
+
5
+ [2.1]  詔譯
6
+ [2.2]   zhào yì
7
+ [2.3]   imperial-order translate/interpret
8
+
@@ -0,0 +1,8 @@
1
+ [1.1]     大唐三藏法師玄奘奉
2
+ [1.2]      dà táng sānzàng fǎshī xuánzàng fèng
3
+ [1.3]      great tang tripiṭaka dharma-master xuanzang reverently
4
+
5
+ [2.1]  詔譯
6
+ [2.2]   zhào yì
7
+ [2.3]   imperial-order translate/interpret
8
+
@@ -0,0 +1,8 @@
1
+ [1.1]     大唐三藏法師玄奘奉
2
+ [1.2]      dà táng sānzàng fǎshī xuánzàng fèng
3
+ [1.3]      great tang tripiṭaka dharma-master xuanzang reverently
4
+
5
+ [2.1]  詔譯
6
+ [2.2]   zhào yì
7
+ [2.3]   imperial-order translate/interpret
8
+
@@ -0,0 +1,8 @@
1
+ [1.1]     大唐三藏法師玄奘奉
2
+ [1.2]      dà táng sānzàng fǎshī xuánzàng fèng
3
+ [1.3]      great tang tripiṭaka dharma-master xuanzang reverently
4
+
5
+ [2.1]  詔譯
6
+ [2.2]   zhào yì
7
+ [2.3]   imperial-order translate/interpret
8
+
@@ -0,0 +1,2 @@
1
+     大唐三藏法師玄奘奉
2
+  詔譯
@@ -0,0 +1,2 @@
1
+     大唐三藏法師玄奘奉
2
+  詔譯
@@ -0,0 +1,2 @@
1
+     大唐三藏法師玄奘奉
2
+  詔譯
@@ -0,0 +1,2 @@
1
+     大唐三藏法師玄奘奉
2
+  詔譯
@@ -0,0 +1 @@
1
+ T31n1586_p0060a19(00)║    大唐三藏法師玄奘奉 詔譯
@@ -0,0 +1,2 @@
1
+     大唐三藏法師玄奘奉
2
+  詔譯
@@ -0,0 +1,4 @@
1
+ [1.1]     大唐三藏法師玄奘奉
2
+
3
+ [2.1]  詔譯
4
+
@@ -0,0 +1,8 @@
1
+ 三藏| sānzàng| tripiṭaka
2
+ 法師| fǎshī| dharma-master
3
+ 玄奘| xuánzàng| xuanzang
4
+ 奉| fèng| reverently
5
+ 唐| táng| tang
6
+ 大| dà| great
7
+ 詔| zhào| imperial-order
8
+ 譯| yì| translate/interpret
metadata ADDED
@@ -0,0 +1,102 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sanzang
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Lapis Lazuli Texts
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-10-02 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: parallel
16
+ requirement: &81673770 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 0.5.18
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *81673770
25
+ description: Sanzang is an application built for direct machine translation of natural
26
+ languages. This application is particularly suitable as a translation aid for
27
+ ancient Chinese texts. Sanzang uses simple direct translation rules organized into
28
+ translation tables, which are stored in a straightforward text format. Batch translations
29
+ utilize multiprocessing to translate files in parallel, naturally scaling to the
30
+ number of processors available. Sanzang is available under the GNU General Public
31
+ License, version 3.
32
+ email:
33
+ - lapislazulitexts@gmail.com
34
+ executables:
35
+ - sanzang-reflow
36
+ - sanzang-translate
37
+ extensions: []
38
+ extra_rdoc_files:
39
+ - HACKING
40
+ - LICENSE
41
+ - README
42
+ files:
43
+ - bin/sanzang-reflow
44
+ - bin/sanzang-translate
45
+ - test/tc_commands.rb
46
+ - test/utf-8/batch/file_4.txt
47
+ - test/utf-8/batch/file_3.txt
48
+ - test/utf-8/batch/file_1.txt
49
+ - test/utf-8/batch/file_2.txt
50
+ - test/utf-8/stage_3.txt
51
+ - test/utf-8/file_4.txt
52
+ - test/utf-8/file_3.txt
53
+ - test/utf-8/file_1.txt
54
+ - test/utf-8/stage_1.txt
55
+ - test/utf-8/table.txt
56
+ - test/utf-8/file_2.txt
57
+ - test/utf-8/stage_2.txt
58
+ - test/tc_reflow_encodings.rb
59
+ - test/tc_simple_translation.rb
60
+ - lib/sanzang.rb
61
+ - lib/sanzang/text_formatter.rb
62
+ - lib/sanzang/translation_table.rb
63
+ - lib/sanzang/version.rb
64
+ - lib/sanzang/command/reflow.rb
65
+ - lib/sanzang/command/translate.rb
66
+ - lib/sanzang/translator.rb
67
+ - HACKING
68
+ - LICENSE
69
+ - README
70
+ homepage: http://www.lapislazulitexts.com
71
+ licenses:
72
+ - GPL-3
73
+ post_install_message:
74
+ rdoc_options:
75
+ - --main
76
+ - README
77
+ - --title
78
+ - Sanzang
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ none: false
83
+ requirements:
84
+ - - ! '>='
85
+ - !ruby/object:Gem::Version
86
+ version: 1.9.0
87
+ required_rubygems_version: !ruby/object:Gem::Requirement
88
+ none: false
89
+ requirements:
90
+ - - ! '>='
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ requirements: []
94
+ rubyforge_project:
95
+ rubygems_version: 1.8.11
96
+ signing_key:
97
+ specification_version: 3
98
+ summary: Sanzang
99
+ test_files:
100
+ - test/tc_commands.rb
101
+ - test/tc_reflow_encodings.rb
102
+ - test/tc_simple_translation.rb