sanzang 1.0.4 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/sanzang +2 -2
- data/lib/sanzang.rb +1 -2
- data/lib/sanzang/batch_translator.rb +9 -7
- data/lib/sanzang/command/batch.rb +1 -2
- data/lib/sanzang/command/reflow.rb +1 -2
- data/lib/sanzang/command/sanzang_cmd.rb +1 -2
- data/lib/sanzang/command/translate.rb +1 -2
- data/lib/sanzang/platform.rb +1 -2
- data/lib/sanzang/text_formatter.rb +1 -2
- data/lib/sanzang/translation_table.rb +16 -19
- data/lib/sanzang/translator.rb +1 -2
- data/lib/sanzang/version.rb +2 -3
- data/test/tc_reflow_encodings.rb +1 -2
- data/test/tc_simple_translation.rb +1 -2
- metadata +10 -10
data/bin/sanzang
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
#
|
2
|
+
# coding: UTF-8
|
3
3
|
#--
|
4
|
-
# Copyright (C) 2012 Lapis Lazuli Texts
|
4
|
+
# Copyright (C) 2012-2013 Lapis Lazuli Texts
|
5
5
|
#
|
6
6
|
# This program is free software: you can redistribute it and/or modify it under
|
7
7
|
# the terms of the GNU General Public License as published by the Free Software
|
data/lib/sanzang.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
|
2
|
-
# -*- encoding: UTF-8 -*-
|
1
|
+
# coding: UTF-8
|
3
2
|
#--
|
4
3
|
# Copyright (C) 2012-2013 Lapis Lazuli Texts
|
5
4
|
#
|
data/lib/sanzang/batch_translator.rb
CHANGED
@@ -36,12 +35,15 @@ module Sanzang
|
|
36
35
|
# return value is an array containing all the output file paths.
|
37
36
|
#
|
38
37
|
def translate_batch(fpath_pairs, verbose = true, jobs = nil)
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
38
|
+
options = {}
|
39
|
+
if RUBY_PLATFORM =~ /java/
|
40
|
+
options[:in_threads] = jobs || Sanzang::Platform.processor_count
|
41
|
+
elsif Sanzang::Platform.unix_processes?
|
42
|
+
options[:in_processes] = jobs || Sanzang::Platform.processor_count
|
43
|
+
else
|
44
|
+
options[:in_processes] = 0
|
43
45
|
end
|
44
|
-
Parallel.map(fpath_pairs,
|
46
|
+
Parallel.map(fpath_pairs, options) do |f1,f2|
|
45
47
|
translate_io(f1, f2)
|
46
48
|
if verbose
|
47
49
|
$stderr.write "[#{Process.pid}] #{File.expand_path(f2)} \n"
|
data/lib/sanzang/platform.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
|
2
|
-
# -*- encoding: UTF-8 -*-
|
1
|
+
# coding: UTF-8
|
3
2
|
#--
|
4
3
|
# Copyright (C) 2012-2013 Lapis Lazuli Texts
|
5
4
|
#
|
@@ -29,7 +28,7 @@ module Sanzang
|
|
29
28
|
# summarized as follows:
|
30
29
|
#
|
31
30
|
# - Each line of text is a record for a translation rule.
|
32
|
-
# - Each record
|
31
|
+
# - Each record may begin with "~|" and end with "|~".
|
33
32
|
# - Fields in the record are separated by the "|" character.
|
34
33
|
# - The first field contains the term in the source language.
|
35
34
|
# - Subsequent fields are equivalent terms in destination languages.
|
@@ -43,15 +42,11 @@ module Sanzang
|
|
43
42
|
def initialize(rules)
|
44
43
|
contents = rules.kind_of?(String) ? rules : rules.read
|
45
44
|
contents.encode!(Encoding::UTF_8)
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
@records = contents.gsub("\r", "").split("\n").collect do |rec|
|
53
|
-
rec.strip.gsub(left, "").gsub(right, "").split(separator)
|
54
|
-
end
|
45
|
+
contents.strip!
|
46
|
+
contents.gsub!(/^\s*|\s*$|\r/, "")
|
47
|
+
contents.gsub!("~|", "")
|
48
|
+
contents.gsub!("|~", "")
|
49
|
+
@records = contents.split("\n").collect {|r| r.split("|") }
|
55
50
|
|
56
51
|
if @records.length < 1
|
57
52
|
raise "Table must have at least 1 row"
|
@@ -59,9 +54,9 @@ module Sanzang
|
|
59
54
|
raise "Table must have at least 2 columns"
|
60
55
|
end
|
61
56
|
|
62
|
-
|
63
|
-
|
64
|
-
if
|
57
|
+
width = records[0].length
|
58
|
+
@records.each do |r|
|
59
|
+
if r.length != width
|
65
60
|
raise "Column mismatch: Line #{i + 1}"
|
66
61
|
end
|
67
62
|
end
|
@@ -75,6 +70,12 @@ module Sanzang
|
|
75
70
|
@records[index]
|
76
71
|
end
|
77
72
|
|
73
|
+
# The text encoding used for all translation table data
|
74
|
+
#
|
75
|
+
def encoding
|
76
|
+
Encoding::UTF_8
|
77
|
+
end
|
78
|
+
|
78
79
|
# Find a record by the source language term (first column).
|
79
80
|
#
|
80
81
|
def find(term)
|
@@ -97,9 +98,5 @@ module Sanzang
|
|
97
98
|
#
|
98
99
|
attr_reader :records
|
99
100
|
|
100
|
-
# The text encoding used for all translation table data
|
101
|
-
#
|
102
|
-
attr_reader :encoding
|
103
|
-
|
104
101
|
end
|
105
102
|
end
|
data/lib/sanzang/translator.rb
CHANGED
data/lib/sanzang/version.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
|
2
|
-
# -*- encoding: UTF-8 -*-
|
1
|
+
# coding: UTF-8
|
3
2
|
#--
|
4
3
|
# Copyright (C) 2012-2013 Lapis Lazuli Texts
|
5
4
|
#
|
@@ -20,6 +19,6 @@ module Sanzang
|
|
20
19
|
|
21
20
|
# Current version number of Sanzang
|
22
21
|
#
|
23
|
-
VERSION = "1.0.4"
|
22
|
+
VERSION = "1.0.5"
|
24
23
|
|
25
24
|
end
|
data/test/tc_reflow_encodings.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sanzang
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.4
|
4
|
+
version: 1.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,32 +9,31 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-10-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: parallel
|
16
16
|
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
|
-
- -
|
19
|
+
- - ~>
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version: 0.
|
21
|
+
version: '0.8'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
24
|
version_requirements: !ruby/object:Gem::Requirement
|
25
25
|
none: false
|
26
26
|
requirements:
|
27
|
-
- -
|
27
|
+
- - ~>
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
version: 0.
|
29
|
+
version: '0.8'
|
30
30
|
description: Sanzang is a program built for machine translation of natural languages.
|
31
31
|
This application is particularly suitable as a translation aid for CJK languages
|
32
32
|
including ancient texts. The translation method is rule-based, and translation rules
|
33
33
|
are stored in flat files as delimited text. This program can also utilize multiprocessing
|
34
34
|
to naturally scale to multiple processors and processor cores. Sanzang is available
|
35
35
|
under the GNU GPL, version 3.
|
36
|
-
email:
|
37
|
-
- lapislazulitexts@gmail.com
|
36
|
+
email: lapislazulitexts@gmail.com
|
38
37
|
executables:
|
39
38
|
- sanzang
|
40
39
|
extensions: []
|
@@ -90,12 +89,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
90
89
|
- - ! '>='
|
91
90
|
- !ruby/object:Gem::Version
|
92
91
|
version: '0'
|
93
|
-
requirements:
|
92
|
+
requirements:
|
93
|
+
- parallel ~> 0.8
|
94
94
|
rubyforge_project:
|
95
95
|
rubygems_version: 1.8.23
|
96
96
|
signing_key:
|
97
97
|
specification_version: 3
|
98
|
-
summary:
|
98
|
+
summary: Machine translation for CJK languages
|
99
99
|
test_files:
|
100
100
|
- test/tc_reflow_encodings.rb
|
101
101
|
- test/tc_simple_translation.rb
|