sanzang 1.1.2 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ed9732f3291f8bebe17d1abc1cceb68765aed70e
4
- data.tar.gz: c3560faf30781e7e30b76ef8b6c41234725606e9
3
+ metadata.gz: a5bd5aafaee153f9753cdeedf952c8581eaf770b
4
+ data.tar.gz: 111fc689cb0f19ca0dd20356f50d7483e6a51420
5
5
  SHA512:
6
- metadata.gz: 64942f119d7618ba75ed87e0e47911b0c8179445d4bdd2425b6b85665e6c5967a32a57492adba2ad3c5a9a45423dbec5540f74b3440bbce182c43408758d3b07
7
- data.tar.gz: 85624b92a30b276e0148b3d7a1449fe6b987be2fb70b1ffc1eebb1f59b0e2d1d200c07731efdb38168304e1c2d494ba2d7d54c254b8b2a866bda36ad924983d1
6
+ metadata.gz: 17a663b61ef1523e24d76d7d37a65c56ff8d73c2bbb43e3aa226437dbc4b224a96b7ebaa02370bd8054490c2d57885d24f6be5844f6767fa59565705c3c37fa4
7
+ data.tar.gz: a453e356504a8c334cefcf42455769bcb79bba82af4a48138780a3bbd3a086c474c3fa37659c3a7c33ca38f30c78e5f8237a9caf2f8cc316d95374b71bf31920
data/NEWS.rdoc CHANGED
@@ -2,6 +2,12 @@
2
2
 
3
3
  == Release History
4
4
 
5
+ === v1.2.0
6
+ * Updated vocab building code for faster term matching
7
+ * Refactored the TextFormatter class into a Formatting module
8
+ * Added methods for merging translation tables into one another
9
+ * Consolidated Sanzang module "requires" into a central location
10
+
5
11
  === v1.1.2
6
12
  * Cleaned up table loading code to be faster and simpler
7
13
  * Added RDoc option to set documentation encoding to UTF-8 (RDoc 3.x)
@@ -40,6 +40,6 @@ command to verify your installation and print version information.
40
40
 
41
41
  This command should show a summary of your \Sanzang version and environment.
42
42
 
43
- sanzang 1.1.1 (UTF-8) ruby-2.1.0p0 x86_64-linux
43
+ sanzang 1.2.0 ruby-2.1.1 x86_64-linux (UTF-8)
44
44
 
45
45
  You now have \Sanzang installed on your computer.
@@ -1,6 +1,6 @@
1
1
  # coding: UTF-8
2
2
  #--
3
- # Copyright (C) 2012-2013 Lapis Lazuli Texts
3
+ # Copyright (C) 2012-2014 Lapis Lazuli Texts
4
4
  #
5
5
  # This program is free software: you can redistribute it and/or modify it under
6
6
  # the terms of the GNU General Public License as published by the Free Software
@@ -22,11 +22,11 @@
22
22
  module Sanzang
23
23
  end
24
24
 
25
- require_relative File.join("sanzang", "batch_translator")
26
25
  require_relative File.join("sanzang", "platform")
27
- require_relative File.join("sanzang", "text_formatter")
26
+ require_relative File.join("sanzang", "formatting")
28
27
  require_relative File.join("sanzang", "translation_table")
29
28
  require_relative File.join("sanzang", "translator")
29
+ require_relative File.join("sanzang", "batch_translator")
30
30
  require_relative File.join("sanzang", "version")
31
31
 
32
32
  # The Sanzang::Command module contains Unix style commands utilizing the
@@ -15,11 +15,6 @@
15
15
  # You should have received a copy of the GNU General Public License along with
16
16
  # this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "parallel"
19
-
20
- require_relative "platform"
21
- require_relative "translator"
22
-
23
18
  module Sanzang
24
19
 
25
20
  # BatchTranslator can handle batches of files for translation, and may also
@@ -15,12 +15,8 @@
15
15
  # You should have received a copy of the GNU General Public License along with
16
16
  # this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "optparse"
19
-
20
- require_relative File.join("..", "platform")
21
- require_relative File.join("..", "translation_table")
22
- require_relative File.join("..", "batch_translator")
23
- require_relative File.join("..", "version")
18
+ require 'optparse'
19
+ require 'parallel'
24
20
 
25
21
  module Sanzang::Command
26
22
 
@@ -15,11 +15,7 @@
15
15
  # You should have received a copy of the GNU General Public License along with
16
16
  # this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "optparse"
19
-
20
- require_relative File.join("..", "platform")
21
- require_relative File.join("..", "text_formatter")
22
- require_relative File.join("..", "version")
18
+ require 'optparse'
23
19
 
24
20
  module Sanzang::Command
25
21
 
@@ -61,7 +57,7 @@ module Sanzang::Command
61
57
  fin.binmode.set_encoding(@encoding)
62
58
  fout = @outfile ? File.open(@outfile, "w") : $stdout
63
59
  fout.binmode.set_encoding(@encoding)
64
- fout.write(Sanzang::TextFormatter.new.reflow_cjk(fin.read))
60
+ fout.write(Sanzang::Formatting.reflow_cjk(fin.read))
65
61
  ensure
66
62
  if defined?(fin) and fin.class == File
67
63
  fin.close if not fin.closed?
@@ -15,15 +15,7 @@
15
15
  # You should have received a copy of the GNU General Public License along with
16
16
  # this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "optparse"
19
- require "parallel"
20
-
21
- require_relative "reflow"
22
- require_relative "translate"
23
- require_relative "batch"
24
-
25
- require_relative File.join("..", "platform")
26
- require_relative File.join("..", "version")
18
+ require 'optparse'
27
19
 
28
20
  module Sanzang::Command
29
21
 
@@ -80,6 +72,8 @@ module Sanzang::Command
80
72
  # A string giving a listing of platform information
81
73
  #
82
74
  def platform_info
75
+ require 'parallel'
76
+
83
77
  info = "host_arch = #{Sanzang::Platform.machine_arch}\n"
84
78
  info << "host_os = #{Sanzang::Platform.os_name}\n"
85
79
  info << "host_processors = #{Sanzang::Platform.processor_count}\n"
@@ -96,8 +90,9 @@ module Sanzang::Command
96
90
  # This is a string giving a brief one-line summary of version information
97
91
  #
98
92
  def version_info
99
- "sanzang #{Sanzang::VERSION} (#{Sanzang::Platform.data_encoding})" \
100
- + " ruby-#{RUBY_VERSION}p#{RUBY_PATCHLEVEL} #{RUBY_PLATFORM}"
93
+ "sanzang #{Sanzang::VERSION} ruby-#{RUBY_VERSION} #{RUBY_PLATFORM} " \
94
+ + "(#{Sanzang::Platform.data_encoding})"
95
+ # "sanzang #{Sanzang::VERSION} #{Sanzang::Platform.data_encoding}"
101
96
  end
102
97
 
103
98
  # Name of the command
@@ -15,12 +15,7 @@
15
15
  # You should have received a copy of the GNU General Public License along with
16
16
  # this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "optparse"
19
-
20
- require_relative File.join("..", "platform")
21
- require_relative File.join("..", "translation_table")
22
- require_relative File.join("..", "translator")
23
- require_relative File.join("..", "version")
18
+ require 'optparse'
24
19
 
25
20
  module Sanzang::Command
26
21
 
@@ -1,6 +1,6 @@
1
1
  # coding: UTF-8
2
2
  #--
3
- # Copyright (C) 2012-2013 Lapis Lazuli Texts
3
+ # Copyright (C) 2012-2014 Lapis Lazuli Texts
4
4
  #
5
5
  # This program is free software: you can redistribute it and/or modify it under
6
6
  # the terms of the GNU General Public License as published by the Free Software
@@ -15,16 +15,14 @@
15
15
  # You should have received a copy of the GNU General Public License along with
16
16
  # this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- module Sanzang
19
-
20
- # This class handles formatting of text data especially to prepare the text
21
- # for direct translation. This involves reformatting and reflowing text so
22
- # that words are not divided between lines, and so the output is well suited
23
- # for humans. For practical purposes of readability, lines of text to be
24
- # translated should be succinct and easily comprehensible. The TextFormatter
25
- # class includes methods for accomplishing this reformatting.
26
- #
27
- class TextFormatter
18
+ # This module handles formatting of text data especially to prepare the text
19
+ # for direct translation. This involves reformatting and reflowing text so
20
+ # that words are not divided between lines, and so the output is well suited
21
+ # for humans. For practical purposes of readability, lines of text to be
22
+ # translated should be succinct and easily comprehensible.
23
+ #
24
+ module Sanzang::Formatting
25
+ class << self
28
26
 
29
27
  # Given a CJK string of text, reformat the string for greater compatibility
30
28
  # with direct translation, and reflow the text based on its punctuation.
@@ -17,9 +17,8 @@
17
17
 
18
18
  module Sanzang
19
19
 
20
- # A translation table encapsulates a set of rules for translating with
21
- # the \Sanzang system. These are essentially read-only objects meant for
22
- # storing well-defined translation table data.
20
+ # A translation table encapsulates a set of rules for translating with the
21
+ # \Sanzang system.
23
22
  #
24
23
  class TranslationTable
25
24
 
@@ -39,6 +38,8 @@ module Sanzang
39
38
  # - Subsequent fields are equivalent terms in destination languages.
40
39
  # - The number of columns must be consistent for the entire table.
41
40
  #
41
+ # The rules passed in here may be either a string or an IO-like object that responds to #read (such as an open File).
42
+ #
42
43
  def initialize(rules)
43
44
  contents = rules.kind_of?(String) ? rules : rules.read
44
45
  @source_encoding = contents.encoding
@@ -51,8 +52,9 @@ module Sanzang
51
52
  end
52
53
 
53
54
  @records = contents.strip.split("\n").collect {|r| r.strip.split("|") }
55
+ @sorted = false
54
56
  check_dims
55
- sort!
57
+ #sort!
56
58
  end
57
59
 
58
60
  # Retrieve a record by its numeric index.
@@ -76,10 +78,18 @@ module Sanzang
76
78
  end
77
79
  end
78
80
 
81
+ # Check if the table records are sorted
82
+ #
83
+ def sorted?
84
+ @sorted
85
+ end
86
+
79
87
  # Sort all records in descending order of source term length
80
88
  #
81
89
  def sort!
82
90
  @records.sort! {|x,y| y[0].size <=> x[0].size }
91
+ @sorted = true
92
+ nil
83
93
  end
84
94
 
85
95
  # The text encoding used internally for all translation table data
@@ -94,6 +104,46 @@ module Sanzang
94
104
  @records.find {|rec| rec[0] == term }
95
105
  end
96
106
 
107
+ # Convert to a hash. The original records are the values.
108
+ #
109
+ # For example: "A" => ["A", "B", "C"]
110
+ #
111
+ def to_h
112
+ h = Hash.new
113
+ @records.each {|rec| h[rec[0]] = rec if not h[rec[0]] }
114
+ h
115
+ end
116
+
117
+ # Remove records with duplicate source terms, keeping the first record for each term. The resulting table is marked as unsorted.
118
+ #
119
+ def uniq!
120
+ @records = to_h.values
121
+ @sorted = false
122
+ nil
123
+ end
124
+
125
+ # Merge another table into this one. If the same source term exists in
126
+ # both tables, then the record from the other table will be used instead.
127
+ # Note: after a merge, the resulting table is unsorted.
128
+ #
129
+ def merge!(tab2)
130
+ if tab2.width != width
131
+ raise "Table widths must match when merging tables"
132
+ end
133
+ h1 = to_h
134
+ tab2.records.each do |rec|
135
+ h1[rec[0]] = rec
136
+ end
137
+ @records = h1.values
138
+ @sorted = false
139
+ end
140
+
141
+ # Return the table as a string of pipe-delimited ("|") records, one record per line
142
+ #
143
+ def to_csv
144
+ @records.map {|r| r.join("|") }.join("\n")
145
+ end
146
+
97
147
  # The number of records in the table
98
148
  #
99
149
  def length
@@ -37,13 +37,10 @@ module Sanzang
37
37
  # These records represent the vocabulary used by the text.
38
38
  #
39
39
  def text_vocab(source_text)
40
- new_table = []
41
- @table.records.each do |record|
42
- if source_text.include?(record[0])
43
- new_table << record
44
- end
40
+ text_copy = String.new(source_text)
41
+ @table.records.select do |r|
42
+ text_copy.include?(r[0]) ? text_copy.gsub!(r[0], "\x1F") : false
45
43
  end
46
- new_table
47
44
  end
48
45
 
49
46
  # Use the TranslationTable of the Translator to create translations for
@@ -80,7 +77,7 @@ module Sanzang
80
77
 
81
78
  listing = ""
82
79
  texts[0].length.times do |line_i|
83
- @table.width.times do |col_i|
80
+ texts.length.times do |col_i|
84
81
  listing << "[#{pos + line_i}.#{col_i + 1}] #{texts[col_i][line_i]}" \
85
82
  << newline
86
83
  end
@@ -19,6 +19,6 @@ module Sanzang
19
19
 
20
20
  # Current version number of Sanzang
21
21
  #
22
- VERSION = "1.1.2"
22
+ VERSION = "1.2.0"
23
23
 
24
24
  end
@@ -23,8 +23,7 @@ class TestReflowEncodings < Test::Unit::TestCase
23
23
  text_s2 = "    大唐三藏法師玄奘奉\n 詔譯\n \n"
24
24
  text_s1.encode!(encoding)
25
25
  text_s2.encode!(encoding)
26
- formatter = Sanzang::TextFormatter.new
27
- assert_equal(text_s2, formatter.reflow_cjk(text_s1))
26
+ assert_equal(text_s2, Sanzang::Formatting.reflow_cjk(text_s1))
28
27
  end
29
28
 
30
29
  # Han characters, simplified and without double vertical bar. The margin
@@ -36,8 +35,7 @@ class TestReflowEncodings < Test::Unit::TestCase
36
35
  text_s2 = "    大唐三藏法师玄奘奉\n 诏译\n \n"
37
36
  text_s1.encode!(encoding)
38
37
  text_s2.encode!(encoding)
39
- formatter = Sanzang::TextFormatter.new
40
- assert_equal(text_s2, formatter.reflow_cjk(text_s1))
38
+ assert_equal(text_s2, Sanzang::Formatting.reflow_cjk(text_s1))
41
39
  end
42
40
 
43
41
  # UTF-8 (Traditional Chinese)
@@ -55,7 +55,7 @@ class TestSanzang < Test::Unit::TestCase
55
55
  end
56
56
 
57
57
  def test_reflow_cjk_string
58
- text = Sanzang::TextFormatter.new.reflow_cjk(stage_1())
58
+ text = Sanzang::Formatting.reflow_cjk(stage_1())
59
59
  assert_equal(stage_2(), text)
60
60
  end
61
61
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sanzang
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lapis Lazuli Texts
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-13 00:00:00.000000000 Z
11
+ date: 2014-03-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: parallel
@@ -52,8 +52,8 @@ files:
52
52
  - lib/sanzang/command/reflow.rb
53
53
  - lib/sanzang/command/sanzang_cmd.rb
54
54
  - lib/sanzang/command/translate.rb
55
+ - lib/sanzang/formatting.rb
55
56
  - lib/sanzang/platform.rb
56
- - lib/sanzang/text_formatter.rb
57
57
  - lib/sanzang/translation_table.rb
58
58
  - lib/sanzang/translator.rb
59
59
  - lib/sanzang/version.rb
@@ -90,7 +90,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
90
90
  requirements:
91
91
  - parallel ~> 0.8
92
92
  rubyforge_project:
93
- rubygems_version: 2.2.0
93
+ rubygems_version: 2.2.2
94
94
  signing_key:
95
95
  specification_version: 4
96
96
  summary: Machine translation from CJK languages