sanzang 1.1.2 → 1.2.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ed9732f3291f8bebe17d1abc1cceb68765aed70e
4
- data.tar.gz: c3560faf30781e7e30b76ef8b6c41234725606e9
3
+ metadata.gz: a5bd5aafaee153f9753cdeedf952c8581eaf770b
4
+ data.tar.gz: 111fc689cb0f19ca0dd20356f50d7483e6a51420
5
5
  SHA512:
6
- metadata.gz: 64942f119d7618ba75ed87e0e47911b0c8179445d4bdd2425b6b85665e6c5967a32a57492adba2ad3c5a9a45423dbec5540f74b3440bbce182c43408758d3b07
7
- data.tar.gz: 85624b92a30b276e0148b3d7a1449fe6b987be2fb70b1ffc1eebb1f59b0e2d1d200c07731efdb38168304e1c2d494ba2d7d54c254b8b2a866bda36ad924983d1
6
+ metadata.gz: 17a663b61ef1523e24d76d7d37a65c56ff8d73c2bbb43e3aa226437dbc4b224a96b7ebaa02370bd8054490c2d57885d24f6be5844f6767fa59565705c3c37fa4
7
+ data.tar.gz: a453e356504a8c334cefcf42455769bcb79bba82af4a48138780a3bbd3a086c474c3fa37659c3a7c33ca38f30c78e5f8237a9caf2f8cc316d95374b71bf31920
data/NEWS.rdoc CHANGED
@@ -2,6 +2,12 @@
2
2
 
3
3
  == Release History
4
4
 
5
+ === v1.2.0
6
+ * Updated vocab building code for faster term matching
7
+ * Refactored the TextFormatter class into a Formatting module
8
+ * Added methods for merging translation tables into one another
9
+ * Consolidated Sanzang module "requires" into a central location
10
+
5
11
  === v1.1.2
6
12
  * Cleaned up table loading code to be faster and simpler
7
13
  * Added RDoc option to set documentation encoding to UTF-8 (RDoc 3.x)
@@ -40,6 +40,6 @@ command to verify your installation and print version information.
40
40
 
41
41
  This command should show a summary of your \Sanzang version and environment.
42
42
 
43
- sanzang 1.1.1 (UTF-8) ruby-2.1.0p0 x86_64-linux
43
+ sanzang 1.2.0 ruby-2.1.1 x86_64-linux (UTF-8)
44
44
 
45
45
  You now have \Sanzang installed on your computer.
@@ -1,6 +1,6 @@
1
1
  # coding: UTF-8
2
2
  #--
3
- # Copyright (C) 2012-2013 Lapis Lazuli Texts
3
+ # Copyright (C) 2012-2014 Lapis Lazuli Texts
4
4
  #
5
5
  # This program is free software: you can redistribute it and/or modify it under
6
6
  # the terms of the GNU General Public License as published by the Free Software
@@ -22,11 +22,11 @@
22
22
  module Sanzang
23
23
  end
24
24
 
25
- require_relative File.join("sanzang", "batch_translator")
26
25
  require_relative File.join("sanzang", "platform")
27
- require_relative File.join("sanzang", "text_formatter")
26
+ require_relative File.join("sanzang", "formatting")
28
27
  require_relative File.join("sanzang", "translation_table")
29
28
  require_relative File.join("sanzang", "translator")
29
+ require_relative File.join("sanzang", "batch_translator")
30
30
  require_relative File.join("sanzang", "version")
31
31
 
32
32
  # The Sanzang::Command module contains Unix style commands utilizing the
@@ -15,11 +15,6 @@
15
15
  # You should have received a copy of the GNU General Public License along with
16
16
  # this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "parallel"
19
-
20
- require_relative "platform"
21
- require_relative "translator"
22
-
23
18
  module Sanzang
24
19
 
25
20
  # BatchTranslator can handle batches of files for translation, and may also
@@ -15,12 +15,8 @@
15
15
  # You should have received a copy of the GNU General Public License along with
16
16
  # this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "optparse"
19
-
20
- require_relative File.join("..", "platform")
21
- require_relative File.join("..", "translation_table")
22
- require_relative File.join("..", "batch_translator")
23
- require_relative File.join("..", "version")
18
+ require 'optparse'
19
+ require 'parallel'
24
20
 
25
21
  module Sanzang::Command
26
22
 
@@ -15,11 +15,7 @@
15
15
  # You should have received a copy of the GNU General Public License along with
16
16
  # this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "optparse"
19
-
20
- require_relative File.join("..", "platform")
21
- require_relative File.join("..", "text_formatter")
22
- require_relative File.join("..", "version")
18
+ require 'optparse'
23
19
 
24
20
  module Sanzang::Command
25
21
 
@@ -61,7 +57,7 @@ module Sanzang::Command
61
57
  fin.binmode.set_encoding(@encoding)
62
58
  fout = @outfile ? File.open(@outfile, "w") : $stdout
63
59
  fout.binmode.set_encoding(@encoding)
64
- fout.write(Sanzang::TextFormatter.new.reflow_cjk(fin.read))
60
+ fout.write(Sanzang::Formatting.reflow_cjk(fin.read))
65
61
  ensure
66
62
  if defined?(fin) and fin.class == File
67
63
  fin.close if not fin.closed?
@@ -15,15 +15,7 @@
15
15
  # You should have received a copy of the GNU General Public License along with
16
16
  # this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "optparse"
19
- require "parallel"
20
-
21
- require_relative "reflow"
22
- require_relative "translate"
23
- require_relative "batch"
24
-
25
- require_relative File.join("..", "platform")
26
- require_relative File.join("..", "version")
18
+ require 'optparse'
27
19
 
28
20
  module Sanzang::Command
29
21
 
@@ -80,6 +72,8 @@ module Sanzang::Command
80
72
  # A string giving a listing of platform information
81
73
  #
82
74
  def platform_info
75
+ require 'parallel'
76
+
83
77
  info = "host_arch = #{Sanzang::Platform.machine_arch}\n"
84
78
  info << "host_os = #{Sanzang::Platform.os_name}\n"
85
79
  info << "host_processors = #{Sanzang::Platform.processor_count}\n"
@@ -96,8 +90,9 @@ module Sanzang::Command
96
90
  # This is a string giving a brief one-line summary of version information
97
91
  #
98
92
  def version_info
99
- "sanzang #{Sanzang::VERSION} (#{Sanzang::Platform.data_encoding})" \
100
- + " ruby-#{RUBY_VERSION}p#{RUBY_PATCHLEVEL} #{RUBY_PLATFORM}"
93
+ "sanzang #{Sanzang::VERSION} ruby-#{RUBY_VERSION} #{RUBY_PLATFORM} " \
94
+ + "(#{Sanzang::Platform.data_encoding})"
95
+ # "sanzang #{Sanzang::VERSION} #{Sanzang::Platform.data_encoding}"
101
96
  end
102
97
 
103
98
  # Name of the command
@@ -15,12 +15,7 @@
15
15
  # You should have received a copy of the GNU General Public License along with
16
16
  # this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- require "optparse"
19
-
20
- require_relative File.join("..", "platform")
21
- require_relative File.join("..", "translation_table")
22
- require_relative File.join("..", "translator")
23
- require_relative File.join("..", "version")
18
+ require 'optparse'
24
19
 
25
20
  module Sanzang::Command
26
21
 
@@ -1,6 +1,6 @@
1
1
  # coding: UTF-8
2
2
  #--
3
- # Copyright (C) 2012-2013 Lapis Lazuli Texts
3
+ # Copyright (C) 2012-2014 Lapis Lazuli Texts
4
4
  #
5
5
  # This program is free software: you can redistribute it and/or modify it under
6
6
  # the terms of the GNU General Public License as published by the Free Software
@@ -15,16 +15,14 @@
15
15
  # You should have received a copy of the GNU General Public License along with
16
16
  # this program. If not, see <http://www.gnu.org/licenses/>.
17
17
 
18
- module Sanzang
19
-
20
- # This class handles formatting of text data especially to prepare the text
21
- # for direct translation. This involves reformatting and reflowing text so
22
- # that words are not divided between lines, and so the output is well suited
23
- # for humans. For practical purposes of readability, lines of text to be
24
- # translated should be succinct and easily comprehensible. The TextFormatter
25
- # class includes methods for accomplishing this reformatting.
26
- #
27
- class TextFormatter
18
+ # This module handles formatting of text data especially to prepare the text
19
+ # for direct translation. This involves reformatting and reflowing text so
20
+ # that words are not divided between lines, and so the output is well suited
21
+ # for humans. For practical purposes of readability, lines of text to be
22
+ # translated should be succinct and easily comprehensible.
23
+ #
24
+ module Sanzang::Formatting
25
+ class << self
28
26
 
29
27
  # Given a CJK string of text, reformat the string for greater compatibility
30
28
  # with direct translation, and reflow the text based on its punctuation.
@@ -17,9 +17,8 @@
17
17
 
18
18
  module Sanzang
19
19
 
20
- # A translation table encapsulates a set of rules for translating with
21
- # the \Sanzang system. These are essentially read-only objects meant for
22
- # storing well-defined translation table data.
20
+ # A translation table encapsulates a set of rules for translating with the
21
+ # \Sanzang system.
23
22
  #
24
23
  class TranslationTable
25
24
 
@@ -39,6 +38,8 @@ module Sanzang
39
38
  # - Subsequent fields are equivalent terms in destination languages.
40
39
  # - The number of columns must be consistent for the entire table.
41
40
  #
41
+ # The rules passed in here may either be a file descriptor or a string.
42
+ #
42
43
  def initialize(rules)
43
44
  contents = rules.kind_of?(String) ? rules : rules.read
44
45
  @source_encoding = contents.encoding
@@ -51,8 +52,9 @@ module Sanzang
51
52
  end
52
53
 
53
54
  @records = contents.strip.split("\n").collect {|r| r.strip.split("|") }
55
+ @sorted = false
54
56
  check_dims
55
- sort!
57
+ #sort!
56
58
  end
57
59
 
58
60
  # Retrieve a record by its numeric index.
@@ -76,10 +78,18 @@ module Sanzang
76
78
  end
77
79
  end
78
80
 
81
+ # Check if the table records are sorted
82
+ #
83
+ def sorted?
84
+ @sorted
85
+ end
86
+
79
87
  # Reverse sort all records by length
80
88
  #
81
89
  def sort!
82
90
  @records.sort! {|x,y| y[0].size <=> x[0].size }
91
+ @sorted = true
92
+ nil
83
93
  end
84
94
 
85
95
  # The text encoding used internally for all translation table data
@@ -94,6 +104,46 @@ module Sanzang
94
104
  @records.find {|rec| rec[0] == term }
95
105
  end
96
106
 
107
+ # Convert to a hash. The original records are the values.
108
+ #
109
+ # For example: "A" => ["A", "B", "C"]
110
+ #
111
+ def to_h
112
+ h = Hash.new
113
+ @records.each {|rec| h[rec[0]] = rec if not h[rec[0]] }
114
+ h
115
+ end
116
+
117
+ # Only include unique source values. The resulting table is unsorted.
118
+ #
119
+ def uniq!
120
+ @records = to_h.values
121
+ @sorted = false
122
+ nil
123
+ end
124
+
125
+ # Merge another table into this one. If the same source term exists in
126
+ # both tables, then the record from the other table will be used instead.
127
+ # Note: after a merge, the resulting table is unsorted.
128
+ #
129
+ def merge!(tab2)
130
+ if tab2.width != width
131
+ raise "Table widths must match when merging tables"
132
+ end
133
+ h1 = to_h
134
+ tab2.records.each do |rec|
135
+ h1[rec[0]] = rec
136
+ end
137
+ @records = h1.values
138
+ @sorted = false
139
+ end
140
+
141
+ # Return a CSV formatted string
142
+ #
143
+ def to_csv
144
+ @records.map {|r| r.join("|") }.join("\n")
145
+ end
146
+
97
147
  # The number of records in the table
98
148
  #
99
149
  def length
@@ -37,13 +37,10 @@ module Sanzang
37
37
  # These records represent the vocabulary used by the text.
38
38
  #
39
39
  def text_vocab(source_text)
40
- new_table = []
41
- @table.records.each do |record|
42
- if source_text.include?(record[0])
43
- new_table << record
44
- end
40
+ text_copy = String.new(source_text)
41
+ @table.records.select do |r|
42
+ text_copy.include?(r[0]) ? text_copy.gsub!(r[0], "\x1F") : false
45
43
  end
46
- new_table
47
44
  end
48
45
 
49
46
  # Use the TranslationTable of the Translator to create translations for
@@ -80,7 +77,7 @@ module Sanzang
80
77
 
81
78
  listing = ""
82
79
  texts[0].length.times do |line_i|
83
- @table.width.times do |col_i|
80
+ texts.length.times do |col_i|
84
81
  listing << "[#{pos + line_i}.#{col_i + 1}] #{texts[col_i][line_i]}" \
85
82
  << newline
86
83
  end
@@ -19,6 +19,6 @@ module Sanzang
19
19
 
20
20
  # Current version number of Sanzang
21
21
  #
22
- VERSION = "1.1.2"
22
+ VERSION = "1.2.0"
23
23
 
24
24
  end
@@ -23,8 +23,7 @@ class TestReflowEncodings < Test::Unit::TestCase
23
23
  text_s2 = "    大唐三藏法師玄奘奉\n 詔譯\n \n"
24
24
  text_s1.encode!(encoding)
25
25
  text_s2.encode!(encoding)
26
- formatter = Sanzang::TextFormatter.new
27
- assert_equal(text_s2, formatter.reflow_cjk(text_s1))
26
+ assert_equal(text_s2, Sanzang::Formatting.reflow_cjk(text_s1))
28
27
  end
29
28
 
30
29
  # Han characters, simplified and without double vertical bar. The margin
@@ -36,8 +35,7 @@ class TestReflowEncodings < Test::Unit::TestCase
36
35
  text_s2 = "    大唐三藏法师玄奘奉\n 诏译\n \n"
37
36
  text_s1.encode!(encoding)
38
37
  text_s2.encode!(encoding)
39
- formatter = Sanzang::TextFormatter.new
40
- assert_equal(text_s2, formatter.reflow_cjk(text_s1))
38
+ assert_equal(text_s2, Sanzang::Formatting.reflow_cjk(text_s1))
41
39
  end
42
40
 
43
41
  # UTF-8 (Traditional Chinese)
@@ -55,7 +55,7 @@ class TestSanzang < Test::Unit::TestCase
55
55
  end
56
56
 
57
57
  def test_reflow_cjk_string
58
- text = Sanzang::TextFormatter.new.reflow_cjk(stage_1())
58
+ text = Sanzang::Formatting.reflow_cjk(stage_1())
59
59
  assert_equal(stage_2(), text)
60
60
  end
61
61
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sanzang
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lapis Lazuli Texts
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-13 00:00:00.000000000 Z
11
+ date: 2014-03-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: parallel
@@ -52,8 +52,8 @@ files:
52
52
  - lib/sanzang/command/reflow.rb
53
53
  - lib/sanzang/command/sanzang_cmd.rb
54
54
  - lib/sanzang/command/translate.rb
55
+ - lib/sanzang/formatting.rb
55
56
  - lib/sanzang/platform.rb
56
- - lib/sanzang/text_formatter.rb
57
57
  - lib/sanzang/translation_table.rb
58
58
  - lib/sanzang/translator.rb
59
59
  - lib/sanzang/version.rb
@@ -90,7 +90,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
90
90
  requirements:
91
91
  - parallel ~> 0.8
92
92
  rubyforge_project:
93
- rubygems_version: 2.2.0
93
+ rubygems_version: 2.2.2
94
94
  signing_key:
95
95
  specification_version: 4
96
96
  summary: Machine translation from CJK languages