sanzang 1.0.4 → 1.0.5

Sign up to get free protection for your applications and to get access to all the features.
data/bin/sanzang CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
- # -*- encoding: UTF-8 -*-
2
+ # coding: UTF-8
3
3
  #--
4
- # Copyright (C) 2012 Lapis Lazuli Texts
4
+ # Copyright (C) 2012-2013 Lapis Lazuli Texts
5
5
  #
6
6
  # This program is free software: you can redistribute it and/or modify it under
7
7
  # the terms of the GNU General Public License as published by the Free Software
data/lib/sanzang.rb CHANGED
@@ -1,5 +1,4 @@
1
- #!/usr/bin/env ruby -w
2
- # -*- encoding: UTF-8 -*-
1
+ # coding: UTF-8
3
2
  #--
4
3
  # Copyright (C) 2012-2013 Lapis Lazuli Texts
5
4
  #
@@ -1,5 +1,4 @@
1
- #!/usr/bin/env ruby
2
- # -*- encoding: UTF-8 -*-
1
+ # coding: UTF-8
3
2
  #--
4
3
  # Copyright (C) 2012-2013 Lapis Lazuli Texts
5
4
  #
@@ -36,12 +35,15 @@ module Sanzang
36
35
  # return value is an array containing all the output file paths.
37
36
  #
38
37
  def translate_batch(fpath_pairs, verbose = true, jobs = nil)
39
- if not Sanzang::Platform.unix_processes?
40
- jobs = 0
41
- elsif not jobs
42
- jobs = Sanzang::Platform.processor_count
38
+ options = {}
39
+ if RUBY_PLATFORM =~ /java/
40
+ options[:in_threads] = jobs || Sanzang::Platform.processor_count
41
+ elsif Sanzang::Platform.unix_processes?
42
+ options[:in_processes] = jobs || Sanzang::Platform.processor_count
43
+ else
44
+ options[:in_processes] = 0
43
45
  end
44
- Parallel.map(fpath_pairs, :in_processes => jobs) do |f1,f2|
46
+ Parallel.map(fpath_pairs, options) do |f1,f2|
45
47
  translate_io(f1, f2)
46
48
  if verbose
47
49
  $stderr.write "[#{Process.pid}] #{File.expand_path(f2)} \n"
@@ -1,5 +1,4 @@
1
- #!/usr/bin/env ruby
2
- # -*- encoding: UTF-8 -*-
1
+ # coding: UTF-8
3
2
  #--
4
3
  # Copyright (C) 2012-2013 Lapis Lazuli Texts
5
4
  #
@@ -1,5 +1,4 @@
1
- #!/usr/bin/env ruby
2
- # -*- encoding: UTF-8 -*-
1
+ # coding: UTF-8
3
2
  #--
4
3
  # Copyright (C) 2012-2013 Lapis Lazuli Texts
5
4
  #
@@ -1,5 +1,4 @@
1
- #!/usr/bin/env ruby
2
- # -*- encoding: UTF-8 -*-
1
+ # coding: UTF-8
3
2
  #--
4
3
  # Copyright (C) 2012-2013 Lapis Lazuli Texts
5
4
  #
@@ -1,5 +1,4 @@
1
- #!/usr/bin/env ruby
2
- # -*- encoding: UTF-8 -*-
1
+ # coding: UTF-8
3
2
  #--
4
3
  # Copyright (C) 2012-2013 Lapis Lazuli Texts
5
4
  #
@@ -1,5 +1,4 @@
1
- #!/usr/bin/env ruby
2
- # -*- encoding: UTF-8 -*-
1
+ # coding: UTF-8
3
2
  #--
4
3
  # Copyright (C) 2012-2013 Lapis Lazuli Texts
5
4
  #
@@ -1,5 +1,4 @@
1
- #!/usr/bin/env ruby
2
- # -*- encoding: UTF-8 -*-
1
+ # coding: UTF-8
3
2
  #--
4
3
  # Copyright (C) 2012-2013 Lapis Lazuli Texts
5
4
  #
@@ -1,5 +1,4 @@
1
- #!/usr/bin/env ruby
2
- # -*- encoding: UTF-8 -*-
1
+ # coding: UTF-8
3
2
  #--
4
3
  # Copyright (C) 2012-2013 Lapis Lazuli Texts
5
4
  #
@@ -29,7 +28,7 @@ module Sanzang
29
28
  # summarized as follows:
30
29
  #
31
30
  # - Each line of text is a record for a translation rule.
32
- # - Each record begins with "~|" and ends with "|~".
31
+ # - Each record may begin with "~|" and end with "|~".
33
32
  # - Fields in the record are separated by the "|" character.
34
33
  # - The first field contains the term in the source language.
35
34
  # - Subsequent fields are equivalent terms in destination languages.
@@ -43,15 +42,11 @@ module Sanzang
43
42
  def initialize(rules)
44
43
  contents = rules.kind_of?(String) ? rules : rules.read
45
44
  contents.encode!(Encoding::UTF_8)
46
- @encoding = contents.encoding
47
-
48
- left = "~|"
49
- right = "|~"
50
- separator = "|"
51
-
52
- @records = contents.gsub("\r", "").split("\n").collect do |rec|
53
- rec.strip.gsub(left, "").gsub(right, "").split(separator)
54
- end
45
+ contents.strip!
46
+ contents.gsub!(/^\s*|\s*$|\r/, "")
47
+ contents.gsub!("~|", "")
48
+ contents.gsub!("|~", "")
49
+ @records = contents.split("\n").collect {|r| r.split("|") }
55
50
 
56
51
  if @records.length < 1
57
52
  raise "Table must have at least 1 row"
@@ -59,9 +54,9 @@ module Sanzang
59
54
  raise "Table must have at least 2 columns"
60
55
  end
61
56
 
62
- @width = records[0].length
63
- 0.upto(@records.length - 1) do |i|
64
- if @records[i].length != @width
57
+ width = records[0].length
58
+ @records.each do |r|
59
+ if r.length != width
65
60
  raise "Column mismatch: Line #{i + 1}"
66
61
  end
67
62
  end
@@ -75,6 +70,12 @@ module Sanzang
75
70
  @records[index]
76
71
  end
77
72
 
73
+ # The text encoding used for all translation table data
74
+ #
75
+ def encoding
76
+ Encoding::UTF_8
77
+ end
78
+
78
79
  # Find a record by the source language term (first column).
79
80
  #
80
81
  def find(term)
@@ -97,9 +98,5 @@ module Sanzang
97
98
  #
98
99
  attr_reader :records
99
100
 
100
- # The text encoding used for all translation table data
101
- #
102
- attr_reader :encoding
103
-
104
101
  end
105
102
  end
@@ -1,5 +1,4 @@
1
- #!/usr/bin/env ruby
2
- # -*- encoding: UTF-8 -*-
1
+ # coding: UTF-8
3
2
  #--
4
3
  # Copyright (C) 2012-2013 Lapis Lazuli Texts
5
4
  #
@@ -1,5 +1,4 @@
1
- #!/usr/bin/env ruby
2
- # -*- encoding: UTF-8 -*-
1
+ # coding: UTF-8
3
2
  #--
4
3
  # Copyright (C) 2012-2013 Lapis Lazuli Texts
5
4
  #
@@ -20,6 +19,6 @@ module Sanzang
20
19
 
21
20
  # Current version number of Sanzang
22
21
  #
23
- VERSION = "1.0.4"
22
+ VERSION = "1.0.5"
24
23
 
25
24
  end
@@ -1,5 +1,4 @@
1
- #!/usr/bin/env ruby
2
- # -*- encoding: UTF-8 -*-
1
+ # coding: UTF-8
3
2
 
4
3
  require "test/unit"
5
4
 
@@ -1,5 +1,4 @@
1
- #!/usr/bin/env ruby
2
- # -*- encoding: UTF-8 -*-
1
+ # coding: UTF-8
3
2
 
4
3
  require "test/unit"
5
4
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sanzang
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,32 +9,31 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-07-25 00:00:00.000000000 Z
12
+ date: 2013-10-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: parallel
16
16
  requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
- - - ! '>='
19
+ - - ~>
20
20
  - !ruby/object:Gem::Version
21
- version: 0.5.19
21
+ version: '0.8'
22
22
  type: :runtime
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
25
25
  none: false
26
26
  requirements:
27
- - - ! '>='
27
+ - - ~>
28
28
  - !ruby/object:Gem::Version
29
- version: 0.5.19
29
+ version: '0.8'
30
30
  description: Sanzang is a program built for machine translation of natural languages.
31
31
  This application is particularly suitable as a translation aid for CJK languages
32
32
  including ancient texts. The translation method is rule-based, and translation rules
33
33
  are stored in flat files as delimited text. This program can also utilize multiprocessing
34
34
  to naturally scale to multiple processors and processor cores. Sanzang is available
35
35
  under the GNU GPL, version 3.
36
- email:
37
- - lapislazulitexts@gmail.com
36
+ email: lapislazulitexts@gmail.com
38
37
  executables:
39
38
  - sanzang
40
39
  extensions: []
@@ -90,12 +89,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
90
89
  - - ! '>='
91
90
  - !ruby/object:Gem::Version
92
91
  version: '0'
93
- requirements: []
92
+ requirements:
93
+ - parallel ~> 0.8
94
94
  rubyforge_project:
95
95
  rubygems_version: 1.8.23
96
96
  signing_key:
97
97
  specification_version: 3
98
- summary: Simple rule-based machine translation system.
98
+ summary: Machine translation for CJK languages
99
99
  test_files:
100
100
  - test/tc_reflow_encodings.rb
101
101
  - test/tc_simple_translation.rb