zhongwen_tools 0.9.0 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,8 +1,9 @@
1
1
  # encoding: utf-8
2
- require File.expand_path("../../regex", __FILE__)
2
+ require 'zhongwen_tools/regex'
3
3
 
4
4
  module ZhongwenTools
5
5
  module Romanization
6
+ extend self
6
7
  # Deprecated: a Regex for accurate pinyin. Use ZhongwenTools::Regex.py instead
7
8
  PY_REGEX = ZhongwenTools::Regex.py
8
9
 
@@ -10,6 +11,7 @@ module ZhongwenTools
10
11
  PINYIN_REGEX = ZhongwenTools::Regex.pyn
11
12
 
12
13
  # Public: checks if a string is pinyin.
14
+ # http://en.wikipedia.org/wiki/Pinyin
13
15
  #
14
16
  # Examples
15
17
  # py?('nǐ hǎo')
@@ -19,7 +21,8 @@ module ZhongwenTools
19
21
  def py?(str = nil)
20
22
  str ||= self
21
23
 
22
- str.gsub(ZhongwenTools::Regex.py, '').strip == ''
24
+ # NOTE: py regex does not include capitals with tones.
25
+ String.downcase(str).gsub(Regex.punc,'').gsub(Regex.py, '').gsub(/[\s\-]/,'').strip == ''
23
26
  end
24
27
 
25
28
  # Public: checks if a string is pinyin.
@@ -32,51 +35,33 @@ module ZhongwenTools
32
35
  def pyn?(str = nil)
33
36
  str ||= self
34
37
 
35
- str.gsub(ZhongwenTools::Regex.pyn, '').strip == ''
36
- end
37
-
38
- # Public: Checks if a string is wade-giles.
39
- #
40
- # Examples
41
- # wg?('pin1-yin1')
42
- # # => false
43
- #
44
- # Returns a Boolean.
45
- def wg?(str = nil, type = :pyn)
46
- # NOTE: There are some situations where wg == pyn, but there's no way to differentiate the two.
47
- # FIXME: it shouldn't be pyn, but it should be able to conver to pyn
48
- # Actually, wade-giles does sometimes overlap with pyn. So this
49
- # method creates false negatives. A future :romanization method
50
- # would default to pyn, but this method shouldn't.
51
- # Add tests where str.pyn? and str.wg?
52
-
53
- str ||= self
54
- wg = ZhongwenTools::Romanization.to_wade_giles(str, type)
55
- # TODO: need to convert string to pyn.
56
- pyn = str
57
- wg != pyn && wg.gsub(/[1-5]/,'')
38
+ str.gsub(Regex.punc,'').gsub(Regex.pyn, '').gsub(/[\s\-]/,'').strip == ''
58
39
  end
59
40
 
60
41
  # Public: Checks if a String is Zhuyin Fuhao (a.k.a. bopomofo).
42
+ # http://en.wikipedia.org/wiki/Bopomofo
43
+ # http://pinyin.info/romanization/bopomofo/index.html
61
44
  #
62
45
  # str - a String. Optional if the object calling the method is a String.
63
46
  #
64
47
  # Examples
65
48
  #
66
- # zyfh?('ㄊㄥ')
49
+ # bpmf?('ㄊㄥ')
67
50
  # # => true
68
51
  #
69
52
  # Returns a boolean.
70
- def zyfh?(str = nil)
53
+ def bpmf?(str = nil)
71
54
  str ||= self
72
55
 
73
- bopomofo = str.gsub(/[1-5\s]/,'')
74
- bopomofo.scan(ZhongwenTools::Regex.bopomofo).join == bopomofo
56
+ bopomofo = str.gsub(/[1-5\s]/,'').gsub(Regex.punc,'')
57
+ bopomofo.scan(Regex.bopomofo).join == bopomofo
75
58
  end
76
59
 
77
- # Public: Checks if a String is Tongyong Pinyin.
60
+ # Public: Checks if a String is a romanization:
61
+ # Tongyong Pinyin, Wade-Giles, MPS2 or Yale.
78
62
  # http://en.wikipedia.org/wiki/Tongyong_Pinyin
79
63
  # http://pinyin.info/romanization/tongyong/
64
+ # http://en.wikipedia.org/wiki/Wade%E2%80%93Giles
80
65
  #
81
66
  # str - a String. Optional if the object calling the method is a String.
82
67
  #
@@ -84,20 +69,64 @@ module ZhongwenTools
84
69
  #
85
70
  # typy?('chuei niou')
86
71
  # # => true
72
+ # wg?('Mao2 Tse2 Tung1')
87
73
  #
88
74
  # Returns a boolean.
89
- def typy?(str = nil)
75
+ %w(typy wg yale mps2).each do |type|
76
+ define_method("#{type}?") do |str = nil|
77
+ str ||= self
78
+ # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
79
+ s = str.downcase.gsub(Regex.punc,'').gsub(/[1-5\s\-']/,'')
80
+
81
+ s.scan(detect_regex(type.to_sym)).join == s
82
+ end
83
+ end
84
+
85
+ # Public: Checks the string's romanization. It always assumes the first matching result is the correct result.
86
+ # This can sometimes provide sub-optimal results
87
+ # e.g.
88
+ # 'chuei niou'.romanization? #=> :pyn
89
+ # 'chuei niou'.pyn? == true # this is correct because ['chu', 'ei', 'ni', 'ou'] are all valid pinyin
90
+ # # but the best fit for 'chuei niou' should be :typy.
91
+ # But this is not considered a major issue because most of the time pyn / py will be used. It could be
92
+ # extended to try and figure out the best option, maybe by comparing the syllable length of each
93
+ # valid romanization.
94
+ #
95
+ # str - a String. Optional if the object calling the method is a String.
96
+ #
97
+ # Examples
98
+ #
99
+ #
100
+ # 'hao3'.romanization? #=> :pyn
101
+ #
102
+ # Returns a Symbol for the romanization type.
103
+ def romanization?(str = nil)
90
104
  str ||= self
91
105
 
92
- typy = str.gsub(/[1-5\s\-']/,'')
93
- # Sorting by String length means it will match the longest possible part.
94
- # FIXME: it is probably possible for this to have false negatives.
95
- # A more comprehensive regex like Regex.pyn would be needed
96
- # to accurately detect typy.
97
- regex_str = ROMANIZATIONS_TABLE.map{ |r| r[:typy] || r[:py] }.sort{|x,y| x.size <=> y.size}.reverse.join('|')
98
- typy.scan(/#{regex_str}/).join == typy
106
+ [:pyn, :py, :zyfh, :wg, :typy, :yale, :msp2].find do |type|
107
+ self.send("#{type}?", str)
108
+ end
99
109
  end
100
110
 
101
- # TODO: msp2? yale? wgyrm? romanization?
111
+ # TODO: romanizations? method that returns all possible romanizations.
112
+
113
+ # Deprecated: ZhongwenTools::Romanization.zyfh? is deprecated. Use ZhongwenTools::Romanization.bpmf? instead
114
+ alias_method :zyfh?, :bpmf?
115
+
116
+ private
117
+
118
+ # Internal: Produces a Regexp for a romanization type.
119
+ #
120
+ # type - a Symbol for the romanization type.
121
+ #
122
+ # Examples:
123
+ #
124
+ #
125
+ # detect_regex(:typy) #=> <Regexp>
126
+ #
127
+ # Returns a Regexp.
128
+ def detect_regex(type)
129
+ /#{ROMANIZATIONS_TABLE.map{ |r| r[type] || r[:pyn] }.sort{|x,y| x.size <=> y.size}.reverse.join('|')}/
130
+ end
102
131
  end
103
132
  end
@@ -160,7 +160,8 @@ module ZhongwenTools
160
160
  'ng2' => "éng",
161
161
  'ng3' => "ěng",
162
162
  'ng4' => "èng",
163
- 'ng5' => 'eng'
163
+ 'ng5' => 'eng',
164
+ 'r5' => 'r'
164
165
  }
165
166
  end
166
167
  end
@@ -1,9 +1,9 @@
1
1
  # encoding: utf-8
2
2
  #$:.unshift File.join(File.dirname(__FILE__),'..','lib','zhongwen_tools', 'string')
3
3
  require 'uri'
4
- require File.expand_path("../regex", __FILE__)
5
- require File.expand_path("../string/fullwidth", __FILE__)
6
- require File.expand_path("../string/caps", __FILE__)
4
+ require 'zhongwen_tools/regex'
5
+ require 'zhongwen_tools/string/fullwidth'
6
+ require 'zhongwen_tools/string/caps'
7
7
 
8
8
  class String
9
9
  alias_method :_downcase, :downcase
@@ -1,3 +1,3 @@
1
1
  module ZhongwenTools
2
- VERSION = "0.9.0"
2
+ VERSION = '0.11.0'
3
3
  end
@@ -15,17 +15,32 @@ class TestRomanization < Minitest::Test
15
15
  assert_equal 'Zhōngwén', 'Zhong1-wen2'.to_pinyin
16
16
  assert_equal "Tiān'ānmén",'Tian1an1men2'.to_pinyin
17
17
  assert_equal @alabo[:py], @alabo[:pyn].to_pinyin
18
-
18
+ assert_equal 'r', 'r5'.to_pinyin
19
19
  #wg -> py not yet implemented
20
20
  #mzd = "Mao Tse-tung"
21
21
  #assert_equal "Mao Zedong", mzd.to_pinyin(:wg)
22
+ assert @alabo[:py].py?
23
+ assert 'Ā-lā-bó'.py?
24
+ assert 'Zhong1 wen2'.to_pinyin.py?
22
25
  end
23
26
 
24
27
  def test_pyn
25
28
  assert_equal 'ni3 hao3', @py.to_pyn(:py)
26
29
  assert_equal 'tian1an1men2', 'tian1an1men2'.to_py.to_pyn(:py)
27
30
 
28
- #assert_equal 'Wūlúhānuòfū'.to_pyn, 'Wu1-lu2-ha1-nuo4-fu1'
31
+ assert_equal 'yi4', ''.to_pyn(:py)
32
+
33
+ assert_equal 'ni3 hao3', 'ㄋㄧ3 ㄏㄠ3'.to_pyn(:bpmf)
34
+ assert_equal 'ni3 hao3', 'ㄋㄧ3 ㄏㄠ3'.to_pyn
35
+ assert_equal 'zhong1 guo2', 'chung1 kuo2'.to_pyn(:wg)
36
+ assert_equal 'zhong1 guo2', 'chung1 kuo2'.to_pyn
37
+ assert_equal 'chui1 niu3', 'chuei1 niou3'.to_pyn(:typy)
38
+ assert_equal 'cao3 di4', 'tsau3 di4'.to_pyn(:mspy2)
39
+ assert_equal 'cao3 di4', 'tsau3 di4'.to_pyn
40
+ assert_equal 'cao3 di4', 'tsau3 di4'.to_pyn(:yale)
41
+ assert_equal 'cao3 di4', 'tsau3 di4'.to_pyn
42
+
43
+ #assert_equal 'Wu1-lu2-ha1-nuo4-fu1', 'Wūlúhānuòfū'.to_pyn(:py)
29
44
  #"007:Dàpò Liàngzǐ Wēijī", "007: Da4po4 Liang4zi3 Wei1ji1"
30
45
  end
31
46
 
@@ -37,6 +52,8 @@ class TestRomanization < Minitest::Test
37
52
  assert_equal 'ㄑㄧㄥ3 ㄏㄨㄟ2ㄉㄚ2 ㄨㄛ3 ㄉㄜ5 ㄨㄣ4ㄊㄧ2 .', @sent.to_zhuyin
38
53
  assert_equal 'ㄇㄠ2 ㄗㄜ2ㄉㄨㄥ1', @mzd2.to_zhuyin_fuhao
39
54
  assert 'ㄋㄧ3 ㄏㄠ3'.zyfh?
55
+
56
+ assert_equal 'ㄋㄧ3 ㄏㄠ3', 'ni3 hau3'.to_bpmf(:yale)
40
57
  end
41
58
 
42
59
  def test_wade_giles
@@ -45,6 +62,8 @@ class TestRomanization < Minitest::Test
45
62
  assert_equal 'Mao2 Tse2 tung1', @mzd.to_wg
46
63
  assert_equal 'Mao2 Tse2-tung1', @mzd2.to_wade_giles
47
64
  assert_equal 'Mao2 Tse2-tung1 te5 mao2', 'Mao2 Ze2-dong1 de5 mao2'.to_wade_giles
65
+
66
+ assert_equal 'ni3 hao3', 'ni3 hau3'.to_wg(:yale)
48
67
  end
49
68
 
50
69
  #def test_mspy2
@@ -65,11 +84,15 @@ class TestRomanization < Minitest::Test
65
84
 
66
85
  def test_yale
67
86
  assert_equal 'ni3 hau3', @str.to_yale
87
+
88
+ assert_equal 'chwei1 nyou3', 'chuei1 niou3'.to_yale(:typy)
68
89
  end
69
90
 
70
- #def test_romanization?
71
- #skip
72
- #end
91
+ def test_romanization?
92
+ assert_equal :pyn, @alabo[:pyn].romanization?
93
+ assert_equal :py, @alabo[:py].romanization?
94
+ assert_equal :wg, @mzd.to_wg(:pyn).romanization?
95
+ end
73
96
 
74
97
  def test_detect
75
98
  assert @str.pyn?
@@ -78,9 +101,11 @@ class TestRomanization < Minitest::Test
78
101
 
79
102
  assert 'chung1 kuo2'.wg?
80
103
 
81
- # Travis CI is having trouble with this using Ruby 1.8.7, but it works locally.
82
- # I'll probably end up dropping full 1.8.7 support.
83
104
  assert @py.py?, "#{@py} should be pinyin. (#{@py.py?})" unless RUBY_VERSION < '1.9'
105
+ assert 'chuei1 niou3'.typy?
106
+ assert 'ㄋㄧ3 ㄏㄠ3'.bpmf?
107
+ assert 'ni3 hau3'.yale?
108
+ assert 'tsuen'.mps2?
84
109
  end
85
110
 
86
111
  def test_split_pyn
@@ -92,6 +117,13 @@ class TestRomanization < Minitest::Test
92
117
  end
93
118
 
94
119
  def setup
120
+ @romanizations = [
121
+ # FIXME: bopomofo, tongyong pinyin, wade-giles tones are all wrong.
122
+ # TODO: test IPA
123
+ { :pyn => 'ni3 hao', :py => 'nǐ hǎo', :bopomofo => 'ㄋㄧ3 ㄏㄠ3', :yale => 'ni3 hau3', :typy => 'ni3 hao3', :wg => 'ni3 hao3'},#, :ipa => ''}
124
+ { :pyn => 'zhong1 guo2', :py => 'nǐ hǎo', :bopomofo => 'ㄋㄧ3 ㄏㄠ3', :yale => 'ni3 hau3', :typy => 'ni3 hao3', :wg => 'chung1 kuo2'},#, :ipa => ''}
125
+ { :pyn => 'chui1 niu3', :py => '', :bopomofo => '', :yale => 'chwei1 nyou3', :typy => 'chuei1 niou3', :wg => 'chung1 kuo2'},#, :ipa => ''}
126
+ ]
95
127
  @str = 'ni3 hao3'
96
128
  @mzd = 'Mao2 Ze2 dong1'
97
129
  @mzd2 = 'Mao2 Ze2-dong1'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zhongwen_tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steven Daniels
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-22 00:00:00.000000000 Z
11
+ date: 2014-06-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake