zhongwen_tools 0.9.0 → 0.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/Rakefile +0 -41
- data/lib/zhongwen_tools.rb +3 -5
- data/lib/zhongwen_tools/conversion.rb +20 -17
- data/lib/zhongwen_tools/integer.rb +1 -1
- data/lib/zhongwen_tools/numbers.rb +2 -1
- data/lib/zhongwen_tools/regex.rb +4 -5
- data/lib/zhongwen_tools/romanization.rb +90 -120
- data/lib/zhongwen_tools/romanization/conversion_table.rb +417 -417
- data/lib/zhongwen_tools/romanization/detect.rb +68 -39
- data/lib/zhongwen_tools/romanization/pyn_to_py.rb +2 -1
- data/lib/zhongwen_tools/string.rb +3 -3
- data/lib/zhongwen_tools/version.rb +1 -1
- data/test/test_romanization.rb +39 -7
- metadata +2 -2
@@ -1,8 +1,9 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
require
|
2
|
+
require 'zhongwen_tools/regex'
|
3
3
|
|
4
4
|
module ZhongwenTools
|
5
5
|
module Romanization
|
6
|
+
extend self
|
6
7
|
# Deprecated: a Regex for accurate pinyin. Use ZhongwenTools::Regex.py instead
|
7
8
|
PY_REGEX = ZhongwenTools::Regex.py
|
8
9
|
|
@@ -10,6 +11,7 @@ module ZhongwenTools
|
|
10
11
|
PINYIN_REGEX = ZhongwenTools::Regex.pyn
|
11
12
|
|
12
13
|
# Public: checks if a string is pinyin.
|
14
|
+
# http://en.wikipedia.org/wiki/Pinyin
|
13
15
|
#
|
14
16
|
# Examples
|
15
17
|
# py?('nǐ hǎo')
|
@@ -19,7 +21,8 @@ module ZhongwenTools
|
|
19
21
|
def py?(str = nil)
|
20
22
|
str ||= self
|
21
23
|
|
22
|
-
|
24
|
+
# NOTE: py regex does not include capitals with tones.
|
25
|
+
String.downcase(str).gsub(Regex.punc,'').gsub(Regex.py, '').gsub(/[\s\-]/,'').strip == ''
|
23
26
|
end
|
24
27
|
|
25
28
|
# Public: checks if a string is pinyin.
|
@@ -32,51 +35,33 @@ module ZhongwenTools
|
|
32
35
|
def pyn?(str = nil)
|
33
36
|
str ||= self
|
34
37
|
|
35
|
-
str.gsub(
|
36
|
-
end
|
37
|
-
|
38
|
-
# Public: Checks if a string is wade-giles.
|
39
|
-
#
|
40
|
-
# Examples
|
41
|
-
# wg?('pin1-yin1')
|
42
|
-
# # => false
|
43
|
-
#
|
44
|
-
# Returns a Boolean.
|
45
|
-
def wg?(str = nil, type = :pyn)
|
46
|
-
# NOTE: There are some situations where wg == pyn, but there's no way to differentiate the two.
|
47
|
-
# FIXME: it shouldn't be pyn, but it should be able to conver to pyn
|
48
|
-
# Actually, wade-giles does sometimes overlap with pyn. So this
|
49
|
-
# method creates false negatives. A future :romanization method
|
50
|
-
# would default to pyn, but this method shouldn't.
|
51
|
-
# Add tests where str.pyn? and str.wg?
|
52
|
-
|
53
|
-
str ||= self
|
54
|
-
wg = ZhongwenTools::Romanization.to_wade_giles(str, type)
|
55
|
-
# TODO: need to convert string to pyn.
|
56
|
-
pyn = str
|
57
|
-
wg != pyn && wg.gsub(/[1-5]/,'')
|
38
|
+
str.gsub(Regex.punc,'').gsub(Regex.pyn, '').gsub(/[\s\-]/,'').strip == ''
|
58
39
|
end
|
59
40
|
|
60
41
|
# Public: Checks if a String is Zhuyin Fuhao (a.k.a. bopomofo).
|
42
|
+
# http://en.wikipedia.org/wiki/Bopomofo
|
43
|
+
# http://pinyin.info/romanization/bopomofo/index.html
|
61
44
|
#
|
62
45
|
# str - a String. Optional if the object calling the method is a String.
|
63
46
|
#
|
64
47
|
# Examples
|
65
48
|
#
|
66
|
-
#
|
49
|
+
# bpmf?('ㄊㄥ')
|
67
50
|
# # => true
|
68
51
|
#
|
69
52
|
# Returns a boolean.
|
70
|
-
def
|
53
|
+
def bpmf?(str = nil)
|
71
54
|
str ||= self
|
72
55
|
|
73
|
-
bopomofo = str.gsub(/[1-5\s]/,'')
|
74
|
-
bopomofo.scan(
|
56
|
+
bopomofo = str.gsub(/[1-5\s]/,'').gsub(Regex.punc,'')
|
57
|
+
bopomofo.scan(Regex.bopomofo).join == bopomofo
|
75
58
|
end
|
76
59
|
|
77
|
-
# Public: Checks if a String is
|
60
|
+
# Public: Checks if a String is a romanization:
|
61
|
+
# Tongyong Pinyin, Wade Giles, MPS2 or Yale.
|
78
62
|
# http://en.wikipedia.org/wiki/Tongyong_Pinyin
|
79
63
|
# http://pinyin.info/romanization/tongyong/
|
64
|
+
# http://en.wikipedia.org/wiki/Wade%E2%80%93Giles
|
80
65
|
#
|
81
66
|
# str - a String. Optional if the object calling the method is a String.
|
82
67
|
#
|
@@ -84,20 +69,64 @@ module ZhongwenTools
|
|
84
69
|
#
|
85
70
|
# typy?('chuei niou')
|
86
71
|
# # => true
|
72
|
+
# wg?('Mao2 Tse2 Tung1')
|
87
73
|
#
|
88
74
|
# Returns a boolean.
|
89
|
-
|
75
|
+
%w(typy wg yale mps2).each do |type|
|
76
|
+
define_method("#{type}?") do |str = nil|
|
77
|
+
str ||= self
|
78
|
+
# TODO: ignore tonal marks from other systems wade giles, tongyong etc.
|
79
|
+
s = str.downcase.gsub(Regex.punc,'').gsub(/[1-5\s\-']/,'')
|
80
|
+
|
81
|
+
s.scan(detect_regex(type.to_sym)).join == s
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
# Public: Checks the string's romanization. It always assumes the first correct result is the correct result.
|
86
|
+
# This can sometimes provide sub-optimal results
|
87
|
+
# e.g.
|
88
|
+
# 'chuei niou'.romanization? #=> :pyn
|
89
|
+
# 'chuei niou'.pyn? == true # this is correct because ['chu', 'ei', 'ni', 'ou'] are all valid pinyin
|
90
|
+
# # but the best fit for 'chuei niou' should be :typy.
|
91
|
+
# But this is not considered a major issue because most of the time pyn / py will be used. It could be
|
92
|
+
# extended to try and figure out the best option, maybe by comparing the syllable length of each
|
93
|
+
# valid romanization.
|
94
|
+
#
|
95
|
+
# str - a String. Optional if the object calling the method is a String.
|
96
|
+
#
|
97
|
+
# Examples
|
98
|
+
#
|
99
|
+
#
|
100
|
+
# 'hao3'.romanization? #=> :pyn
|
101
|
+
#
|
102
|
+
# Returns a Symbol for the romanization type.
|
103
|
+
def romanization?(str = nil)
|
90
104
|
str ||= self
|
91
105
|
|
92
|
-
typy
|
93
|
-
|
94
|
-
|
95
|
-
# A more comprehensive regex like Regex.pyn would be needed
|
96
|
-
# to accurately detect typy.
|
97
|
-
regex_str = ROMANIZATIONS_TABLE.map{ |r| r[:typy] || r[:py] }.sort{|x,y| x.size <=> y.size}.reverse.join('|')
|
98
|
-
typy.scan(/#{regex_str}/).join == typy
|
106
|
+
[:pyn, :py, :zyfh, :wg, :typy, :yale, :msp2].find do |type|
|
107
|
+
self.send("#{type}?", str)
|
108
|
+
end
|
99
109
|
end
|
100
110
|
|
101
|
-
# TODO:
|
111
|
+
# TODO: romanizations? method that returns all possible romanizations.
|
112
|
+
|
113
|
+
# Deprecated: ZhongwenTools::Romanization.zyfh? is deprecated. Use ZhongwenTools::Romanization.bpmf? instead
|
114
|
+
alias_method :zyfh?, :bpmf?
|
115
|
+
|
116
|
+
private
|
117
|
+
|
118
|
+
# Internal: Produces a Regexp for a romanization type.
|
119
|
+
#
|
120
|
+
# type - a Symbol for the romanization type.
|
121
|
+
#
|
122
|
+
# Examples:
|
123
|
+
#
|
124
|
+
#
|
125
|
+
# detect_regex(:typy) #=> <Regexp>
|
126
|
+
#
|
127
|
+
# Returns a Regexp.
|
128
|
+
def detect_regex(type)
|
129
|
+
/#{ROMANIZATIONS_TABLE.map{ |r| r[type] || r[:pyn] }.sort{|x,y| x.size <=> y.size}.reverse.join('|')}/
|
130
|
+
end
|
102
131
|
end
|
103
132
|
end
|
@@ -1,9 +1,9 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
#$:.unshift File.join(File.dirname(__FILE__),'..','lib','zhongwen_tools', 'string')
|
3
3
|
require 'uri'
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
4
|
+
require 'zhongwen_tools/regex'
|
5
|
+
require 'zhongwen_tools/string/fullwidth'
|
6
|
+
require 'zhongwen_tools/string/caps'
|
7
7
|
|
8
8
|
class String
|
9
9
|
alias_method :_downcase, :downcase
|
data/test/test_romanization.rb
CHANGED
@@ -15,17 +15,32 @@ class TestRomanization < Minitest::Test
|
|
15
15
|
assert_equal 'Zhōngwén', 'Zhong1-wen2'.to_pinyin
|
16
16
|
assert_equal "Tiān'ānmén",'Tian1an1men2'.to_pinyin
|
17
17
|
assert_equal @alabo[:py], @alabo[:pyn].to_pinyin
|
18
|
-
|
18
|
+
assert_equal 'r', 'r5'.to_pinyin
|
19
19
|
#wg -> py not yet implemented
|
20
20
|
#mzd = "Mao Tse-tung"
|
21
21
|
#assert_equal "Mao Zedong", mzd.to_pinyin(:wg)
|
22
|
+
assert @alabo[:py].py?
|
23
|
+
assert 'Ā-lā-bó'.py?
|
24
|
+
assert 'Zhong1 wen2'.to_pinyin.py?
|
22
25
|
end
|
23
26
|
|
24
27
|
def test_pyn
|
25
28
|
assert_equal 'ni3 hao3', @py.to_pyn(:py)
|
26
29
|
assert_equal 'tian1an1men2', 'tian1an1men2'.to_py.to_pyn(:py)
|
27
30
|
|
28
|
-
|
31
|
+
assert_equal 'yi4', 'yì'.to_pyn(:py)
|
32
|
+
|
33
|
+
assert_equal 'ni3 hao3', 'ㄋㄧ3 ㄏㄠ3'.to_pyn(:bpmf)
|
34
|
+
assert_equal 'ni3 hao3', 'ㄋㄧ3 ㄏㄠ3'.to_pyn
|
35
|
+
assert_equal 'zhong1 guo2', 'chung1 kuo2'.to_pyn(:wg)
|
36
|
+
assert_equal 'zhong1 guo2', 'chung1 kuo2'.to_pyn
|
37
|
+
assert_equal 'chui1 niu3', 'chuei1 niou3'.to_pyn(:typy)
|
38
|
+
assert_equal 'cao3 di4', 'tsau3 di4'.to_pyn(:mspy2)
|
39
|
+
assert_equal 'cao3 di4', 'tsau3 di4'.to_pyn
|
40
|
+
assert_equal 'cao3 di4', 'tsau3 di4'.to_pyn(:yale)
|
41
|
+
assert_equal 'cao3 di4', 'tsau3 di4'.to_pyn
|
42
|
+
|
43
|
+
#assert_equal 'Wu1-lu2-ha1-nuo4-fu1', 'Wūlúhānuòfū'.to_pyn(:py)
|
29
44
|
#"007:Dàpò Liàngzǐ Wēijī", "007: Da4po4 Liang4zi3 Wei1ji1"
|
30
45
|
end
|
31
46
|
|
@@ -37,6 +52,8 @@ class TestRomanization < Minitest::Test
|
|
37
52
|
assert_equal 'ㄑㄧㄥ3 ㄏㄨㄟ2ㄉㄚ2 ㄨㄛ3 ㄉㄜ5 ㄨㄣ4ㄊㄧ2 .', @sent.to_zhuyin
|
38
53
|
assert_equal 'ㄇㄠ2 ㄗㄜ2ㄉㄨㄥ1', @mzd2.to_zhuyin_fuhao
|
39
54
|
assert 'ㄋㄧ3 ㄏㄠ3'.zyfh?
|
55
|
+
|
56
|
+
assert_equal 'ㄋㄧ3 ㄏㄠ3', 'ni3 hau3'.to_bpmf(:yale)
|
40
57
|
end
|
41
58
|
|
42
59
|
def test_wade_giles
|
@@ -45,6 +62,8 @@ class TestRomanization < Minitest::Test
|
|
45
62
|
assert_equal 'Mao2 Tse2 tung1', @mzd.to_wg
|
46
63
|
assert_equal 'Mao2 Tse2-tung1', @mzd2.to_wade_giles
|
47
64
|
assert_equal 'Mao2 Tse2-tung1 te5 mao2', 'Mao2 Ze2-dong1 de5 mao2'.to_wade_giles
|
65
|
+
|
66
|
+
assert_equal 'ni3 hao3', 'ni3 hau3'.to_wg(:yale)
|
48
67
|
end
|
49
68
|
|
50
69
|
#def test_mspy2
|
@@ -65,11 +84,15 @@ class TestRomanization < Minitest::Test
|
|
65
84
|
|
66
85
|
def test_yale
|
67
86
|
assert_equal 'ni3 hau3', @str.to_yale
|
87
|
+
|
88
|
+
assert_equal 'chwei1 nyou3', 'chuei1 niou3'.to_yale(:typy)
|
68
89
|
end
|
69
90
|
|
70
|
-
|
71
|
-
|
72
|
-
|
91
|
+
def test_romanization?
|
92
|
+
assert_equal :pyn, @alabo[:pyn].romanization?
|
93
|
+
assert_equal :py, @alabo[:py].romanization?
|
94
|
+
assert_equal :wg, @mzd.to_wg(:pyn).romanization?
|
95
|
+
end
|
73
96
|
|
74
97
|
def test_detect
|
75
98
|
assert @str.pyn?
|
@@ -78,9 +101,11 @@ class TestRomanization < Minitest::Test
|
|
78
101
|
|
79
102
|
assert 'chung1 kuo2'.wg?
|
80
103
|
|
81
|
-
# Travis CI is having trouble with this using Ruby 1.8.7, but it works locally.
|
82
|
-
# I'll probably end up dropping full 1.8.7 support.
|
83
104
|
assert @py.py?, "#{@py} should be pinyin. (#{@py.py?})" unless RUBY_VERSION < '1.9'
|
105
|
+
assert 'chuei1 niou3'.typy?
|
106
|
+
assert 'ㄋㄧ3 ㄏㄠ3'.bpmf?
|
107
|
+
assert 'ni3 hau3'.yale?
|
108
|
+
assert 'tsuen'.mps2?
|
84
109
|
end
|
85
110
|
|
86
111
|
def test_split_pyn
|
@@ -92,6 +117,13 @@ class TestRomanization < Minitest::Test
|
|
92
117
|
end
|
93
118
|
|
94
119
|
def setup
|
120
|
+
@romanizations = [
|
121
|
+
# FIXME: bopomofo, tongyong pinyin, wade-giles tones are all wrong.
|
122
|
+
# TODO: test IPA
|
123
|
+
{ :pyn => 'ni3 hao', :py => 'nǐ hǎo', :bopomofo => 'ㄋㄧ3 ㄏㄠ3', :yale => 'ni3 hau3', :typy => 'ni3 hao3', :wg => 'ni3 hao3'},#, :ipa => ''}
|
124
|
+
{ :pyn => 'zhong1 guo2', :py => 'nǐ hǎo', :bopomofo => 'ㄋㄧ3 ㄏㄠ3', :yale => 'ni3 hau3', :typy => 'ni3 hao3', :wg => 'chung1 kuo2'},#, :ipa => ''}
|
125
|
+
{ :pyn => 'chui1 niu3', :py => '', :bopomofo => '', :yale => 'chwei1 nyou3', :typy => 'chuei1 niou3', :wg => 'chung1 kuo2'},#, :ipa => ''}
|
126
|
+
]
|
95
127
|
@str = 'ni3 hao3'
|
96
128
|
@mzd = 'Mao2 Ze2 dong1'
|
97
129
|
@mzd2 = 'Mao2 Ze2-dong1'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zhongwen_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.11.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Steven Daniels
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-06-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|