zhongwen_tools 0.7.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +0 -5
- data/README.md +4 -16
- data/lib/zhongwen_tools/conversion/string.rb +1 -1
- data/lib/zhongwen_tools/conversion.rb +5 -5
- data/lib/zhongwen_tools/integer.rb +2 -15
- data/lib/zhongwen_tools/numbers.rb +21 -18
- data/lib/zhongwen_tools/regex/ruby18.rb +15 -0
- data/lib/zhongwen_tools/regex.rb +94 -0
- data/lib/zhongwen_tools/romanization/conversion_table.rb +3 -3
- data/lib/zhongwen_tools/romanization/detect.rb +61 -49
- data/lib/zhongwen_tools/romanization/pyn_to_py.rb +5 -4
- data/lib/zhongwen_tools/romanization/string.rb +23 -0
- data/lib/zhongwen_tools/romanization.rb +3 -1
- data/lib/zhongwen_tools/string/caps.rb +5 -5
- data/lib/zhongwen_tools/string/ruby18.rb +57 -56
- data/lib/zhongwen_tools/string/ruby19.rb +1 -1
- data/lib/zhongwen_tools/string.rb +12 -8
- data/lib/zhongwen_tools/version.rb +1 -1
- data/test/test_numbers.rb +15 -3
- data/test/test_romanization.rb +25 -5
- data/test/test_string.rb +1 -0
- metadata +6 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c46a1639e99601e0a9f9cb5e6961f148cf030758
|
|
4
|
+
data.tar.gz: e4dc3033e220ecd882915dadeb7e9c780a0cbe65
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 427f2bc4b43ea3734995aa2d4c0523244882e300457500728164eef7f297441bd400569d272e59d7b3d7218777b39f46e67ca430765f4d272b5432c186dd09d2
|
|
7
|
+
data.tar.gz: 1c6065127ee0fda328044d412b545f4b85cb53660df3070599c081f197c41f581054a69cb4d64d05377e18cf81c15808d31ec1804f9ef0951fc24350bcb84374
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
|
@@ -132,22 +132,6 @@ The following capitalization methods work for pinyin.
|
|
|
132
132
|
ZhongwenTools::String.capitalize 'àomén'
|
|
133
133
|
#=> 'Àomén'
|
|
134
134
|
|
|
135
|
-
#### Ruby 1.8 safe methods
|
|
136
|
-
Zhongwen Tools is tested on every ruby since 1.8.7 and lets you deal
|
|
137
|
-
with multibyte strings in an simple, consistent fashion regardless of
|
|
138
|
-
which ruby version you are using.
|
|
139
|
-
|
|
140
|
-
require 'zhongwen_tools/string'
|
|
141
|
-
|
|
142
|
-
ZhongwenTools::String.chars '中文'
|
|
143
|
-
#=> ['中','文']
|
|
144
|
-
ZhongwenTools::String.size '中文'
|
|
145
|
-
#=> 2
|
|
146
|
-
ZhongwenTools::String.reverse '中文'
|
|
147
|
-
#=> '文中'
|
|
148
|
-
ZhongwenTools::String.to_utf8 '\x{D6D0}\x{CEC4}'
|
|
149
|
-
#=> '中文'
|
|
150
|
-
|
|
151
135
|
|
|
152
136
|
### Numbers
|
|
153
137
|
Functions for converting to and from Chinese numbers.
|
|
@@ -222,6 +206,10 @@ scripts. It **does not convert Chinese characters to pinyin** (see ZhongwenTools
|
|
|
222
206
|
str.to_py.py?
|
|
223
207
|
#=> true
|
|
224
208
|
|
|
209
|
+
#split pinyin with numbers accurately.
|
|
210
|
+
'dong1xi1'.split_pyn # => ['dong1', 'xi1']
|
|
211
|
+
'dongxi'.split_pyn # => ['dong', 'xi']
|
|
212
|
+
|
|
225
213
|
### Conversion
|
|
226
214
|
Functions for converting between scripts (e.g. traditional Chinese to
|
|
227
215
|
simplified Chinese) and [TODO] between Chinese and romanization systems (e.g.
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
#encoding: utf-8
|
|
1
|
+
# encoding: utf-8
|
|
2
2
|
|
|
3
3
|
module ZhongwenTools
|
|
4
4
|
|
|
@@ -20,7 +20,6 @@ module ZhongwenTools
|
|
|
20
20
|
def to_zhtw(str = nil)
|
|
21
21
|
str ||= self
|
|
22
22
|
|
|
23
|
-
|
|
24
23
|
convert(:zhtw, str)
|
|
25
24
|
end
|
|
26
25
|
|
|
@@ -48,17 +47,18 @@ module ZhongwenTools
|
|
|
48
47
|
|
|
49
48
|
private
|
|
50
49
|
# Conversion data and algorithm shamelessly stolen from chinese_convt gem.
|
|
50
|
+
# ( https://github.com/xxxooo/chinese_convt )
|
|
51
|
+
#
|
|
51
52
|
# There are two differences:
|
|
52
53
|
# + Zhongwen Tools loads the conversion data into memory and
|
|
53
|
-
# chinese_convt reads the file every time. As a result,
|
|
54
|
+
# chinese_convt reads the file every time it converts. As a result,
|
|
54
55
|
# Zhongwen Tools is ~12X faster.
|
|
55
56
|
# + Zhongwen Tools uses Ruby's nifty str[/regex/] = replacement
|
|
56
57
|
# instead of indices. Conversion tests using indices fail with Ruby 1.8.
|
|
57
|
-
# ( https://github.com/xxxooo/chinese_convt )
|
|
58
58
|
def load_table
|
|
59
59
|
filename = File.expand_path('../conversion/conversion_data', __FILE__)
|
|
60
60
|
File.open(filename).read.split("\n&\n").each do |group|
|
|
61
|
-
|
|
61
|
+
ZH_CONVERSION_TABLE << group.split("\n").map do |type|
|
|
62
62
|
Hash[ type.split(',').map{ |term| term.split(':') } ]
|
|
63
63
|
end
|
|
64
64
|
end
|
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
#encoding: utf-8
|
|
1
|
+
# encoding: utf-8
|
|
2
2
|
require File.expand_path("../numbers", __FILE__)
|
|
3
3
|
|
|
4
4
|
module ZhongwenTools
|
|
5
5
|
module Integer
|
|
6
6
|
include ZhongwenTools::Numbers
|
|
7
|
+
extend self
|
|
7
8
|
|
|
8
9
|
def to_zh(type = nil)
|
|
9
10
|
type == :zht ? self.to_zht? : self.to_zhs
|
|
@@ -23,19 +24,5 @@ module ZhongwenTools
|
|
|
23
24
|
int ||= self
|
|
24
25
|
number_to_pyn int.to_s, :num
|
|
25
26
|
end
|
|
26
|
-
|
|
27
|
-
class Basement
|
|
28
|
-
include ZhongwenTools::Integer
|
|
29
|
-
end
|
|
30
|
-
|
|
31
|
-
def self.to_zhs(*args)
|
|
32
|
-
Basement.new.to_zhs(*args)
|
|
33
|
-
end
|
|
34
|
-
def self.to_zht(*args)
|
|
35
|
-
Basement.new.to_zht(*args)
|
|
36
|
-
end
|
|
37
|
-
def self.to_pyn(*args)
|
|
38
|
-
Basement.new.to_pyn(*args)
|
|
39
|
-
end
|
|
40
27
|
end
|
|
41
28
|
end
|
|
@@ -1,10 +1,13 @@
|
|
|
1
|
-
#encoding: utf-8
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
require File.expand_path("../regex", __FILE__)
|
|
3
|
+
# TODO: more testing
|
|
2
4
|
module ZhongwenTools
|
|
3
5
|
module Numbers
|
|
4
6
|
extend self
|
|
5
7
|
|
|
6
8
|
NUMBER_MULTIPLES = '拾十百佰千仟仟万萬亿億'
|
|
7
|
-
|
|
9
|
+
# TODO: Add huge numbers.
|
|
10
|
+
# 垓 秭 穰 溝 澗 正 載 --> beyond 100,000,000!
|
|
8
11
|
NUMBERS_TABLE = [
|
|
9
12
|
{ :zhs => '零', :zht => '零', :num => 0, :pyn => 'ling2'},
|
|
10
13
|
{ :zhs => '〇', :zht => '〇', :num => 0, :pyn => 'ling2'},
|
|
@@ -35,31 +38,31 @@ module ZhongwenTools
|
|
|
35
38
|
{ :zhs => '廿', :zht => '廿', :num => 20, :pyn => ' nian4'},
|
|
36
39
|
{ :zhs => '百', :zht => '百', :num => 100, :pyn => 'bai2'},
|
|
37
40
|
{ :zhs => '佰', :zht => '佰', :num => 100, :pyn => 'bai2'},
|
|
38
|
-
{ :zhs => '千', :zht => '千', :num =>
|
|
39
|
-
{ :zhs => '仟', :zht => '仟', :num =>
|
|
40
|
-
{ :zhs => '万', :zht => '萬', :num =>
|
|
41
|
-
{ :zhs => '亿', :zht => '億', :num =>
|
|
41
|
+
{ :zhs => '千', :zht => '千', :num => 1_000, :pyn => 'qian2'},
|
|
42
|
+
{ :zhs => '仟', :zht => '仟', :num => 1_000, :pyn => 'qian2'},
|
|
43
|
+
{ :zhs => '万', :zht => '萬', :num => 10_000, :pyn => 'wan4'},
|
|
44
|
+
{ :zhs => '亿', :zht => '億', :num => 100_000_000, :pyn => 'yi4'},
|
|
42
45
|
]
|
|
43
46
|
|
|
44
47
|
def number? word
|
|
45
|
-
|
|
46
|
-
"#{word}".gsub(/([\d]|[一二三四五六七八九十百千萬万億亿]){2,}/,'') == ''
|
|
48
|
+
"#{word}".gsub(/([\d]|#{ZhongwenTools::Regex.zh_numbers}){1,}/,'') == ''
|
|
47
49
|
end
|
|
48
50
|
|
|
49
51
|
def zh_number_to_number(zh_number)
|
|
50
52
|
zh_number = zh_number.to_s
|
|
51
53
|
numbers = convert_date(zh_number)
|
|
52
54
|
|
|
53
|
-
#if it's a year, or an oddly formatted number
|
|
55
|
+
# if it's a year, or an oddly formatted number
|
|
54
56
|
return numbers.join('').to_i if zh_number[/[#{NUMBER_MULTIPLES}]/u].nil?
|
|
55
57
|
|
|
56
58
|
convert_numbers numbers
|
|
57
59
|
end
|
|
58
60
|
|
|
59
|
-
#these should also be able to convert numbers to chinese numbers
|
|
61
|
+
# these should also be able to convert numbers to chinese numbers
|
|
60
62
|
def number_to_zhs type, number
|
|
61
63
|
convert_number_to :zhs, type.to_sym, number
|
|
62
64
|
end
|
|
65
|
+
|
|
63
66
|
def number_to_zht type, number
|
|
64
67
|
convert_number_to :zht, type.to_sym, number
|
|
65
68
|
end
|
|
@@ -70,7 +73,7 @@ module ZhongwenTools
|
|
|
70
73
|
|
|
71
74
|
private
|
|
72
75
|
def convert_date(zh)
|
|
73
|
-
#if it's a year, or an oddly formatted number
|
|
76
|
+
# if it's a year, or an oddly formatted number
|
|
74
77
|
zh_numbers = ZhongwenTools::String.chars zh
|
|
75
78
|
numbers = [];
|
|
76
79
|
i = 0
|
|
@@ -78,7 +81,7 @@ module ZhongwenTools
|
|
|
78
81
|
while( i < zh_numbers.length)
|
|
79
82
|
curr_number = zh_numbers[i]
|
|
80
83
|
|
|
81
|
-
#x[:num] == curr_number.to_i is a kludge; any string will == 0
|
|
84
|
+
# x[:num] == curr_number.to_i is a kludge; any string will == 0
|
|
82
85
|
num = convert(curr_number)[:num]
|
|
83
86
|
numbers << num
|
|
84
87
|
i += 1
|
|
@@ -125,7 +128,7 @@ module ZhongwenTools
|
|
|
125
128
|
|
|
126
129
|
|
|
127
130
|
def is_number_multiplier?(number)
|
|
128
|
-
[10,100,
|
|
131
|
+
[10,100,1_000,10_000,100_000_000].include? number
|
|
129
132
|
end
|
|
130
133
|
|
|
131
134
|
|
|
@@ -144,7 +147,7 @@ module ZhongwenTools
|
|
|
144
147
|
end
|
|
145
148
|
|
|
146
149
|
def convert_from_num number, to
|
|
147
|
-
#TODO: this will fail for numbers over 1 billion. grr.
|
|
150
|
+
# TODO: this will fail for numbers over 1 billion. grr.
|
|
148
151
|
str = number.to_s
|
|
149
152
|
len = str.length
|
|
150
153
|
converted_number = []
|
|
@@ -157,8 +160,8 @@ module ZhongwenTools
|
|
|
157
160
|
converted_number << _find_number(num, to) unless num == 0
|
|
158
161
|
else
|
|
159
162
|
converted_number << _find_wan_level(i, to)
|
|
160
|
-
#checks the wan level and ...
|
|
161
|
-
converted_number << _find_number(num, to) if (num == 1 && (10**(i) /
|
|
163
|
+
# checks the wan level and ...
|
|
164
|
+
converted_number << _find_number(num, to) if (num == 1 && (10**(i) / 10_000 ** wan) != 10) || num != 1
|
|
162
165
|
end
|
|
163
166
|
end
|
|
164
167
|
|
|
@@ -174,14 +177,14 @@ module ZhongwenTools
|
|
|
174
177
|
converted_number = convert_from_zh number, to
|
|
175
178
|
end
|
|
176
179
|
|
|
177
|
-
#liang rules are tough...
|
|
180
|
+
# FIXME: liang rules are tough...
|
|
178
181
|
converted_number.join(separator).gsub(/零[#{NUMBER_MULTIPLES}]/u,'')#.gsub(/二([百佰千仟仟万萬亿億])/){"#{NUMBERS_TABLE.find{|x|x[:pyn] == 'liang3'}[to]}#{$1}"}
|
|
179
182
|
end
|
|
180
183
|
|
|
181
184
|
private
|
|
182
185
|
|
|
183
186
|
def _find_wan_level(i, to)
|
|
184
|
-
_find_number((10**(i)), to) || _find_number((10**(i) /
|
|
187
|
+
_find_number((10**(i)), to) || _find_number((10**(i) / 10_000), to) || _find_number((10**(i) / 10_000**2), to)
|
|
185
188
|
end
|
|
186
189
|
|
|
187
190
|
def _find_number(num, to)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
module ZhongwenTools
|
|
3
|
+
module Regex
|
|
4
|
+
extend self
|
|
5
|
+
|
|
6
|
+
def pyn
|
|
7
|
+
# NOTE: might not need / want the space on the end.
|
|
8
|
+
/(#{pyn_regexes.values.join('|')})([1-5])?([\s\-]+)?/
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def py
|
|
12
|
+
# NOTE: might not need / want the space on the end.
|
|
13
|
+
/(#{pyn_regexes.map{|k,v| v.to_s[7..-2].gsub_with_hash(/[aeiouv]/,py_tones)}.join('|')}(\s\-))/
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def fullwidth
|
|
17
|
+
/[0-9A-Za-z%.:#$&+-/\=;<>]/
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def capital_letters
|
|
21
|
+
/(#{Regexp.union(ZhongwenTools::UNICODE_CAPS.keys)})/
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def lowercase_letters
|
|
25
|
+
/(#{Regexp.union(ZhongwenTools::UNICODE_CAPS.values)})/
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def zh
|
|
29
|
+
/[\u2E80-\u2E99]|[\u2E9B-\u2EF3]|[\u2F00-\u2FD5]|[\u3005|\u3007]|[\u3021-\u3029]|[\u3038-\u303B]|[\u3400-\u4DB5]|[\u4E00-\u9FCC]|[\uF900-\uFA6D]|[\uFA70-\uFAD9]/
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def punc
|
|
33
|
+
/[\u0021-\u0023]|[\u0025-\u002A]|[\u002C-\u002F]|[\u003A\u003B\u003F\u0040]|[\u005B-\u005D\u005F\u007B\u007D\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387]/
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def zh_punc
|
|
37
|
+
# TODO: includes non-zh punctuation codes. Should only include punctuation in CJK ranges.
|
|
38
|
+
/[\u2E00-\u2E2E]|[\u2E30-\u2E3B]|[\u3001-\u3003]|[\u3008-\u3011]|[\u3014-\u301F]|[\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF]|[\uA60D-\uA60F]|[\uA673\uA67E]|[\uA6F2-\uA6F7]|[\uA874-\uA877]|[\uA8CE\uA8CF]|[\uA8F8-\uA8FA]|[\uA92E\uA92F\uA95F]|[\uA9C1-\uA9CD]|[\uA9DE\uA9DF]|[\uAA5C-\uAA5F]|[\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F]|[\uFE10-\uFE19]|[\uFE30-\uFE52]|[\uFE54-\uFE61]|[\uFE63\uFE68\uFE6A\uFE6B]|[\uFF01-\uFF03]|[\uFF05-\uFF0A]|[\uFF0C-\uFF0F]|[\uFF1A\uFF1B\uFF1F\uFF20]|[\uFF3B-\uFF3D]|[\uFF3F\uFF5B\uFF5D]|[\uFF5F-\uFF65]/
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def zh_numbers
|
|
42
|
+
# TODO: include numbers like yotta, etc.
|
|
43
|
+
# 垓 秭 穰 溝 澗 正 載 --> beyond 100,000,000!
|
|
44
|
+
/[〇零一壹幺二贰貳两兩三弎叁參四肆䦉五伍六陆陸七柒八捌九玖十拾廿百佰千仟万萬亿億]/
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# Public: A Regex for bopomofo, a.k.a. Zhuyin Fuhao 注音符号.
|
|
48
|
+
#
|
|
49
|
+
# Examples
|
|
50
|
+
#
|
|
51
|
+
#
|
|
52
|
+
# bopomofo #=> <Regex>
|
|
53
|
+
#
|
|
54
|
+
# Returns a Regex.
|
|
55
|
+
def bopomofo
|
|
56
|
+
/[ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩ]/
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
private
|
|
60
|
+
def pyn_regexes
|
|
61
|
+
# http://stackoverflow.com/questions/20736291/regex-for-matching-pinyin
|
|
62
|
+
# https://www.debuggex.com/r/_9kbxA6f00gIGiVo
|
|
63
|
+
# NOTE: you might need to change the order of these regexes for more accurate matching of some pinyin.
|
|
64
|
+
{
|
|
65
|
+
:nl_regex => /([nN]eng?|[lnLN](a(i|ng?|o)?|e(i|ng)?|i(ang|a[on]?|e|ng?|u)?|o(ng?|u)|u(o|i|an?|n)?|ve?))/,
|
|
66
|
+
:bpm_regex => /([mM]iu|[pmPM]ou|[bpmBPM](o|e(i|ng?)?|a(ng?|i|o)?|i(e|ng?|a[no])?|u))/,
|
|
67
|
+
:f_regex => /([fF](ou?|[ae](ng?|i)?|u))/,
|
|
68
|
+
:dt_regex => /([dD](e(i|ng?)|i(a[on]?|u))|[dtDT](a(i|ng?|o)?|e(i|ng)?|i(a[on]?|e|ng|u)?|o(ng?|u)|u(o|i|an?|n)?))/,
|
|
69
|
+
:gkh_regex => /([ghkGHK](a(i|ng?|o)?|e(i|ng?)?|o(u|ng)|u(a(i|ng?)?|i|n|o)?))/,
|
|
70
|
+
:zczhch_regex => /([zZ]h?ei|[czCZ]h?(e(ng?)?|o(ng?|u)?|ao|u?a(i|ng?)?|u?(o|i|n)?))/,
|
|
71
|
+
:ssh_regex => /([sS]ong|[sS]hua(i|ng?)?|[sS]hei|[sS][h]?(a(i|ng?|o)?|en?g?|ou|u(a?n|o|i)?|i))/,
|
|
72
|
+
:r_regex => /([rR]([ae]ng?|i|e|ao|ou|ong|u[oin]|ua?n?))/,
|
|
73
|
+
:jqx_regex => /([jqxJQX](i(a(o|ng?)?|[eu]|ong|ng?)?|u(e|a?n)?))/,
|
|
74
|
+
:aeo_regex => /(([aA](i|o|ng?)?|[oO]u?|[eE](i|ng?|r)?))/,
|
|
75
|
+
:w_regex => /([wW](a(i|ng?)?|o|e(i|ng?)?|u))/,
|
|
76
|
+
:y_regex => /[yY](a(o|ng?)?|e|in?g?|o(u|ng)?|u(e|a?n)?)/
|
|
77
|
+
}
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
def py_tones
|
|
81
|
+
py_tones = {
|
|
82
|
+
'a' => '[āáǎàa]',
|
|
83
|
+
'e' => '[ēéěèe]',
|
|
84
|
+
'i' => '[īíǐìi]',
|
|
85
|
+
'o' => '[ōóǒòo]',
|
|
86
|
+
'u' => '[ūúǔùu]',
|
|
87
|
+
'v' => '[ǖǘǚǜü]'
|
|
88
|
+
#([ĀÁǍÀA][io]?|[io]?|[][āáǎàaēéěèeūúǔùu]?o?|[ĒÉĚÈE]i?|[]i?|[ŌÓǑÒO]u?|[]u?|u[āáǎàaēoēéěèe]?i?|[]e?)(n?g?r?)){1,}
|
|
89
|
+
}
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
require File.expand_path("../regex/ruby18", __FILE__) if RUBY_VERSION < '1.9'
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
#encoding: utf-8
|
|
1
|
+
# encoding: utf-8
|
|
2
2
|
module ZhongwenTools
|
|
3
3
|
module Romanization
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
# TODO: remove excess values, i.e. keys whose value == :py
|
|
5
|
+
ROMANIZATIONS_TABLE = [{:zyfh => " ㄚ", :wg => "a", :mps2 => "a", :yale => "a", :typy => "a", :py => "a"},
|
|
6
6
|
{ :zyfh => "ㄞ", :wg => "ai", :mps2 => "ai", :yale => "ai", :typy => "ai", :py => "ai"},
|
|
7
7
|
{ :zyfh => "ㄢ", :wg => "an", :mps2 => "an", :yale => "an", :typy => "an", :py => "an"},
|
|
8
8
|
{ :zyfh => "ㄤ", :wg => "ang", :mps2 => "ang", :yale => "ang", :typy => "ang", :py => "ang"},
|
|
@@ -1,49 +1,13 @@
|
|
|
1
1
|
# encoding: utf-8
|
|
2
|
+
require File.expand_path("../../regex", __FILE__)
|
|
3
|
+
|
|
2
4
|
module ZhongwenTools
|
|
3
5
|
module Romanization
|
|
6
|
+
# Deprecated: a Regex for accurate pinyin. Use ZhongwenTools::Regex.py instead
|
|
7
|
+
PY_REGEX = ZhongwenTools::Regex.py
|
|
4
8
|
|
|
5
|
-
#
|
|
6
|
-
|
|
7
|
-
:bpm_regex => /(miu|[pm]ou|[bpm](o|e(i|ng?)?|a(ng?|i|o)?|i(e|ng?|a[no])?|u))/,
|
|
8
|
-
:f_regex => /(f(ou?|[ae](ng?|i)?|u))/,
|
|
9
|
-
:dt_regex => /(d(e(i|ng?)|i(a[on]?|u))|[dt](a(i|ng?|o)?|e(i|ng)?|i(a[on]?|e|ng|u)?|o(ng?|u)|u(o|i|an?|n)?))/,
|
|
10
|
-
:nl_regex => /(neng?|[ln](a(i|ng?|o)?|e(i|ng)?|i(ang|a[on]?|e|ng?|u)?|o(ng?|u)|u(o|i|an?|n)?|ve?))/,
|
|
11
|
-
:gkh_regex => /([ghk](a(i|ng?|o)?|e(i|ng?)?|o(u|ng)|u(a(i|ng?)?|i|n|o)?))/,
|
|
12
|
-
:zczhch_regex => /(z[h]?ei|[cz]hua(i|ng?)?|[cz][h]?(a(i|ng?|o)?|en?g?|o(u|ng)?|u(a?n|o|i)?))/,
|
|
13
|
-
:ssh_regex => /(song|shua(i|ng?)?|shei|s[h]?(a(i|ng?|o)?|en?g?|ou|u(a?n|o|i)?))/,
|
|
14
|
-
:r_regex => /(r([ae]ng?|i|e|ao|ou|ong|u[oin]|ua?n?))/,
|
|
15
|
-
:jqx_regex => /([jqx](i(a(o|ng?)?|[eu]|ong|ng?)?|u(e|a?n)?))/,
|
|
16
|
-
:aw_regex => /(wu|w?(a(i|o|ng?)?|ou?|e(i|ng?)?))/,
|
|
17
|
-
:y_regex => /y(a(o|ng?)?|e|in?g?|o(u|ng)?|u(e|a?n)?)/
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
if RUBY_VERSION < '1.9'
|
|
22
|
-
py_tones = {
|
|
23
|
-
'a' => '(ā|á|ǎ|à|a)',
|
|
24
|
-
'e' => '(ē|é|ě|è|e)',
|
|
25
|
-
'i' => '(ī|í|ǐ|ì|i)',
|
|
26
|
-
'o' => '(ō|ó|ǒ|ò|o)',
|
|
27
|
-
'u' => '(ū|ú|ǔ|ù|u)',
|
|
28
|
-
'v' => '(ǖ|ǘ|ǚ|ǜ|ü)'
|
|
29
|
-
}
|
|
30
|
-
# might not need the space on the end.
|
|
31
|
-
|
|
32
|
-
PY_REGEX = /(#{pyn_regexes.map{|k,v| v.to_s[7..-2].gsub_with_hash(/[aeiouv]/,py_tones)}.join('|')}(\s\-))/
|
|
33
|
-
else
|
|
34
|
-
py_tones = {
|
|
35
|
-
'a' => '[āáǎàa]',
|
|
36
|
-
'e' => '[ēéěèe]',
|
|
37
|
-
'i' => '[īíǐìi]',
|
|
38
|
-
'o' => '[ōóǒòo]',
|
|
39
|
-
'u' => '[ūúǔùu]',
|
|
40
|
-
'v' => '[ǖǘǚǜü]'
|
|
41
|
-
#([ĀÁǍÀA][io]?|[io]?|[][āáǎàaēéěèeūúǔùu]?o?|[ĒÉĚÈE]i?|[]i?|[ŌÓǑÒO]u?|[]u?|u[āáǎàaēoēéěèe]?i?|[]e?)(n?g?r?)){1,}
|
|
42
|
-
}
|
|
43
|
-
PY_REGEX = /(#{pyn_regexes.map{|k,v| v.to_s[7..-2].gsub(/[aeiouv]/,py_tones)}.join('|')}(\s\-))/
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
PINYIN_REGEX = /(#{pyn_regexes.values.join('|')})([1-5])?([\s\-]+)?/
|
|
9
|
+
# Deprecated: a Regex for accurate pinyin with numbers. Use ZhongwenTools::Regex.pyn instead.
|
|
10
|
+
PINYIN_REGEX = ZhongwenTools::Regex.pyn
|
|
47
11
|
|
|
48
12
|
# Public: checks if a string is pinyin.
|
|
49
13
|
#
|
|
@@ -55,7 +19,7 @@ module ZhongwenTools
|
|
|
55
19
|
def py?(str = nil)
|
|
56
20
|
str ||= self
|
|
57
21
|
|
|
58
|
-
str.gsub(
|
|
22
|
+
str.gsub(ZhongwenTools::Regex.py, '').strip == ''
|
|
59
23
|
end
|
|
60
24
|
|
|
61
25
|
# Public: checks if a string is pinyin.
|
|
@@ -68,24 +32,72 @@ module ZhongwenTools
|
|
|
68
32
|
def pyn?(str = nil)
|
|
69
33
|
str ||= self
|
|
70
34
|
|
|
71
|
-
str.gsub(
|
|
35
|
+
str.gsub(ZhongwenTools::Regex.pyn, '').strip == ''
|
|
72
36
|
end
|
|
73
37
|
|
|
74
|
-
# Public:
|
|
38
|
+
# Public: Checks if a string is wade-giles.
|
|
75
39
|
#
|
|
76
40
|
# Examples
|
|
77
41
|
# wg?('pin1-yin1')
|
|
78
42
|
# # => false
|
|
79
|
-
#
|
|
43
|
+
#
|
|
44
|
+
# Returns a Boolean.
|
|
80
45
|
def wg?(str = nil, type = :pyn)
|
|
81
|
-
#
|
|
46
|
+
# NOTE: There are some situations where wg == pyn, but there's no way to differentiate the two.
|
|
47
|
+
# FIXME: it shouldn't be pyn, but it should be able to convert to pyn
|
|
48
|
+
# Actually, wade-giles does sometimes overlap with pyn. So this
|
|
49
|
+
# method creates false negatives. A future :romanization method
|
|
50
|
+
# would default to pyn, but this method shouldn't.
|
|
51
|
+
# Add tests where str.pyn? and str.wg?
|
|
52
|
+
|
|
82
53
|
str ||= self
|
|
83
|
-
#easy ones.. is it py? pyn? zyfh? gyrm?
|
|
84
|
-
#harder ones: is it typy, msp2, yale, wg
|
|
85
54
|
wg = ZhongwenTools::Romanization.to_wade_giles(str, type)
|
|
86
55
|
# TODO: need to convert string to pyn.
|
|
87
56
|
pyn = str
|
|
88
57
|
wg != pyn && wg.gsub(/[1-5]/,'')
|
|
89
58
|
end
|
|
59
|
+
|
|
60
|
+
# Public: Checks if a String is Zhuyin Fuhao (a.k.a. bopomofo).
|
|
61
|
+
#
|
|
62
|
+
# str - a String. Optional if the object calling the method is a String.
|
|
63
|
+
#
|
|
64
|
+
# Examples
|
|
65
|
+
#
|
|
66
|
+
# zyfh?('ㄊㄥ')
|
|
67
|
+
# # => true
|
|
68
|
+
#
|
|
69
|
+
# Returns a boolean.
|
|
70
|
+
def zyfh?(str = nil)
|
|
71
|
+
str ||= self
|
|
72
|
+
|
|
73
|
+
bopomofo = str.gsub(/[1-5\s]/,'')
|
|
74
|
+
bopomofo.scan(ZhongwenTools::Regex.bopomofo).join == bopomofo
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
# Public: Checks if a String is Tongyong Pinyin.
|
|
78
|
+
# http://en.wikipedia.org/wiki/Tongyong_Pinyin
|
|
79
|
+
# http://pinyin.info/romanization/tongyong/
|
|
80
|
+
#
|
|
81
|
+
# str - a String. Optional if the object calling the method is a String.
|
|
82
|
+
#
|
|
83
|
+
# Examples
|
|
84
|
+
#
|
|
85
|
+
# typy?('chuei niou')
|
|
86
|
+
# # => true
|
|
87
|
+
#
|
|
88
|
+
# Returns a boolean.
|
|
89
|
+
def typy?(str = nil)
|
|
90
|
+
str ||= self
|
|
91
|
+
|
|
92
|
+
typy = str.gsub(/[1-5\s\-']/,'')
|
|
93
|
+
# Sorting by String length means it will match the longest possible part.
|
|
94
|
+
# FIXME: it is probably possible for this to have false negatives.
|
|
95
|
+
# A more comprehensive regex like Regex.pyn would be needed
|
|
96
|
+
# to accurately detect typy.
|
|
97
|
+
regex_str = ROMANIZATIONS_TABLE.map{ |r| r[:typy] || r[:py] }.sort{|x,y| x.size <=> y.size}.reverse.join('|')
|
|
98
|
+
typy.scan(/#{regex_str}/).join == typy
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
# TODO: mps2? yale? wgyrm? romanization?
|
|
90
102
|
end
|
|
91
103
|
end
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
#encoding: utf-8
|
|
2
|
-
|
|
3
|
-
#
|
|
4
|
-
#
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
# NOTE: This table works for pyn -> pinyin conversion, but it introduces
|
|
4
|
+
# mistakes when converting pinyin to pyn. In practice, pinyin can't
|
|
5
|
+
# be converted to pyn properly unless it's properly formatted.
|
|
5
6
|
module ZhongwenTools
|
|
6
7
|
module Romanization
|
|
7
8
|
PYN_PY = {
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
module ZhongwenTools
|
|
3
|
+
module Romanization
|
|
4
|
+
|
|
5
|
+
# Public: splits pinyin number strings.
|
|
6
|
+
#
|
|
7
|
+
# str - a String to be split
|
|
8
|
+
#
|
|
9
|
+
# Examples
|
|
10
|
+
#
|
|
11
|
+
#
|
|
12
|
+
# split_pyn('zhong1guo2')
|
|
13
|
+
# # => ['zhong1', 'guo2']
|
|
14
|
+
#
|
|
15
|
+
# Returns an Array of Strings.
|
|
16
|
+
def split_pyn(str = nil)
|
|
17
|
+
str ||= self
|
|
18
|
+
puts "WARNING: string is not valid pinyin-num format. #{str}" unless str.pyn?
|
|
19
|
+
|
|
20
|
+
str.scan(/(#{ZhongwenTools::Regex.pyn})/).map{ |arr| arr[0].strip.gsub('-','') }.flatten
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# encoding: utf-8
|
|
2
2
|
require File.expand_path("../romanization/conversion_table", __FILE__)
|
|
3
3
|
require File.expand_path("../romanization/detect", __FILE__)
|
|
4
|
+
require File.expand_path("../romanization/string", __FILE__)
|
|
4
5
|
require File.expand_path("../romanization/pyn_to_py", __FILE__)
|
|
5
6
|
|
|
6
7
|
module ZhongwenTools
|
|
@@ -66,6 +67,7 @@ module ZhongwenTools
|
|
|
66
67
|
#
|
|
67
68
|
# Returns a string with actual pinyin
|
|
68
69
|
def _to_pinyin str
|
|
70
|
+
# TODO: move regex to ZhongwenTools::Regex
|
|
69
71
|
regex = /(([BPMFDTNLGKHZCSRJQXWYbpmfdtnlgkhzcsrjqxwy]?[h]?)(A[io]?|a[io]?|i[aeu]?o?|Ei?|ei?|Ou?|ou?|u[aoe]?i?|ve?)?(n?g?)(r?)([1-5])(\-+)?)/
|
|
70
72
|
|
|
71
73
|
# doing the substitution in a block is ~8x faster than using scan and each.
|
|
@@ -107,7 +109,7 @@ module ZhongwenTools
|
|
|
107
109
|
|
|
108
110
|
def _replacement(token, from = nil)
|
|
109
111
|
token = token.downcase.gsub(/[1-5].*/,'')
|
|
110
|
-
|
|
112
|
+
ROMANIZATIONS_TABLE.find do |x|
|
|
111
113
|
if from.nil?
|
|
112
114
|
x.values.include?(token)
|
|
113
115
|
else
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
#encoding: utf-8
|
|
1
|
+
# encoding: utf-8
|
|
2
2
|
|
|
3
3
|
module ZhongwenTools
|
|
4
4
|
UNICODE_CAPS = {
|
|
@@ -18,10 +18,10 @@ module ZhongwenTools
|
|
|
18
18
|
'Ó' => 'ó',
|
|
19
19
|
'Ǒ' => 'ǒ',
|
|
20
20
|
'Ò' => 'ò',
|
|
21
|
-
'Ǖ' => 'ǖ'
|
|
22
|
-
'Ǘ' => 'ǘ'
|
|
23
|
-
'Ǚ' => 'ǚ'
|
|
24
|
-
'Ǜ' => 'ǜ'
|
|
21
|
+
'Ǖ' => 'ǖ', # using combining diatrical marks
|
|
22
|
+
'Ǘ' => 'ǘ', # using combining diatrical marks
|
|
23
|
+
'Ǚ' => 'ǚ', # using combining diatrical marks
|
|
24
|
+
'Ǜ' => 'ǜ', # using combining diatrical marks
|
|
25
25
|
'Ū' => 'ū',
|
|
26
26
|
'Ú' => 'ú',
|
|
27
27
|
'Ǔ' => 'ǔ',
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
#encoding: utf-8
|
|
1
|
+
# encoding: utf-8
|
|
2
2
|
|
|
3
3
|
class String
|
|
4
4
|
define_method(:chars) do
|
|
@@ -22,74 +22,75 @@ end
|
|
|
22
22
|
|
|
23
23
|
module ZhongwenTools
|
|
24
24
|
module String
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
text
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
def convert_regex(regex)
|
|
40
|
-
str = regex.to_s
|
|
41
|
-
regex.to_s.scan(/u[0-9A-Z]{4}/).each{|cp| str = str.sub('\\' + cp,cp.from_codepoint)}
|
|
42
|
-
/#{str}/
|
|
25
|
+
# TODO: replace deprecated constant UNICODE_REGEX.
|
|
26
|
+
end
|
|
27
|
+
def to_utf8(encoding = nil, encodings = nil)
|
|
28
|
+
# FIXME: should substitute out known bad actors like space
|
|
29
|
+
encodings = ['utf-8', 'GB18030', 'BIG5', 'GBK', 'GB2312'] if encodings.nil?
|
|
30
|
+
encodings = encoding + encodings unless encoding.nil?
|
|
31
|
+
raise 'Unable to Convert' if encodings.size == 0
|
|
32
|
+
|
|
33
|
+
begin
|
|
34
|
+
text = Iconv.conv('utf-8', encodings[0], self)
|
|
35
|
+
rescue
|
|
36
|
+
text = self.to_utf8(nil, encodings[1..-1])
|
|
43
37
|
end
|
|
38
|
+
text
|
|
39
|
+
end
|
|
44
40
|
|
|
45
|
-
|
|
46
|
-
|
|
41
|
+
def convert_regex(regex)
|
|
42
|
+
str = regex.to_s
|
|
43
|
+
regex.to_s.scan(/u[0-9A-Z]{4}/).each{|cp| str = str.sub('\\' + cp,cp.from_codepoint)}
|
|
44
|
+
/#{str}/
|
|
45
|
+
end
|
|
47
46
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
:punc => self.convert_regex(UNICODE_REGEX[:punc])
|
|
51
|
-
}
|
|
52
|
-
#str.scan(/#{regex[:zh]}|#{regex[:punc]}|\s/).join == str
|
|
53
|
-
!self.fullwidth?(str) && (!str[regex[:zh]].nil? || !str[regex[:punc]].nil?)
|
|
54
|
-
end
|
|
47
|
+
def has_zh?(str = nil)
|
|
48
|
+
str ||= self
|
|
55
49
|
|
|
56
|
-
|
|
57
|
-
|
|
50
|
+
regex = {
|
|
51
|
+
:zh => self.convert_regex(UNICODE_REGEX[:zh]),
|
|
52
|
+
:punc => self.convert_regex(UNICODE_REGEX[:punc])
|
|
53
|
+
}
|
|
54
|
+
# str.scan(/#{regex[:zh]}|#{regex[:punc]}|\s/).join == str
|
|
55
|
+
!self.fullwidth?(str) && (!str[regex[:zh]].nil? || !str[regex[:punc]].nil?)
|
|
56
|
+
end
|
|
58
57
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
:punc => self.convert_regex(UNICODE_REGEX[:punc])
|
|
62
|
-
}
|
|
58
|
+
def zh?(str = nil)
|
|
59
|
+
str ||= self
|
|
63
60
|
|
|
64
|
-
|
|
65
|
-
|
|
61
|
+
regex = {
|
|
62
|
+
:zh => self.convert_regex(UNICODE_REGEX[:zh]),
|
|
63
|
+
:punc => self.convert_regex(UNICODE_REGEX[:punc])
|
|
64
|
+
}
|
|
66
65
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
regex = {
|
|
70
|
-
:zh => self.convert_regex(UNICODE_REGEX[:zh]),
|
|
71
|
-
:punc => self.convert_regex(UNICODE_REGEX[:punc])
|
|
72
|
-
}
|
|
66
|
+
!str.fullwidth? && (str.scan(/(#{regex[:zh]}+|#{regex[:punc]}+|\s+)/).join == str)
|
|
67
|
+
end
|
|
73
68
|
|
|
74
|
-
|
|
75
|
-
|
|
69
|
+
def has_zh_punctuation?(str = nil)
|
|
70
|
+
str ||= self
|
|
71
|
+
regex = {
|
|
72
|
+
:zh => self.convert_regex(UNICODE_REGEX[:zh]),
|
|
73
|
+
:punc => self.convert_regex(UNICODE_REGEX[:punc])
|
|
74
|
+
}
|
|
76
75
|
|
|
77
|
-
|
|
78
|
-
|
|
76
|
+
!str[regex[:punc]].nil?
|
|
77
|
+
end
|
|
79
78
|
|
|
80
|
-
|
|
81
|
-
|
|
79
|
+
def strip_zh_punctuation(str = nil)
|
|
80
|
+
str ||= self
|
|
82
81
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
matches = str.scan(/([0-9A-Za-z%.:#$&+-/\=;<>])/u).uniq.flatten
|
|
82
|
+
str.gsub(self.convert_regex(UNICODE_REGEX[:punc]), '')
|
|
83
|
+
end
|
|
86
84
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
end
|
|
85
|
+
def to_halfwidth(str = nil)
|
|
86
|
+
str ||= self
|
|
87
|
+
matches = str.scan(/([0-9A-Za-z%.:#$&+-/\=;<>])/u).uniq.flatten
|
|
91
88
|
|
|
92
|
-
|
|
89
|
+
matches.each do |match|
|
|
90
|
+
replacement = FW_HW[match]
|
|
91
|
+
str = str.gsub(match, replacement)
|
|
93
92
|
end
|
|
93
|
+
|
|
94
|
+
str
|
|
94
95
|
end
|
|
95
96
|
end
|
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
# encoding: utf-8
|
|
2
2
|
#$:.unshift File.join(File.dirname(__FILE__),'..','lib','zhongwen_tools', 'string')
|
|
3
3
|
require 'uri'
|
|
4
|
+
require File.expand_path("../regex", __FILE__)
|
|
4
5
|
require File.expand_path("../string/fullwidth", __FILE__)
|
|
5
6
|
require File.expand_path("../string/caps", __FILE__)
|
|
6
7
|
|
|
7
8
|
class String
|
|
8
9
|
alias_method :_downcase, :downcase
|
|
9
10
|
alias_method :_upcase, :upcase
|
|
11
|
+
alias_method :gsub_with_hash, :gsub
|
|
10
12
|
|
|
11
13
|
def downcase
|
|
12
14
|
self._downcase.gsub(/(#{ZhongwenTools::UNICODE_CAPS.keys.join('|')})/){
|
|
@@ -35,10 +37,12 @@ module ZhongwenTools
|
|
|
35
37
|
module String
|
|
36
38
|
extend self
|
|
37
39
|
|
|
40
|
+
# Deprecated: a Hash of unicode Regexes. Use ZhongwenTools::Regex.zh instead
|
|
38
41
|
UNICODE_REGEX = {
|
|
39
|
-
:zh =>
|
|
40
|
-
:punc =>
|
|
42
|
+
:zh => Regex.zh,
|
|
43
|
+
:punc => Regex.zh_punc
|
|
41
44
|
}
|
|
45
|
+
|
|
42
46
|
def to_utf8(str = nil)
|
|
43
47
|
(str || self).force_encoding('utf-8')
|
|
44
48
|
#TODO: better conversion methods can be extracted from categories service
|
|
@@ -47,13 +51,13 @@ module ZhongwenTools
|
|
|
47
51
|
def has_zh?(str = nil)
|
|
48
52
|
str ||= self
|
|
49
53
|
|
|
50
|
-
!str[/(#{
|
|
54
|
+
!str[/(#{Regex.zh}|#{Regex.zh_punc})/].nil?
|
|
51
55
|
end
|
|
52
56
|
|
|
53
57
|
def zh?(str = nil)
|
|
54
58
|
str ||= self
|
|
55
59
|
|
|
56
|
-
str.scan(/(#{
|
|
60
|
+
str.scan(/(#{Regex.zh}+|#{Regex.zh_punc}+|\s+)/).join == str
|
|
57
61
|
end
|
|
58
62
|
|
|
59
63
|
def downcase(str = nil)
|
|
@@ -77,13 +81,13 @@ module ZhongwenTools
|
|
|
77
81
|
def has_zh_punctuation?(str = nil)
|
|
78
82
|
str ||= self
|
|
79
83
|
|
|
80
|
-
!str[
|
|
84
|
+
!str[Regex.zh_punc].nil?
|
|
81
85
|
end
|
|
82
86
|
|
|
83
87
|
def strip_zh_punctuation(str = nil)
|
|
84
88
|
str ||= self
|
|
85
89
|
|
|
86
|
-
str.gsub(
|
|
90
|
+
str.gsub(Regex.zh_punc, '')
|
|
87
91
|
end
|
|
88
92
|
|
|
89
93
|
def size(str = nil)
|
|
@@ -122,7 +126,7 @@ module ZhongwenTools
|
|
|
122
126
|
|
|
123
127
|
def halfwidth?(str = nil)
|
|
124
128
|
str ||= self
|
|
125
|
-
str[
|
|
129
|
+
str[Regex.fullwidth].nil?
|
|
126
130
|
end
|
|
127
131
|
|
|
128
132
|
def fullwidth?(str = nil)
|
|
@@ -133,7 +137,7 @@ module ZhongwenTools
|
|
|
133
137
|
def to_halfwidth(str = nil)
|
|
134
138
|
str ||= self
|
|
135
139
|
|
|
136
|
-
str.gsub(/(
|
|
140
|
+
str.gsub(/(#{Regex.fullwidth})/){ ZhongwenTools::FW_HW[$1] }
|
|
137
141
|
end
|
|
138
142
|
|
|
139
143
|
def to_codepoint(str = nil)
|
data/test/test_numbers.rb
CHANGED
|
@@ -41,14 +41,26 @@ class TestNumbers < Minitest::Test
|
|
|
41
41
|
pyn = self.number_to_pyn num
|
|
42
42
|
|
|
43
43
|
assert_equal 'yi1-bai2-san1-shi2-liu4', pyn
|
|
44
|
+
|
|
45
|
+
num = '一千五百四十二'
|
|
46
|
+
pyn = self.number_to_pyn num
|
|
47
|
+
assert_equal 'yi1-qian2-wu3-bai2-si4-shi2-er4', pyn
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def test_is_number
|
|
51
|
+
@numbers.map{ |n| n[:zh]}.each do |zh|
|
|
52
|
+
assert self.number? zh
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
assert self.number? '一'
|
|
44
56
|
end
|
|
45
57
|
|
|
46
58
|
def setup
|
|
47
59
|
@numbers = [
|
|
48
|
-
{:zh =>'一万两千七', :en =>
|
|
49
|
-
{:zh => '三千六十三', :en =>
|
|
60
|
+
{:zh =>'一万两千七', :en => 12_007},
|
|
61
|
+
{:zh => '三千六十三', :en => 3_063},
|
|
50
62
|
{:zh => '一百五十', :en => 150 },
|
|
51
|
-
{:zh => '三千亿', :en =>
|
|
63
|
+
{:zh => '三千亿', :en => 300_000_000_000},
|
|
52
64
|
{:zh => '一九六六', :en => 1966},
|
|
53
65
|
{:zh => '二零零八', :en => 2008},
|
|
54
66
|
]
|
data/test/test_romanization.rb
CHANGED
|
@@ -24,6 +24,9 @@ class TestRomanization < Minitest::Test
|
|
|
24
24
|
def test_pyn
|
|
25
25
|
assert_equal 'ni3 hao3', @py.to_pyn(:py)
|
|
26
26
|
assert_equal 'tian1an1men2', 'tian1an1men2'.to_py.to_pyn(:py)
|
|
27
|
+
|
|
28
|
+
#assert_equal 'Wūlúhānuòfū'.to_pyn, 'Wu1-lu2-ha1-nuo4-fu1'
|
|
29
|
+
#"007:Dàpò Liàngzǐ Wēijī", "007: Da4po4 Liang4zi3 Wei1ji1"
|
|
27
30
|
end
|
|
28
31
|
|
|
29
32
|
def test_zhuyin_fuhao
|
|
@@ -33,6 +36,7 @@ class TestRomanization < Minitest::Test
|
|
|
33
36
|
assert_equal 'ㄇㄠ2 ㄗㄜ2 ㄉㄨㄥ1', @mzd.to_zhuyin_fuhao
|
|
34
37
|
assert_equal 'ㄑㄧㄥ3 ㄏㄨㄟ2ㄉㄚ2 ㄨㄛ3 ㄉㄜ5 ㄨㄣ4ㄊㄧ2 .', @sent.to_zhuyin
|
|
35
38
|
assert_equal 'ㄇㄠ2 ㄗㄜ2ㄉㄨㄥ1', @mzd2.to_zhuyin_fuhao
|
|
39
|
+
assert 'ㄋㄧ3 ㄏㄠ3'.zyfh?
|
|
36
40
|
end
|
|
37
41
|
|
|
38
42
|
def test_wade_giles
|
|
@@ -48,11 +52,16 @@ class TestRomanization < Minitest::Test
|
|
|
48
52
|
#assert_equal '', @str.to_mspy2
|
|
49
53
|
#end
|
|
50
54
|
|
|
51
|
-
|
|
55
|
+
def test_typy
|
|
52
56
|
#skip
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
57
|
+
pyn = 'chui1 niu3'
|
|
58
|
+
typy = 'chuei1 niou3'
|
|
59
|
+
assert_equal typy, pyn.to_typy
|
|
60
|
+
# FIXME: to_typy doesn't work with non-spaced pinyin.
|
|
61
|
+
#assert_equal typy, typy.to_pyn(:typy)
|
|
62
|
+
assert typy.typy?
|
|
63
|
+
refute pyn.typy?
|
|
64
|
+
end
|
|
56
65
|
|
|
57
66
|
def test_yale
|
|
58
67
|
assert_equal 'ni3 hau3', @str.to_yale
|
|
@@ -68,7 +77,18 @@ class TestRomanization < Minitest::Test
|
|
|
68
77
|
refute @py.pyn?
|
|
69
78
|
|
|
70
79
|
assert 'chung1 kuo2'.wg?
|
|
71
|
-
|
|
80
|
+
|
|
81
|
+
# Travis CI is having trouble with this using Ruby 1.8.7, but it works locally.
|
|
82
|
+
# I'll probably end up dropping full 1.8.7 support.
|
|
83
|
+
assert @py.py?, "#{@py} should be pinyin. (#{@py.py?})" unless RUBY_VERSION < '1.9'
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def test_split_pyn
|
|
87
|
+
assert_equal 'zhong1guo2'.split_pyn, %w(zhong1 guo2)
|
|
88
|
+
assert_equal 'dong1xi'.split_pyn, %w(dong1 xi)
|
|
89
|
+
assert_equal 'zhongguo'.split_pyn, %w(zhong guo)
|
|
90
|
+
assert_equal 'dong1 xi1 '.split_pyn, %w(dong1 xi1)
|
|
91
|
+
assert_equal @mzd2.split_pyn, %w(Mao2 Ze2 dong1)
|
|
72
92
|
end
|
|
73
93
|
|
|
74
94
|
def setup
|
data/test/test_string.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: zhongwen_tools
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.9.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Steven Daniels
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2014-05-
|
|
11
|
+
date: 2014-05-22 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rake
|
|
@@ -137,10 +137,13 @@ files:
|
|
|
137
137
|
- lib/zhongwen_tools/conversion/string.rb
|
|
138
138
|
- lib/zhongwen_tools/integer.rb
|
|
139
139
|
- lib/zhongwen_tools/numbers.rb
|
|
140
|
+
- lib/zhongwen_tools/regex.rb
|
|
141
|
+
- lib/zhongwen_tools/regex/ruby18.rb
|
|
140
142
|
- lib/zhongwen_tools/romanization.rb
|
|
141
143
|
- lib/zhongwen_tools/romanization/conversion_table.rb
|
|
142
144
|
- lib/zhongwen_tools/romanization/detect.rb
|
|
143
145
|
- lib/zhongwen_tools/romanization/pyn_to_py.rb
|
|
146
|
+
- lib/zhongwen_tools/romanization/string.rb
|
|
144
147
|
- lib/zhongwen_tools/string.rb
|
|
145
148
|
- lib/zhongwen_tools/string/caps.rb
|
|
146
149
|
- lib/zhongwen_tools/string/fullwidth.rb
|
|
@@ -174,7 +177,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
174
177
|
version: '0'
|
|
175
178
|
requirements: []
|
|
176
179
|
rubyforge_project: zhongwen_tools
|
|
177
|
-
rubygems_version: 2.2.
|
|
180
|
+
rubygems_version: 2.2.2
|
|
178
181
|
signing_key:
|
|
179
182
|
specification_version: 4
|
|
180
183
|
summary: Zhongwen Tools provide romanization conversions and helper methods for Chinese.
|