zhongwen_tools 0.12.4 → 0.15.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +1 -1
  3. data/README.md +74 -165
  4. data/Rakefile +0 -1
  5. data/lib/zhongwen_tools/{string/caps.rb → caps.rb} +19 -1
  6. data/lib/zhongwen_tools/core.rb +19 -0
  7. data/lib/zhongwen_tools/core_ext/integer.rb +8 -0
  8. data/lib/zhongwen_tools/core_ext/string.rb +10 -0
  9. data/lib/zhongwen_tools/fullwidth.rb +102 -0
  10. data/lib/zhongwen_tools/integer_extension.rb +31 -0
  11. data/lib/zhongwen_tools/number/number_table.rb +44 -0
  12. data/lib/zhongwen_tools/number.rb +221 -0
  13. data/lib/zhongwen_tools/regex.rb +38 -22
  14. data/lib/zhongwen_tools/romanization/pinyin.rb +231 -0
  15. data/lib/zhongwen_tools/romanization/{pyn_to_py.rb → pinyin_table.rb} +2 -1
  16. data/lib/zhongwen_tools/romanization/romanization_table.rb +425 -0
  17. data/lib/zhongwen_tools/romanization.rb +199 -136
  18. data/lib/zhongwen_tools/{string/ruby19.rb → ruby_19.rb} +1 -2
  19. data/lib/zhongwen_tools/{conversion → script}/conversion_data +0 -0
  20. data/lib/zhongwen_tools/{conversion.rb → script.rb} +21 -34
  21. data/lib/zhongwen_tools/string_extension.rb +136 -0
  22. data/lib/zhongwen_tools/unicode.rb +25 -0
  23. data/lib/zhongwen_tools/uri.rb +14 -0
  24. data/lib/zhongwen_tools/version.rb +1 -1
  25. data/lib/zhongwen_tools/zhongwen.rb +29 -0
  26. data/lib/zhongwen_tools.rb +2 -3
  27. data/test/test_caps.rb +26 -0
  28. data/test/test_core.rb +13 -0
  29. data/test/test_fullwidth.rb +30 -0
  30. data/test/test_helper.rb +4 -12
  31. data/test/test_helpers/unload_zhongwen_tools_script.rb +5 -0
  32. data/test/test_integer_extension.rb +34 -0
  33. data/test/test_number.rb +79 -0
  34. data/test/test_pinyin.rb +68 -0
  35. data/test/test_regex.rb +41 -0
  36. data/test/test_romanization.rb +110 -133
  37. data/test/{test_conversion.rb → test_script.rb} +41 -44
  38. data/test/test_string_extension.rb +94 -0
  39. data/test/test_unicode.rb +27 -0
  40. data/test/test_uri.rb +16 -0
  41. data/test/test_zhongwen.rb +37 -0
  42. data/zhongwen_tools.gemspec +1 -1
  43. metadata +93 -52
  44. data/Gemfile.1.8.7 +0 -8
  45. data/lib/zhongwen_tools/conversion/string.rb +0 -19
  46. data/lib/zhongwen_tools/integer.rb +0 -28
  47. data/lib/zhongwen_tools/numbers.rb +0 -195
  48. data/lib/zhongwen_tools/regex/ruby18.rb +0 -15
  49. data/lib/zhongwen_tools/romanization/conversion_table.rb +0 -425
  50. data/lib/zhongwen_tools/romanization/detect.rb +0 -141
  51. data/lib/zhongwen_tools/romanization/string.rb +0 -36
  52. data/lib/zhongwen_tools/string/fullwidth.rb +0 -85
  53. data/lib/zhongwen_tools/string/ruby18.rb +0 -96
  54. data/lib/zhongwen_tools/string.rb +0 -164
  55. data/test/test_integer.rb +0 -31
  56. data/test/test_numbers.rb +0 -68
  57. data/test/test_string.rb +0 -133
@@ -0,0 +1,221 @@
1
+ # encoding: utf-8
2
+ require 'zhongwen_tools/regex'
3
+ require 'zhongwen_tools/zhongwen'
4
+ require 'zhongwen_tools/romanization/pinyin'
5
+ require 'zhongwen_tools/number/number_table'
6
+
7
+ # Number.to_pyn, to_i, to_zhs, etc.
8
module ZhongwenTools
  # Converts numbers between Ruby Integers (:i), Chinese numerals written in
  # simplified (:zhs) or traditional (:zht) script, and numbered pinyin (:pyn).
  #
  # Every conversion goes through NUMBERS_TABLE, whose rows are hashes that
  # are looked up by their :i, :zhs, :zht and :pyn values.
  #
  # Conventions used when writing numbers: zero digits are not written and a
  # 1 in the tens place is written as the bare unit, so 12_007 becomes the
  # tokens for [1, 10_000, 2, 1_000, 7] and 15 becomes [10, 5].
  module Number
    # Public: checks whether an object looks like a number.
    #
    # obj - An Integer, or a String made up only of ASCII digits and/or the
    #       characters matched by ZhongwenTools::Regex.zh_numbers.
    #
    # Returns true or false. Empty strings and any other kind of object are
    # not numbers.
    def self.number?(obj)
      case obj
      when String
        !(obj =~ /\A(?:\d|#{ZhongwenTools::Regex.zh_numbers})+\z/).nil?
      when Integer
        # Integer also covers Fixnum/Bignum on rubies that still have them.
        true
      else
        false
      end
    end

    # Public: Number.to_i, Number.to_zhs, Number.to_zht and Number.to_pyn.
    #
    # obj       - The String or Integer to convert.
    # from      - The source type (:i, :zhs, :zht or :pyn). Detected with
    #             number_type when omitted.
    # separator - Accepted and forwarded to convert, which currently
    #             ignores it.
    #
    # Returns an Integer for to_i and a String for the other targets.
    %w(i zhs zht pyn).each do |action|
      define_singleton_method("to_#{ action }") do |*args|
        obj, from, separator = args
        from ||= number_type(obj)

        convert(obj, action.to_sym, from.to_sym, separator)
      end
    end

    # Public: converts a number to Chinese numerals.
    #
    # obj  - The String or Integer to convert.
    # type - :zht for traditional characters; anything else gives simplified.
    # from - The source type, detected when nil.
    #
    # Returns a String.
    def self.to_zh(obj, type = :zhs, from = nil)
      type.to_sym == :zht ? to_zht(obj, from) : to_zhs(obj, from)
    end

    # NOTE: `private` does not affect methods defined with `def self.`, so
    # the helpers below stay callable from outside the module. They are
    # internal all the same.
    private

    # Internal: converts obj from one number type to another.
    #
    # obj       - A String or Integer.
    # to        - The target type: :zhs, :zht, :i or :pyn.
    # from      - The source type: :zhs, :zht, :i or :pyn.
    # separator - Unused; kept so existing callers keep working.
    #
    # Returns an Integer when to is :i, otherwise a String.
    # Raises ArgumentError for an unknown target or an unsupported obj.
    def self.convert(obj, to, from, separator = '')
      to = to.to_sym
      unless [:zhs, :zht, :i, :pyn].include?(to)
        fail ArgumentError, "unknown target type: #{to.inspect}"
      end
      unless obj.is_a?(String) || obj.is_a?(Integer)
        fail ArgumentError, "expected a String or Integer, got #{obj.class}"
      end

      tokens = convert_from(from, to, obj)

      case to
      when :i
        combine_integers(tokens)
      when :pyn
        # Drop "zero + unit" pairs, then tidy up the hyphens left behind.
        # NOTE(review): 'bai2' must match the pinyin NUMBERS_TABLE uses for
        # 100 (the usual reading is bai3) -- confirm against the table.
        zero_unit = /#{ %w(yi4 wan4 qian1 bai2 shi2).map{ |x| 'ling2\-' + x }.join('|')}/
        finalize_number(tokens, '-').gsub(zero_unit, '').gsub(/\-+/, '-').gsub(/\-$/, '')
      else
        finalize_number(tokens)
      end
    end

    # Internal: guesses the type of a number.
    #
    # Returns :i for Integers, :zht or :zhs for Chinese text and :pyn for
    # everything else (romanized input is assumed to be numbered pinyin).
    def self.number_type(obj)
      return :i if obj.is_a?(Integer)
      return :pyn unless ZhongwenTools::Zhongwen.zh?(obj)

      zht?(obj) ? :zht : :zhs
    end

    # Internal: true when str consists only of characters matched by
    # ZhongwenTools::Regex.zht_numbers (the empty string qualifies).
    def self.zht?(str)
      str[/#{ZhongwenTools::Regex.zht_numbers }*/] == str
    end

    # Internal: the opposite of zht?.
    def self.zhs?(str)
      !zht?(str)
    end

    # Internal: dispatches to the tokenizer for the source type.
    #
    # Returns an Array of tokens already translated to the `to` type.
    # Raises ArgumentError for an unknown source type.
    def self.convert_from(from, to, number)
      case from
      when :zhs, :zht
        convert_from_zh(to, number)
      when :i
        convert_from_integer(to, number)
      when :pyn
        convert_from_pyn(to, number)
      else
        fail ArgumentError, "unknown source type: #{from.inspect}"
      end
    end

    # Internal: splits numbered pinyin into syllables and translates each.
    def self.convert_from_pyn(to, pyn)
      ZhongwenTools::Romanization::Pinyin.split_pyn(pyn).map do |syllable|
        translate_token(syllable, to)
      end
    end

    # Internal: translates a Chinese numeral character by character.
    def self.convert_from_zh(to, number)
      number.chars.map { |zh| translate_token(zh, to) }
    end

    # Internal: translates one zh/pyn token to the `to` type. Tokens that
    # are not in NUMBERS_TABLE, or whose row lacks `to`, pass through as is.
    def self.translate_token(token, to)
      row = convert_number(token)
      row ? row.fetch(to) { token } : token
    end

    # Internal: combines integer tokens such as [1, 100, 2, 10, 3] into 123.
    #
    # Tokens that are all single digits are read as a digit string
    # ([2, 0, 1, 4] => 2014). Otherwise digits are multiplied by the unit
    # that follows them; a unit without a digit counts once ([10, 3] => 13)
    # and an explicit zero digit contributes nothing ([1, 100, 0, 5] => 105).
    #
    # Returns an Integer.
    # Raises ArgumentError when a token could not be translated to an Integer.
    def self.combine_integers(integers)
      unless integers.all? { |n| n.is_a?(Integer) }
        fail ArgumentError, "cannot convert #{integers.inspect} to an integer"
      end
      return combine_year(integers) if year?(integers)

      total = 0    # value closed off by 10_000 / 100_000_000 units
      section = 0  # value collected since the last large unit
      digit = nil  # digit waiting for its unit; nil when there is none

      integers.each do |n|
        if n < 10
          digit = n
        elsif n < 10_000
          section += (digit || 1) * n
          digit = nil
        else
          if digit.nil? && section.zero?
            # A large unit right after another one (1, 10_000, 100_000_000)
            # or standing alone scales what has been read so far.
            total = (total.zero? ? 1 : total) * n
          elsif total < n
            total = (total + section + digit.to_i) * n
          else
            total += (section + digit.to_i) * n
          end
          section = 0
          digit = nil
        end
      end

      total + section + digit.to_i
    end

    # Internal: true when every token is a single digit.
    def self.year?(integers)
      integers.all? { |i| i < 10 }
    end

    # Internal: reads digit tokens as one decimal string.
    def self.combine_year(integers)
      integers.map { |i| i.to_s }.join.to_i
    end

    # Internal: turns an integer into tokens of the `to` type.
    #
    # to  - The target type.
    # int - An Integer, or a String of digits. The sign is ignored.
    #
    # Values of 10**16 and above are written by nesting the 100_000_000
    # unit and are not guaranteed to convert back to the same Integer.
    #
    # Returns an Array of tokens, highest place first.
    def self.convert_from_integer to, int
      value = int.to_s.to_i.abs
      return [convert_integer(0, to)] if value.zero?

      convert_magnitude(value, to)
    end

    # Internal: tokens for a positive integer, split at 10**8 and 10**4.
    def self.convert_magnitude(value, to)
      tokens = []
      yi, rest = value.divmod(100_000_000)
      wan, ones = rest.divmod(10_000)

      if yi > 0
        tokens.concat(convert_magnitude(yi, to))
        tokens << convert_integer(100_000_000, to)
      end
      if wan > 0
        tokens.concat(convert_section(wan, to))
        tokens << convert_integer(10_000, to)
      end

      tokens.concat(convert_section(ones, to))
    end

    # Internal: tokens for 0..9999. Zero digits are skipped and a 1 in the
    # tens place is written as the bare unit.
    def self.convert_section(section, to)
      tokens = []

      [1_000, 100, 10].each do |unit|
        digit, section = section.divmod(unit)
        next if digit.zero?

        tokens << convert_integer(digit, to) unless unit == 10 && digit == 1
        tokens << convert_integer(unit, to)
      end

      tokens << convert_integer(section, to) unless section.zero?
      tokens
    end

    # Internal: looks an integer up in NUMBERS_TABLE.
    #
    # Returns the row's `to` value, or 0 when the row has no such key.
    # Raises KeyError when the table has no row for int.
    def self.convert_integer(int, to)
      row = NUMBERS_TABLE.find { |x| x[:i] == int }
      fail KeyError, "no NUMBERS_TABLE entry for #{int}" if row.nil?

      row.fetch(to) { 0 }
    end

    # Internal: finds the NUMBERS_TABLE row for a zhs, zht or pyn token.
    #
    # Returns the row Hash, or nil when the token is unknown.
    def self.convert_number(number)
      NUMBERS_TABLE.find { |x| x[:zhs] == number || x[:zht] == number || x[:pyn] == number }
    end

    # Internal: joins tokens and removes every "zero + unit" pair.
    #
    # NOTE: Figuring out usage of "liang" vs. "er" is pretty difficult, so
    # "er" is always used instead.
    def self.finalize_number(number, separator = '')
      number.join(separator).gsub(/零#{ZhongwenTools::Regex.zh_number_multiple}/u,'')
    end
  end
end
@@ -1,51 +1,68 @@
1
1
  # encoding: utf-8
2
+
2
3
  module ZhongwenTools
3
4
  module Regex
4
- extend self
5
-
6
- def pyn
7
- /(#{pyn_regexes.values.join('|')}|r)([1-5])?([\s\-]+)?/
5
+ def self.pyn
6
+ /(#{pyn_regexes.values.join('|')}|r)([1-5])([\s\-]+)?/
8
7
  end
9
8
 
10
- def py
9
+ def self.py
11
10
  # FIXME: need to detect Ālābó
12
11
  # ([ĀÁǍÀA][io]?|[io]?|[][āáǎàaēéěèeūúǔùu]?o?|[ĒÉĚÈE]i?|[]i?|[ŌÓǑÒO]u?|[]u?|u[āáǎàaēoēéěèe]?i?|[]e?)(n?g?r?)){1,}
13
- /(#{pyn_regexes.map{|k,v| v.to_s[7..-2].gsub_with_hash(/[aeiouv]/,py_tones)}.join('|')}([\s\-])?)/
12
+ /(#{pyn_regexes.map{|k,v| v.to_s[7..-2].gsub(/[aeiouv]/,py_tones)}.join('|')}([\s\-])?)/
14
13
  end
15
14
 
16
- def pinyin_num
15
+ def self.pinyin_num
17
16
  /(([BPMFDTNLGKHZCSRJQXWYbpmfdtnlgkhzcsrjqxwy]?[h]?)(A[io]?|a[io]?|i[aeu]?o?|Ei?|ei?|Ou?|ou?|u[aoe]?i?|ve?)?(n?g?)(r?)([1-5])(\-+)?)/
18
17
  end
19
18
 
20
- def fullwidth
19
+ def self.pinyin_toneless
20
+ /(#{pyn_regexes.values.join('|')}|r)([\s\-]+)?/
21
+ end
22
+
23
+ def self.fullwidth
21
24
  /[0-9A-Za-z%.:#$&+-/\=;<>]/
22
25
  end
23
26
 
24
- def capital_letters
25
- /(#{Regexp.union(ZhongwenTools::UNICODE_CAPS.keys)})/
27
+ def self.capital_letters
28
+ /(#{Regexp.union(ZhongwenTools::Caps::CAPS.keys)})/
26
29
  end
27
30
 
28
- def lowercase_letters
29
- /(#{Regexp.union(ZhongwenTools::UNICODE_CAPS.values)})/
31
+ def self.lowercase_letters
32
+ /(#{Regexp.union(ZhongwenTools::Caps::CAPS.values)})/
30
33
  end
31
34
 
32
- def zh
35
+ def self.zh
33
36
  /[\u2E80-\u2E99]|[\u2E9B-\u2EF3]|[\u2F00-\u2FD5]|[\u3005|\u3007]|[\u3021-\u3029]|[\u3038-\u303B]|[\u3400-\u4DB5]|[\u4E00-\u9FCC]|[\uF900-\uFA6D]|[\uFA70-\uFAD9]/
34
37
  end
35
38
 
36
- def punc
39
+ def self.punc
37
40
  /[\u0021-\u0023]|[\u0025-\u002A]|[\u002C-\u002F]|[\u003A\u003B\u003F\u0040]|[\u005B-\u005D\u005F\u007B\u007D\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387]/
38
41
  end
39
42
 
40
- def zh_punc
43
+ def self.zh_punc
41
44
  # TODO: includes non-zh punctuation codes. Should only include punctuation in CJK ranges.
42
45
  /[\u2E00-\u2E2E]|[\u2E30-\u2E3B]|[\u3001-\u3003]|[\u3008-\u3011]|[\u3014-\u301F]|[\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF]|[\uA60D-\uA60F]|[\uA673\uA67E]|[\uA6F2-\uA6F7]|[\uA874-\uA877]|[\uA8CE\uA8CF]|[\uA8F8-\uA8FA]|[\uA92E\uA92F\uA95F]|[\uA9C1-\uA9CD]|[\uA9DE\uA9DF]|[\uAA5C-\uAA5F]|[\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F]|[\uFE10-\uFE19]|[\uFE30-\uFE52]|[\uFE54-\uFE61]|[\uFE63\uFE68\uFE6A\uFE6B]|[\uFF01-\uFF03]|[\uFF05-\uFF0A]|[\uFF0C-\uFF0F]|[\uFF1A\uFF1B\uFF1F\uFF20]|[\uFF3B-\uFF3D]|[\uFF3F\uFF5B\uFF5D]|[\uFF5F-\uFF65]/
43
46
  end
44
47
 
45
- def zh_numbers
48
+ def self.zh_numbers
46
49
  # TODO: include numbers like yotta, etc.
47
50
  # 垓 秭 穰 溝 澗 正 載 --> beyond 100,000,000!
48
- /[〇零一壹幺二贰貳两兩三弎叁參四肆䦉五伍六陆陸七柒八捌九玖十拾廿百佰千仟万萬亿億]/
51
+ # Regional: Dong Guai
52
+ /[〇零一壹幺二贰貳两兩三弎叁參仨四肆䦉五伍六陆陸七柒八捌九玖十拾廿卅百佰千仟万萬亿億]/
53
+ end
54
+
55
+ def self.zhs_numbers
56
+ # TODO: check if 佰,仟 are the financial numbers in zhs
57
+ /[〇零一壹幺二贰两三弎叁仨四肆䦉五伍六陆七柒八捌九玖十拾廿卅百佰千仟万亿]/
58
+ end
59
+
60
+ def self.zht_numbers
61
+ /[〇零一壹幺二貳兩三弎參仨四肆䦉五伍六陸七柒八捌九玖十拾廿卅佰千仟萬億]/
62
+ end
63
+
64
+ def self.zh_number_multiple
65
+ /[拾十百佰千仟仟万萬亿億]/
49
66
  end
50
67
 
51
68
  # Public: A Regex for bopomofo, a.k.a. Zhuyin Fuhao 注音符号.
@@ -56,12 +73,13 @@ module ZhongwenTools
56
73
  # bopomofo #=> <Regex>
57
74
  #
58
75
  # Returns a Regex.
59
- def bopomofo
76
+ def self.bopomofo
60
77
  /[ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩ]/
61
78
  end
62
79
 
63
80
  private
64
- def pyn_regexes
81
+
82
+ def self.pyn_regexes
65
83
  # http://stackoverflow.com/questions/20736291/regex-for-matching-pinyin
66
84
  # https://www.debuggex.com/r/_9kbxA6f00gIGiVo
67
85
  # NOTE: you might need to change the order of these regexes for more accurate matching of some pinyin.
@@ -81,7 +99,7 @@ module ZhongwenTools
81
99
  }
82
100
  end
83
101
 
84
- def py_tones
102
+ def self.py_tones
85
103
  py_tones = {
86
104
  'a' => '[āáǎàa]',
87
105
  'e' => '[ēéěèe]',
@@ -93,5 +111,3 @@ module ZhongwenTools
93
111
  end
94
112
  end
95
113
  end
96
-
97
- require File.expand_path("../regex/ruby18", __FILE__) if RUBY_VERSION < '1.9'
@@ -0,0 +1,231 @@
1
+ # encoding: utf-8
2
+ require 'zhongwen_tools/regex'
3
+ require 'zhongwen_tools/caps'
4
+ require 'zhongwen_tools/romanization'
5
+
6
module ZhongwenTools
  module Romanization

    # Internal: converts a romanized string to pinyin with tone marks.
    #
    # str  - The String to convert.
    # from - The romanization of str; anything but :pyn is first converted
    #        to numbered pinyin with convert_romanization.
    #
    # Returns a String.
    def self.convert_to_py(str, from)
      str = convert_romanization(str, from, :pyn) if from != :pyn
      ZhongwenTools::Romanization::Pinyin.convert_pyn_to_pinyin(str)
    end

    # Internal: converts a romanized string to numbered pinyin.
    #
    # str  - The String to convert.
    # from - The romanization of str (:py for pinyin with tone marks).
    #
    # Returns a String. Syllables are joined with hyphens when hyphenated?
    # reports that the input was hyphenated.
    def self.convert_to_pyn(str, from)
      orig_str = str.dup

      if from == :py
        str = ZhongwenTools::Romanization::Pinyin.convert_pinyin_to_pyn(str)
      else
        str = convert_romanization(str, from, :pyn)
      end

      str = ZhongwenTools::Romanization::Pinyin.add_hyphens_to_pyn(str) if hyphenated?(orig_str)

      str
    end

    module Pinyin
      # Public: Pinyin.to_pinyin, Pinyin.to_py and Pinyin.to_pyn.
      #
      # str  - The String to convert.
      # from - The romanization of str. Detected with
      #        ZhongwenTools::Romanization.romanization? when omitted.
      #
      # Returns whatever ZhongwenTools::Romanization.convert returns.
      %w(pinyin py pyn).each do |romanization|
        define_singleton_method("to_#{romanization}") do |*args|
          str, from = args
          from ||= ZhongwenTools::Romanization.romanization? str

          ZhongwenTools::Romanization.convert str, py_type(romanization), (py_type(from) || from)
        end
      end

      # Public: splits numbered (or toneless) pinyin into syllables.
      #
      # str - A String of pinyin. When it contains no tone number at all,
      #       the toneless syllable regex is used instead.
      #
      # Returns an Array of syllable Strings with whitespace and hyphens
      # removed.
      def self.split_pyn(str)
        # FIXME: ignore punctuation
        syllable = str[/[1-5]/].nil? ? ZhongwenTools::Regex.pinyin_toneless : ZhongwenTools::Regex.pyn

        str.scan(/(#{syllable})/).map { |captures| captures[0].strip.gsub('-', '') }
      end

      # Public: splits pinyin with tone marks into syllables.
      #
      # str - A String of space separated pinyin words.
      #
      # Returns a flat Array of syllables. The first syllable of a word that
      # was capitalized is capitalized again.
      def self.split_py(str)
        per_word = str.split(' ').map do |word|
          word, is_capitalized = normalize_pinyin(word)
          syllables = word.split(/['\-]/).map { |part| find_py(part) }.flatten

          recapitalize(syllables, is_capitalized)
        end

        per_word.flatten
      end

      # Public: checks if a string is pinyin.
      # http://en.wikipedia.org/wiki/Pinyin
      #
      # Examples
      #   py?('nǐ hǎo')
      #   # => true
      #
      # Returns Boolean.
      def self.py?(str)
        # NOTE: py regex does not include capitals with tones, hence the
        #       downcase before matching.
        removable = /(#{ ZhongwenTools::Regex.punc }|#{ ZhongwenTools::Regex.py }|[\s\-])/
        ZhongwenTools::Caps.downcase(str).gsub(removable, '').strip == ''
      end

      # Public: checks if a string is numbered pinyin.
      #
      # Examples
      #   pyn?('pin1-yin1')
      #   # => true
      #
      # Returns Boolean.
      def self.pyn?(str)
        # FIXME: use strip_punctuation method
        stripped = str.gsub(ZhongwenTools::Regex.punc, '').gsub(/[\s\-]/, '')
        normalized_str = ZhongwenTools::Caps.downcase(stripped)
        syllables = split_pyn(normalized_str)

        pyn_matches_properly?(syllables, normalized_str) &&
          are_all_pyn_syllables_complete?(syllables)
      end

      # Public: joins the syllables of every space separated word with
      # hyphens.
      #
      # Returns a String.
      def self.add_hyphens_to_pyn(str)
        str.split(' ').map { |word| split_pyn(word).join('-') }.join(' ')
      end

      # NOTE: `private` does not affect methods defined with `def self.`, so
      # the helpers below stay callable from outside the module. They are
      # internal all the same.
      private

      # Internal: true when the syllables account for the whole string.
      def self.pyn_matches_properly?(pyn_arr, normalized_str)
        pyn_arr.join('') == normalized_str
      end

      # Internal: true when every syllable, minus its tone number, is a :pyn
      # value in ROMANIZATIONS_TABLE.
      def self.are_all_pyn_syllables_complete?(pyn_arr)
        known = ROMANIZATIONS_TABLE.map { |r| r[:pyn] }

        pyn_arr.all? { |syllable| known.include?(syllable.gsub(/[1-5]/, '')) }
      end

      # Internal: normalizes a pinyin type name.
      #
      # Returns :pyn or :py, or nil for any other romanization.
      def self.py_type(romanization)
        { pyn: :pyn, py: :py, pinyin: :py }[romanization.to_s.downcase.to_sym]
      end

      # Internal: Returns [downcased pinyin, whether it was capitalized].
      def self.normalize_pinyin(pinyin)
        [ZhongwenTools::Caps.downcase(pinyin), capitalized?(pinyin)]
      end

      # Internal: scans str for pinyin syllables.
      #
      # Returns an Array with the first non-nil capture of every match.
      def self.find_py(str)
        str.scan(ZhongwenTools::Regex.py).map { |captures| captures.compact.first }
      end

      # Internal: restores capitalization removed by normalize_pinyin.
      #
      # obj         - A String, or an Array of syllables.
      # capitalized - Whether the original word was capitalized.
      #
      # Returns obj with its first letter / first syllable capitalized, obj
      # itself when nothing has to change, or nil for other kinds of obj.
      def self.recapitalize(obj, capitalized)
        return obj unless capitalized
        # Nothing to capitalize, and obj[0] would be nil for an empty Array.
        return obj if obj.respond_to?(:empty?) && obj.empty?

        if obj.class == String
          ZhongwenTools::Caps.capitalize(obj)
        elsif obj.class == Array
          [ZhongwenTools::Caps.capitalize(obj[0]), obj[1..-1]].flatten
        end
      end

      # Internal: converts real pinyin to pinyin number string.
      #
      # pinyin - A String for the pinyin.
      #
      # Examples
      #
      #   convert_pinyin_to_pyn('Nǐ hǎo ma') #=> 'Ni3 hao3 ma5?'
      #
      # Returns a String in pinyin number format.
      def self.convert_pinyin_to_pyn(pinyin)
        converted = pinyin.split(' ').map do |word|
          # NOTE: if a word is upcase, then it will be converted the same
          #       as a word that is only capitalized.
          word, is_capitalized = normalize_pinyin(word)

          recapitalize(current_pyn(word, split_py(word)), is_capitalized)
        end

        converted.join(' ')
      end

      # Internal: true when the first character changes when downcased.
      # Empty strings are not capitalized.
      def self.capitalized?(str)
        return false if str.empty?

        str[0] != ZhongwenTools::Caps.downcase(str[0])
      end

      # Internal: replaces each syllable of a word with its numbered form.
      #
      # pyn        - The (downcased) pinyin word.
      # pinyin_arr - The syllables of that word, in order.
      #
      # Returns the word in numbered pinyin without apostrophes.
      def self.current_pyn(pyn, pinyin_arr)
        replacements = []

        pinyin_arr.each do |pinyin|
          replace = pinyin_replacement(pinyin)

          if replacements.empty?
            pyn = pyn.sub(/#{pinyin}/) { replace }
          else
            # Only replace after the syllables converted so far, so that an
            # identical earlier syllable is not touched twice.
            pyn = pyn.sub(/(#{replacements.join('.*')}.*)#{pinyin}/) { $1 + replace }
          end

          replacements << replace
        end

        pyn.gsub("'", '')
      end

      # Internal: converts one pinyin syllable to numbered pinyin.
      #
      # Returns the numbered syllable with the tone number moved to the end,
      # or py unchanged when no PYN_PY value occurs in it.
      def self.pinyin_replacement(py)
        matches = PYN_PY.values.select { |value| py.include?(value) }
        return py if matches.empty?

        match = select_pinyin_match(matches)
        replace = PYN_PY.key(match)

        py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/) { $1 + $3 + $2 }
      end

      # Internal: picks which of several PYN_PY values to replace.
      def self.select_pinyin_match(matches)
        # take the longest pinyin match. Use bytes because 'è' is preferred over 'n' or 'r' or 'm'
        match = matches.sort { |x, y| x.bytes.to_a.length <=> y.bytes.to_a.length }[-1]

        # Edge case.. en/eng pyn -> py conversion is one way only.
        match[/^(ē|é|ě|è|e)n?g?/].nil? ? match : match.chars[0]
      end

      # Internal: Replaces numbered pinyin with actual pinyin. Pinyin separated with hyphens are combined as one word.
      #
      # str - A String to replace with actual pinyin
      #
      # Examples
      #
      #   convert_pyn_to_pinyin 'Ni3 hao3 ma5?' # => "Nǐ hǎo ma?"
      #
      # Returns a string with actual pinyin
      def self.convert_pyn_to_pinyin(str)
        # Using gsub is ~8x faster than using scan and each.
        converted = str.gsub(ZhongwenTools::Regex.pinyin_num) do
          syllable, initial, vowels, nasal, erhua, tone, hyphens = $1, $2, $3, $4, $5, $6, $7

          body =
            if vowels.nil?
              # Pinyin without vowels, e.g. m, ng: look the syllable up whole.
              "#{PYN_PY[syllable]}"
            elsif initial == '' && ['a', 'e', 'o'].include?(vowels[0, 1])
              # Needs an apostrophe
              # (http://www.pinyin.info/romanization/hanyu/apostrophes.html).
              "'#{PYN_PY["#{vowels}#{tone}"]}#{nasal}#{erhua}"
            else
              "#{initial}#{PYN_PY["#{vowels}#{tone}"]}#{nasal}#{erhua}"
            end

          # A single hyphen joins the syllables; two or more leave one hyphen.
          body + (hyphens.to_s.length > 1 ? '-' : '')
        end

        # Correct the apostrophes added after a hyphen or at the very start.
        converted.gsub("-'", "-").sub(/^'/, '')
      end
    end
  end
end
@@ -2,7 +2,8 @@
2
2
 
3
3
  # NOTE: This table works for pyn -> pinyin conversion, but it introduces
4
4
  # mistakes when converting pinyin to pyn. In practice, pinyin can't
5
- # be converted to pyn properly unless it's properly formatted.
5
+ # be converted to pyn with complete accuracy unless it is properly
6
+ # formatted.
6
7
  module ZhongwenTools
7
8
  module Romanization
8
9
  PYN_PY = {