hanzi_to_pinyin 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.txt +20 -0
- data/README.rdoc +42 -0
- data/Rakefile +48 -0
- data/VERSION +1 -0
- data/lib/data/hz2py.json +25479 -0
- data/lib/data/unicode_to_pinyin.yml +25478 -0
- data/lib/hanzi_to_pinyin.rb +102 -0
- metadata +12 -5
@@ -0,0 +1,102 @@
|
|
1
|
+
# encoding:utf-8
|
2
|
+
require "json"
require "yaml"
|
3
|
+
|
4
|
+
# Converts Chinese characters (hanzi) into pinyin using two lookup tables
# that ship with the gem under lib/data.
#
# Both tables are read once, when this class is first loaded.
class HanziToPinyin

  # Version string, read verbatim from the VERSION file two levels above
  # this file's path.
  VERSION = IO.read File.expand_path("../../VERSION",__FILE__)

  # First code point treated as hanzi (decimal; 0x4E00 in hexadecimal).
  @@hanzi_unicode_start = 19968
  # Last code point treated as hanzi (decimal; 0x9FA5 in hexadecimal).
  @@hanzi_unicode_end = 40869

  # Code points of the ASCII digits "0" and "9" (decimal).
  @@number_unicode_start = 48
  @@number_unicode_end = 57

  # Lookup table used by hanzi_2_pinyin; keys are upper-case hexadecimal
  # code points (for example "4E00").
  # NOTE(review): YAML.load is acceptable here only because the file is
  # bundled with the gem; never point it at untrusted input.
  @@unicode = YAML.load(IO.read File.expand_path("../data/unicode_to_pinyin.yml",__FILE__))
  # Lookup table used by hanzi_2_py; keys are the hanzi characters
  # themselves and each value is a list of readings.
  @@py = ::JSON.parse(IO.read File.expand_path("../data/hz2py.json",__FILE__))

  # Transliterates a string character by character.
  #
  # * hanzi are replaced by their entry in the unicode_to_pinyin table
  #   (per the original author's note, that entry holds only the first
  #   letter of the reading -- confirm against the data file);
  # * "-" becomes "_";
  # * every other character is appended lower-cased.
  #
  # A hanzi that has no table entry is passed through unchanged instead of
  # raising.
  #
  # @param hanzi [String] text to convert; it is not modified
  # @return [String] the converted text
  def self.hanzi_2_pinyin(hanzi)
    # Work on a copy: force_encoding mutates its receiver, which would
    # change the caller's string and raise on frozen strings.
    text = hanzi.dup.force_encoding("utf-8")
    result = ''
    text.each_char do |char|
      code = char.ord
      if is_hanzi?(code)
        mapped = @@unicode[code.to_s(16).upcase]
        result << (mapped.nil? ? char : mapped)
      elsif code == 45 # "-"
        result << "_"
      else
        # Iterating characters (rather than calling Integer#chr on code
        # points) keeps non-ASCII, non-hanzi characters such as full-width
        # punctuation from raising RangeError.
        result << char.downcase
      end
    end
    result
  end

  ##
  # Converts only hanzi and ASCII digits; every other character is dropped.
  # Multiple readings of one character are joined with "," and consecutive
  # characters are separated by ";".
  #
  #   查理Smith => "cha,zha;li"
  #   郭轶      => "guo;yi,die"
  #   我们      => "wo;men"
  #   宗志强    => "zong;zhi;qiang,jiang"
  #
  # A hanzi that has no entry in the hz2py table is skipped. No trailing
  # ";" is emitted, even when the input is a single character.
  #
  # @param hanzi [String] text to convert; it is not modified
  # @return [String] the readings joined as described above
  def self.hanzi_2_py(hanzi)
    # Copy first; see hanzi_2_pinyin for why the argument is not mutated.
    text = hanzi.dup.force_encoding("utf-8")
    tokens = []
    text.each_char do |char|
      code = char.ord
      if is_number?(code)
        tokens << char
      elsif is_hanzi?(code)
        readings = @@py[char]
        next if readings.nil?
        tokens << readings.join(',')
      end
    end
    # Joining once at the end replaces the per-character separator
    # bookkeeping and cannot leave a dangling ";".
    tokens.join(';')
  end

  class << self
    # Longer spellings of the two public entry points.
    alias_method :hanzi_to_pinyin , :hanzi_2_pinyin
    alias_method :hanzi_to_py , :hanzi_2_py
  end

  # @param hanzi_codepoint [Integer] a Unicode code point
  # @return [Boolean] true when the code point lies within the inclusive
  #   hanzi range defined above
  def self.is_hanzi?(hanzi_codepoint)
    hanzi_codepoint >= @@hanzi_unicode_start && hanzi_codepoint <= @@hanzi_unicode_end
  end

  # @param number_codepoint [Integer] a Unicode code point
  # @return [Boolean] true when the code point is an ASCII digit
  def self.is_number?(number_codepoint)
    number_codepoint >= @@number_unicode_start && number_codepoint <= @@number_unicode_end
  end

end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hanzi_to_pinyin
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
4
|
+
version: 0.6.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -14,7 +14,7 @@ default_executable:
|
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: rspec
|
17
|
-
requirement: &
|
17
|
+
requirement: &86066550 !ruby/object:Gem::Requirement
|
18
18
|
none: false
|
19
19
|
requirements:
|
20
20
|
- - ! '>='
|
@@ -22,7 +22,7 @@ dependencies:
|
|
22
22
|
version: '0'
|
23
23
|
type: :development
|
24
24
|
prerelease: false
|
25
|
-
version_requirements: *
|
25
|
+
version_requirements: *86066550
|
26
26
|
description: chinese hanzi to pinyin , fetch first letter OR full pinyin, written
|
27
27
|
in Ruby.
|
28
28
|
email:
|
@@ -30,7 +30,14 @@ email:
|
|
30
30
|
executables: []
|
31
31
|
extensions: []
|
32
32
|
extra_rdoc_files: []
|
33
|
-
files:
|
33
|
+
files:
|
34
|
+
- README.rdoc
|
35
|
+
- VERSION
|
36
|
+
- LICENSE.txt
|
37
|
+
- Rakefile
|
38
|
+
- lib/data/unicode_to_pinyin.yml
|
39
|
+
- lib/data/hz2py.json
|
40
|
+
- lib/hanzi_to_pinyin.rb
|
34
41
|
has_rdoc: true
|
35
42
|
homepage: http://github.com/wxianfeng/hanzi_to_pinyin
|
36
43
|
licenses: []
|
@@ -48,7 +55,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
48
55
|
version: '0'
|
49
56
|
segments:
|
50
57
|
- 0
|
51
|
-
hash:
|
58
|
+
hash: 26390223
|
52
59
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
53
60
|
none: false
|
54
61
|
requirements:
|