unihan2 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/data/unicode-chars-ver.csv +1145 -0
- data/lib/unihan2.rb +50 -233
- metadata +2 -1
data/lib/unihan2.rb
CHANGED
@@ -1,24 +1,25 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
1
3
|
class Unihan2
|
2
4
|
DATA_DIR = File.join(File.dirname(__FILE__), '../data')
|
3
5
|
|
4
6
|
def initialize
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
end
|
16
|
-
end
|
7
|
+
read_strokes
|
8
|
+
read_version
|
9
|
+
end
|
10
|
+
|
11
|
+
|
12
|
+
# return total strokes of the character char
|
13
|
+
# @param char [String] the character
|
14
|
+
# @return [Integer] the total strokes
|
15
|
+
def strokes(char)
|
16
|
+
@strokes[char]
|
17
17
|
end
|
18
18
|
|
19
|
-
#
|
20
|
-
#
|
21
|
-
|
19
|
+
# return the Unicode version of a specific character
|
20
|
+
# @param code [String] character or codepoint
|
21
|
+
# @return [Float] unicode version
|
22
|
+
def ver(code)
|
22
23
|
return nil if code.nil?
|
23
24
|
|
24
25
|
if code.is_a? Integer
|
@@ -29,237 +30,53 @@ class Unihan2
|
|
29
30
|
i = code.hex
|
30
31
|
end
|
31
32
|
|
32
|
-
|
33
|
-
when 0..0xFFFF then uv0(i)
|
34
|
-
when 0x20000..0x2FFFF then uv2(i)
|
35
|
-
when 0x30000..0x3134A then 13.0
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
# return total strokes of the character char
|
40
|
-
# @param char [String] the character
|
41
|
-
# @return [Integer] the total strokes
|
42
|
-
def strokes(char)
|
43
|
-
@strokes[char]
|
33
|
+
ver_bsearch(i, 0, @vers.size-1)
|
44
34
|
end
|
45
35
|
|
46
36
|
private
|
47
37
|
|
48
|
-
def
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
when 0x2000..0x2FFF then uv02(i)
|
60
|
-
when 0x3000..0x33FF then uv03(i)
|
61
|
-
when 0x3400..0x9FA5
|
62
|
-
case i
|
63
|
-
when 0x3400..0x4DB5 then 3.0 # CJK Extension A 中日韓統一表意文字擴充 A 區
|
64
|
-
when 0x4DB6..0x4DBF then 13.0 # Extension A
|
65
|
-
when 0x4E00..0x9FA5 then 1.1 # CJK Unified Ideographs
|
66
|
-
end
|
67
|
-
when 0x9FA6..0x9FFF
|
68
|
-
case i
|
69
|
-
when 0x9FA6..0x9FBB then 4.1 # CJK Unified Ideographs
|
70
|
-
when 0x9FBC..0x9FC3 then 5.1 # CJK Unified Ideographs
|
71
|
-
when 0x9FC4..0x9FCB then 5.2 # CJK Unified Ideographs
|
72
|
-
when 0x9FCC then 6.1 # CJK Unified Ideographs
|
73
|
-
when 0x9FCD..0x9FD5 then 8.0 # CJK Unified Ideographs
|
74
|
-
when 0x9FD6..0x9FEA then 10.0 # CJK Unified Ideographs
|
75
|
-
when 0x9FEB..0x9FEF then 11.0 # CJK Unified Ideographs
|
76
|
-
when 0x9FF0..0x9FFC then 13.0 # CJK Unified Ideographs
|
77
|
-
end
|
78
|
-
when 0xA000..0xDFFF
|
79
|
-
case i
|
80
|
-
when 0xA000..0xA48C then 3.0 # Yi Syllables 彝族文字區
|
81
|
-
when 0xAC00..0xD7A3 then 2.0 # Hangul Syllables 韓文拼音
|
82
|
-
end
|
83
|
-
when 0xF000..0xFFFF
|
84
|
-
case i
|
85
|
-
when 0xF900..0xFA2D then 1.1 # CJK Compatibility Ideographs
|
86
|
-
when 0xFA2E..0xFA2F then 6.1 # CJK Compatibility Ideographs
|
87
|
-
when 0xFA30..0xFA6A then 3.2 # CJK Compatibility Ideographs
|
88
|
-
when 0xFA6B..0xFA6D then 5.2 # CJK Compatibility Ideographs
|
89
|
-
when 0xFA70..0xFAD9 then 4.1 # CJK Compatibility Ideographs
|
90
|
-
when 0xFE10..0xFE19 then 4.1 # Vertical Forms 中文直排標點
|
91
|
-
when 0xFE20..0xFE23 then 1.1 # Combining Half Marks
|
92
|
-
when 0xFE24..0xFE26 then 5.1 # Combining Half Marks
|
93
|
-
when 0xFE30..0xFE44 then 1.0 # CJK Compatibility Forms 兼容性表格
|
94
|
-
when 0xFE45..0xFE46 then 3.2 # CJK Compatibility Forms 兼容性表格
|
95
|
-
when 0xFE47..0xFE48 then 4.0 # CJK Compatibility Forms 兼容性表格
|
96
|
-
when 0xFE49..0xFE4F then 1.0 # CJK Compatibility Forms 兼容性表格
|
97
|
-
when 0xFE50..0xFE52 then 1.0 # Small Form Variants
|
98
|
-
when 0xFE54..0xFE66 then 1.0 # Small Form Variants
|
99
|
-
when 0xFE68..0xFE6B then 1.0 # Small Form Variants
|
100
|
-
when 0xFF01..0xFF5E then 1.0 # Halfwidth and Fullwidth Forms
|
101
|
-
when 0xFF5F..0xFF60 then 3.2 # Halfwidth and Fullwidth Forms
|
102
|
-
when 0xFF61..0xFF9F then 1.0 # Halfwidth and Fullwidth Forms
|
103
|
-
end
|
104
|
-
end
|
105
|
-
end
|
106
|
-
|
107
|
-
def self.uv00(i)
|
108
|
-
case i
|
109
|
-
when 0..0x01FF
|
110
|
-
case i
|
111
|
-
when 0..0x017E then 1.0
|
112
|
-
when 0x017F then 1.1
|
113
|
-
when 0x0180..0x01F0 then 1.0
|
114
|
-
when 0x01F1..0x01F5 then 1.1
|
115
|
-
when 0x01F6..0x01F9 then 3.0
|
116
|
-
when 0x01FA..0x0217 then 1.1
|
117
|
-
end
|
118
|
-
when 0x0200..0x02FF
|
119
|
-
case i
|
120
|
-
when 0x0218..0x021F then 3.0
|
121
|
-
when 0x0220 then 3.2
|
122
|
-
when 0x0221 then 4.0
|
123
|
-
when 0x0222..0x0233 then 3.0
|
124
|
-
when 0x0234..0x0236 then 4.0
|
125
|
-
when 0x0237..0x0241 then 4.1
|
126
|
-
when 0x0242..0x024F then 5.0
|
127
|
-
when 0x0250..0x02A8 then 1.0
|
128
|
-
when 0x02A9..0x02AD then 3.0
|
129
|
-
when 0x02AE..0x02AF then 4.0
|
130
|
-
when 0x02B0..0x02DE then 1.0
|
131
|
-
when 0x02DF then 3.0
|
132
|
-
when 0x02E0..0x02E9 then 1.0
|
133
|
-
when 0x02EA..0x02EE then 3.0
|
134
|
-
when 0x02EF..0x02FF then 4.0
|
135
|
-
end
|
136
|
-
when 0x0300..0x0341 then 1.0
|
137
|
-
when 0x0400..0x04FF
|
138
|
-
case i
|
139
|
-
when 0x0401..0x040C then 1.0
|
140
|
-
when 0x040D then 3.0
|
141
|
-
when 0x040E..0x044F then 1.0
|
38
|
+
def read_strokes
|
39
|
+
fn = File.join(DATA_DIR, 'Unihan_DictionaryLikeData.txt')
|
40
|
+
@strokes = {}
|
41
|
+
File.foreach(fn) do |line|
|
42
|
+
next if line.start_with? '#'
|
43
|
+
line.chomp!
|
44
|
+
cells = line.split("\t")
|
45
|
+
if cells[1] == 'kTotalStrokes'
|
46
|
+
c = [cells[0].sub(/^U\+(.*)$/, '\1').hex].pack('U')
|
47
|
+
i = cells[2].to_i
|
48
|
+
@strokes[c] = i
|
142
49
|
end
|
143
50
|
end
|
144
51
|
end
|
145
52
|
|
146
|
-
def
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
end
|
155
|
-
when 0x2100..0x21FF
|
156
|
-
case i
|
157
|
-
when 0x2100..0x2138 then 1.0
|
158
|
-
when 0x2153..0x2182 then 1.0
|
159
|
-
when 0x2190..0x21EA then 1.0
|
160
|
-
end
|
161
|
-
when 0x2200..0x22F1 then 1.0
|
162
|
-
when 0x2400..0x24FF
|
163
|
-
case i
|
164
|
-
when 0x2460..0x24EA then 1.0
|
165
|
-
when 0x24EB..0x24FE then 3.2
|
166
|
-
when 0x24FF then 4.0
|
167
|
-
end
|
168
|
-
when 0x2500..0x25FF
|
169
|
-
case i
|
170
|
-
when 0x2500..0x2595 then 1.0
|
171
|
-
when 0x2596..0x259F then 3.2
|
172
|
-
when 0x25A0..0x25EE then 1.0
|
173
|
-
when 0x25EF then 1.1
|
174
|
-
end
|
175
|
-
when 0x2600..0x26FF
|
176
|
-
case i
|
177
|
-
when 0x2600..0x2613 then 1.0
|
178
|
-
when 0x261A..0x266F then 1.0
|
179
|
-
end
|
180
|
-
when 0x2E00..0x2FFF
|
181
|
-
case i
|
182
|
-
when 0x2E80..0x2EF3 then 3.0
|
183
|
-
when 0x2F00..0x2FD5 then 3.0
|
184
|
-
when 0x2FF0..0x2FFB then 3.0
|
185
|
-
end
|
53
|
+
def read_version
|
54
|
+
@vers = []
|
55
|
+
fn = File.join(DATA_DIR, 'unicode-chars-ver.csv')
|
56
|
+
CSV.foreach(fn, headers: true) do |row|
|
57
|
+
@vers << {
|
58
|
+
range: (row['cp1'].hex..row['cp2'].hex),
|
59
|
+
age: row['age'].to_f
|
60
|
+
}
|
186
61
|
end
|
187
62
|
end
|
188
63
|
|
189
|
-
def
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
when 0x30A0 then 3.2 # Katakana 日文片假名
|
204
|
-
when 0x30A1..0x30F6 then 1.0 # Katakana 日文片假名
|
205
|
-
when 0x30F7..0x30FA then 1.1 # Katakana 日文片假名
|
206
|
-
when 0x30FB..0x30FE then 1.0 # Katakana 日文片假名
|
207
|
-
when 0x30FF then 3.2 # Katakana 日文片假名 (Unicode 3.2)
|
208
|
-
end
|
209
|
-
when 0x3100..0x31FF
|
210
|
-
case i
|
211
|
-
when 0x3105..0x312C then 1.0 # Bopomofo 注音符號
|
212
|
-
when 0x312D then 5.1 # Bopomofo 上下顛倒的 'ㄓ'
|
213
|
-
when 0x3131..0x318E then 1.0 # Hangul Compatibility Jamo 韓文
|
214
|
-
when 0x3190..0x319F then 1.0 # Kanbun 在上方的小漢字
|
215
|
-
when 0x31A0..0x31B7 then 3.0 # Bopomofo Extended 注音擴展
|
216
|
-
when 0x31B8..0x31BA then 6.0 # Bopomofo Extended 注音擴展
|
217
|
-
when 0x31C0..0x31CF then 4.1 # CJK Strokes 筆劃 (基本筆劃, 如撇, 勾, 點...)
|
218
|
-
when 0x31D0..0x31E3 then 5.1 # CJK Strokes 筆劃 (基本筆劃, 如撇, 勾, 點...)
|
219
|
-
when 0x31F0..0x31FF then 3.2 # Katakana Phonetic Extensions 日文片假名語音擴展
|
220
|
-
end
|
221
|
-
when 0x3200..0x32FF
|
222
|
-
case i
|
223
|
-
when 0x3200..0x321C then 1.0 # Enclosed CJK Letters and Months 括號韓文
|
224
|
-
when 0x321D..0x321E then 4.0 # Enclosed CJK Letters and Months 括號韓文
|
225
|
-
when 0x3220..0x3243 then 1.0 # Enclosed CJK Letters and Months 括號一~十及漢字
|
226
|
-
when 0x3244..0x324F then 5.2 # Enclosed CJK Letters and Months 圓圈中有字及10~80
|
227
|
-
when 0x3250 then 4.0 # Enclosed CJK Letters and Months 'PTE' 組成一字
|
228
|
-
when 0x3251..0x325F then 3.2 # Enclosed CJK Letters and Months 圓圈 21~35
|
229
|
-
when 0x3260..0x327B then 1.0 # Enclosed CJK Letters and Months 圓圈韓文
|
230
|
-
when 0x327C..0x327D then 4.0 # Enclosed CJK Letters and Months 圓圈韓文
|
231
|
-
when 0x327E then 4.1 # Enclosed CJK Letters and Months 圓圈韓文
|
232
|
-
when 0x327F..0x32B0 then 1.0 # Enclosed CJK Letters and Months 圓圈一~十及漢字
|
233
|
-
when 0x32B1..0x32BF then 3.2 # Enclosed CJK Letters and Months 圓圈 36~50
|
234
|
-
when 0x32C0..0x32CB then 1.1 # Enclosed CJK Letters and Months 1月~12月
|
235
|
-
when 0x32CC..0x32CF then 4.0 # Enclosed CJK Letters and Months 多英文組成一個字
|
236
|
-
when 0x32D0..0x32FE then 1.0 # Enclosed CJK Letters and Months 圓圈日文
|
237
|
-
end
|
238
|
-
when 0x3300..0x33FF
|
239
|
-
case i
|
240
|
-
when 0x3300..0x3357 then 1.0 # CJK Compatibility 多個日文組成一字
|
241
|
-
when 0x3358..0x3376 then 1.1 # CJK Compatibility 0点~24点 及多英文組成一字
|
242
|
-
when 0x3377..0x337A then 4.0 # CJK Compatibility 多英文組成一字
|
243
|
-
when 0x337B..0x33DD then 1.0 # CJK Compatibility 多日本漢字及多英文組成一字
|
244
|
-
when 0x33DE..0x33DF then 4.0 # CJK Compatibility 多英文組成一字
|
245
|
-
when 0x33E0..0x33FE then 1.1 # CJK Compatibility 1日~31日
|
246
|
-
when 0x33FF then 4.0 # CJK Compatibility 'gal' 組成一字
|
64
|
+
def ver_bsearch(code, start, stop)
|
65
|
+
return nil if start > stop
|
66
|
+
middle = (stop - start) / 2 + start
|
67
|
+
h = @vers[middle]
|
68
|
+
if h[:range].include?(code)
|
69
|
+
return h[:age]
|
70
|
+
elsif middle == start
|
71
|
+
return nil if code < h[:range].begin
|
72
|
+
return ver_bsearch(code, middle+1, stop)
|
73
|
+
else
|
74
|
+
if code < h[:range].begin
|
75
|
+
return ver_bsearch(code, start, middle)
|
76
|
+
else
|
77
|
+
return ver_bsearch(code, middle, stop)
|
247
78
|
end
|
248
79
|
end
|
249
80
|
end
|
250
81
|
|
251
|
-
def self.uv2(i)
|
252
|
-
case i
|
253
|
-
when 0x20000..0x2A6D6 then 3.1 # Extension B
|
254
|
-
when 0x2A6D7..0x2A6DD then 13.0 # Extension B
|
255
|
-
when 0x2A700..0x2B734 then 5.2 # extension C
|
256
|
-
when 0x2B740..0x2B81D then 6.0 # extension D
|
257
|
-
when 0x2B820..0x2CEA1 then 8.0 # extension E
|
258
|
-
when 0x2CEB0..0x2EBE0 then 10.0 # extension F
|
259
|
-
when 0x2F800..0x2FA1D then 3.1 # Unicode 3.1: CJK Compatibility Supplement
|
260
|
-
end
|
261
|
-
end
|
262
|
-
|
263
|
-
private_class_method :uv0, :uv00, :uv02, :uv03, :uv2
|
264
|
-
|
265
82
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unihan2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Chou
|
@@ -17,6 +17,7 @@ extensions: []
|
|
17
17
|
extra_rdoc_files: []
|
18
18
|
files:
|
19
19
|
- data/Unihan_DictionaryLikeData.txt
|
20
|
+
- data/unicode-chars-ver.csv
|
20
21
|
- lib/unihan2.rb
|
21
22
|
homepage: https://github.com/RayCHOU/unihan2
|
22
23
|
licenses:
|