unihan2 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/data/unicode-chars-ver.csv +1145 -0
  3. data/lib/unihan2.rb +50 -233
  4. metadata +2 -1
@@ -1,24 +1,25 @@
1
+ require 'csv'
2
+
1
3
  class Unihan2
2
4
  DATA_DIR = File.join(File.dirname(__FILE__), '../data')
3
5
 
4
6
  def initialize
5
- fn = File.join(DATA_DIR, 'Unihan_DictionaryLikeData.txt')
6
- @strokes = {}
7
- File.foreach(fn) do |line|
8
- next if line.start_with? '#'
9
- line.chomp!
10
- cells = line.split("\t")
11
- if cells[1] == 'kTotalStrokes'
12
- c = [cells[0].sub(/^U\+(.*)$/, '\1').hex].pack('U')
13
- i = cells[2].to_i
14
- @strokes[c] = i
15
- end
16
- end
7
+ read_strokes
8
+ read_version
9
+ end
10
+
11
+
12
+ # return total strokes of the character char
13
+ # @param char [String] the character
14
+ # @return [Integer] the total strokes
15
+ def strokes(char)
16
+ @strokes[char]
17
17
  end
18
18
 
19
- # Listing of Characters Covered by the Unihan Database
20
- # https://www.unicode.org/reports/tr38/tr38-29.html#BlockListing
21
- def self.ver(code)
19
+ # return the Unicode version of the specified character
20
+ # @param code [String] character or codepoint
21
+ # @return [Float] unicode version
22
+ def ver(code)
22
23
  return nil if code.nil?
23
24
 
24
25
  if code.is_a? Integer
@@ -29,237 +30,53 @@ class Unihan2
29
30
  i = code.hex
30
31
  end
31
32
 
32
- case i
33
- when 0..0xFFFF then uv0(i)
34
- when 0x20000..0x2FFFF then uv2(i)
35
- when 0x30000..0x3134A then 13.0
36
- end
37
- end
38
-
39
- # return total strokes of the character char
40
- # @param char [String] the character
41
- # @return [Integer] the total strokes
42
- def strokes(char)
43
- @strokes[char]
33
+ ver_bsearch(i, 0, @vers.size-1)
44
34
  end
45
35
 
46
36
  private
47
37
 
48
- def self.uv0(i)
49
- case i
50
- when 0..0x0FFF then uv00(i)
51
- when 0x1E00..0x1EFF
52
- case i
53
- when 0x1E00..0x1E9A then 1.1
54
- when 0x1E9B then 2.0
55
- when 0x1E9C..0x1E9F then 5.1
56
- when 0x1EA0..0x1EF9 then 1.1
57
- when 0x1EFA..0x1EFF then 5.1
58
- end
59
- when 0x2000..0x2FFF then uv02(i)
60
- when 0x3000..0x33FF then uv03(i)
61
- when 0x3400..0x9FA5
62
- case i
63
- when 0x3400..0x4DB5 then 3.0 # CJK Extension A 中日韓統一表意文字擴充 A 區
64
- when 0x4DB6..0x4DBF then 13.0 # Extension A
65
- when 0x4E00..0x9FA5 then 1.1 # CJK Unified Ideographs
66
- end
67
- when 0x9FA6..0x9FFF
68
- case i
69
- when 0x9FA6..0x9FBB then 4.1 # CJK Unified Ideographs
70
- when 0x9FBC..0x9FC3 then 5.1 # CJK Unified Ideographs
71
- when 0x9FC4..0x9FCB then 5.2 # CJK Unified Ideographs
72
- when 0x9FCC then 6.1 # CJK Unified Ideographs
73
- when 0x9FCD..0x9FD5 then 8.0 # CJK Unified Ideographs
74
- when 0x9FD6..0x9FEA then 10.0 # CJK Unified Ideographs
75
- when 0x9FEB..0x9FEF then 11.0 # CJK Unified Ideographs
76
- when 0x9FF0..0x9FFC then 13.0 # CJK Unified Ideographs
77
- end
78
- when 0xA000..0xDFFF
79
- case i
80
- when 0xA000..0xA48C then 3.0 # Yi Syllables 彝族文字區
81
- when 0xAC00..0xD7A3 then 2.0 # Hangul Syllables 韓文拼音
82
- end
83
- when 0xF000..0xFFFF
84
- case i
85
- when 0xF900..0xFA2D then 1.1 # CJK Compatibility Ideographs
86
- when 0xFA2E..0xFA2F then 6.1 # CJK Compatibility Ideographs
87
- when 0xFA30..0xFA6A then 3.2 # CJK Compatibility Ideographs
88
- when 0xFA6B..0xFA6D then 5.2 # CJK Compatibility Ideographs
89
- when 0xFA70..0xFAD9 then 4.1 # CJK Compatibility Ideographs
90
- when 0xFE10..0xFE19 then 4.1 # Vertical Forms 中文直排標點
91
- when 0xFE20..0xFE23 then 1.1 # Combining Half Marks
92
- when 0xFE24..0xFE26 then 5.1 # Combining Half Marks
93
- when 0xFE30..0xFE44 then 1.0 # CJK Compatibility Forms 兼容性表格
94
- when 0xFE45..0xFE46 then 3.2 # CJK Compatibility Forms 兼容性表格
95
- when 0xFE47..0xFE48 then 4.0 # CJK Compatibility Forms 兼容性表格
96
- when 0xFE49..0xFE4F then 1.0 # CJK Compatibility Forms 兼容性表格
97
- when 0xFE50..0xFE52 then 1.0 # Small Form Variants
98
- when 0xFE54..0xFE66 then 1.0 # Small Form Variants
99
- when 0xFE68..0xFE6B then 1.0 # Small Form Variants
100
- when 0xFF01..0xFF5E then 1.0 # Halfwidth and Fullwidth Forms
101
- when 0xFF5F..0xFF60 then 3.2 # Halfwidth and Fullwidth Forms
102
- when 0xFF61..0xFF9F then 1.0 # Halfwidth and Fullwidth Forms
103
- end
104
- end
105
- end
106
-
107
- def self.uv00(i)
108
- case i
109
- when 0..0x01FF
110
- case i
111
- when 0..0x017E then 1.0
112
- when 0x017F then 1.1
113
- when 0x0180..0x01F0 then 1.0
114
- when 0x01F1..0x01F5 then 1.1
115
- when 0x01F6..0x01F9 then 3.0
116
- when 0x01FA..0x0217 then 1.1
117
- end
118
- when 0x0200..0x02FF
119
- case i
120
- when 0x0218..0x021F then 3.0
121
- when 0x0220 then 3.2
122
- when 0x0221 then 4.0
123
- when 0x0222..0x0233 then 3.0
124
- when 0x0234..0x0236 then 4.0
125
- when 0x0237..0x0241 then 4.1
126
- when 0x0242..0x024F then 5.0
127
- when 0x0250..0x02A8 then 1.0
128
- when 0x02A9..0x02AD then 3.0
129
- when 0x02AE..0x02AF then 4.0
130
- when 0x02B0..0x02DE then 1.0
131
- when 0x02DF then 3.0
132
- when 0x02E0..0x02E9 then 1.0
133
- when 0x02EA..0x02EE then 3.0
134
- when 0x02EF..0x02FF then 4.0
135
- end
136
- when 0x0300..0x0341 then 1.0
137
- when 0x0400..0x04FF
138
- case i
139
- when 0x0401..0x040C then 1.0
140
- when 0x040D then 3.0
141
- when 0x040E..0x044F then 1.0
38
+ def read_strokes
39
+ fn = File.join(DATA_DIR, 'Unihan_DictionaryLikeData.txt')
40
+ @strokes = {}
41
+ File.foreach(fn) do |line|
42
+ next if line.start_with? '#'
43
+ line.chomp!
44
+ cells = line.split("\t")
45
+ if cells[1] == 'kTotalStrokes'
46
+ c = [cells[0].sub(/^U\+(.*)$/, '\1').hex].pack('U')
47
+ i = cells[2].to_i
48
+ @strokes[c] = i
142
49
  end
143
50
  end
144
51
  end
145
52
 
146
- def self.uv02(i)
147
- case i
148
- when 0x2000..0x20FF
149
- case i
150
- when 0x2000..0x202E then 1.0
151
- when 0x2045..0x2046 then 1.1
152
- when 0x2047 then 3.2
153
- when 0x2048..0x204F then 3.0
154
- end
155
- when 0x2100..0x21FF
156
- case i
157
- when 0x2100..0x2138 then 1.0
158
- when 0x2153..0x2182 then 1.0
159
- when 0x2190..0x21EA then 1.0
160
- end
161
- when 0x2200..0x22F1 then 1.0
162
- when 0x2400..0x24FF
163
- case i
164
- when 0x2460..0x24EA then 1.0
165
- when 0x24EB..0x24FE then 3.2
166
- when 0x24FF then 4.0
167
- end
168
- when 0x2500..0x25FF
169
- case i
170
- when 0x2500..0x2595 then 1.0
171
- when 0x2596..0x259F then 3.2
172
- when 0x25A0..0x25EE then 1.0
173
- when 0x25EF then 1.1
174
- end
175
- when 0x2600..0x26FF
176
- case i
177
- when 0x2600..0x2613 then 1.0
178
- when 0x261A..0x266F then 1.0
179
- end
180
- when 0x2E00..0x2FFF
181
- case i
182
- when 0x2E80..0x2EF3 then 3.0
183
- when 0x2F00..0x2FD5 then 3.0
184
- when 0x2FF0..0x2FFB then 3.0
185
- end
53
+ def read_version
54
+ @vers = []
55
+ fn = File.join(DATA_DIR, 'unicode-chars-ver.csv')
56
+ CSV.foreach(fn, headers: true) do |row|
57
+ @vers << {
58
+ range: (row['cp1'].hex..row['cp2'].hex),
59
+ age: row['age'].to_f
60
+ }
186
61
  end
187
62
  end
188
63
 
189
- def self.uv03(i)
190
- case i
191
- when 0x3000..0x30FF
192
- case i
193
- when 0x3000..0x3036 then 1.0 # CJK Symbols and Punctuation
194
- when 0x3037 then 1.1 # CJK Symbols and Punctuation
195
- when 0x3038..0x303A then 3.0 # CJK Symbols and Punctuation
196
- when 0x303B..0x303D then 3.2 # CJK Symbols and Punctuation
197
- when 0x303E then 3.0 # CJK Symbols and Punctuation
198
- when 0x303F then 1.0 # CJK Symbols and Punctuation
199
- when 0x3041..0x3094 then 1.0 # Hiragana 日文平假名
200
- when 0x3095..0x3096 then 3.2 # Hiragana 日文平假名
201
- when 0x3099..0x309E then 1.0 # Hiragana 日文平假名
202
- when 0x309F then 3.2 # Hiragana 日文平假名
203
- when 0x30A0 then 3.2 # Katakana 日文片假名
204
- when 0x30A1..0x30F6 then 1.0 # Katakana 日文片假名
205
- when 0x30F7..0x30FA then 1.1 # Katakana 日文片假名
206
- when 0x30FB..0x30FE then 1.0 # Katakana 日文片假名
207
- when 0x30FF then 3.2 # Katakana 日文片假名 (Unicode 3.2)
208
- end
209
- when 0x3100..0x31FF
210
- case i
211
- when 0x3105..0x312C then 1.0 # Bopomofo 注音符號
212
- when 0x312D then 5.1 # Bopomofo 上下顛倒的 'ㄓ'
213
- when 0x3131..0x318E then 1.0 # Hangul Compatibility Jamo 韓文
214
- when 0x3190..0x319F then 1.0 # Kanbun 在上方的小漢字
215
- when 0x31A0..0x31B7 then 3.0 # Bopomofo Extended 注音擴展
216
- when 0x31B8..0x31BA then 6.0 # Bopomofo Extended 注音擴展
217
- when 0x31C0..0x31CF then 4.1 # CJK Strokes 筆劃 (基本筆劃, 如撇, 勾, 點...)
218
- when 0x31D0..0x31E3 then 5.1 # CJK Strokes 筆劃 (基本筆劃, 如撇, 勾, 點...)
219
- when 0x31F0..0x31FF then 3.2 # Katakana Phonetic Extensions 日文片假名語音擴展
220
- end
221
- when 0x3200..0x32FF
222
- case i
223
- when 0x3200..0x321C then 1.0 # Enclosed CJK Letters and Months 括號韓文
224
- when 0x321D..0x321E then 4.0 # Enclosed CJK Letters and Months 括號韓文
225
- when 0x3220..0x3243 then 1.0 # Enclosed CJK Letters and Months 括號一~十及漢字
226
- when 0x3244..0x324F then 5.2 # Enclosed CJK Letters and Months 圓圈中有字及10~80
227
- when 0x3250 then 4.0 # Enclosed CJK Letters and Months 'PTE' 組成一字
228
- when 0x3251..0x325F then 3.2 # Enclosed CJK Letters and Months 圓圈 21~35
229
- when 0x3260..0x327B then 1.0 # Enclosed CJK Letters and Months 圓圈韓文
230
- when 0x327C..0x327D then 4.0 # Enclosed CJK Letters and Months 圓圈韓文
231
- when 0x327E then 4.1 # Enclosed CJK Letters and Months 圓圈韓文
232
- when 0x327F..0x32B0 then 1.0 # Enclosed CJK Letters and Months 圓圈一~十及漢字
233
- when 0x32B1..0x32BF then 3.2 # Enclosed CJK Letters and Months 圓圈 36~50
234
- when 0x32C0..0x32CB then 1.1 # Enclosed CJK Letters and Months 1月~12月
235
- when 0x32CC..0x32CF then 4.0 # Enclosed CJK Letters and Months 多英文組成一個字
236
- when 0x32D0..0x32FE then 1.0 # Enclosed CJK Letters and Months 圓圈日文
237
- end
238
- when 0x3300..0x33FF
239
- case i
240
- when 0x3300..0x3357 then 1.0 # CJK Compatibility 多個日文組成一字
241
- when 0x3358..0x3376 then 1.1 # CJK Compatibility 0点~24点 及多英文組成一字
242
- when 0x3377..0x337A then 4.0 # CJK Compatibility 多英文組成一字
243
- when 0x337B..0x33DD then 1.0 # CJK Compatibility 多日本漢字及多英文組成一字
244
- when 0x33DE..0x33DF then 4.0 # CJK Compatibility 多英文組成一字
245
- when 0x33E0..0x33FE then 1.1 # CJK Compatibility 1日~31日
246
- when 0x33FF then 4.0 # CJK Compatibility 'gal' 組成一字
64
+ def ver_bsearch(code, start, stop)
65
+ return nil if start > stop
66
+ middle = (stop - start) / 2 + start
67
+ h = @vers[middle]
68
+ if h[:range].include?(code)
69
+ return h[:age]
70
+ elsif middle == start
71
+ return nil if code < h[:range].begin
72
+ return ver_bsearch(code, middle+1, stop)
73
+ else
74
+ if code < h[:range].begin
75
+ return ver_bsearch(code, start, middle)
76
+ else
77
+ return ver_bsearch(code, middle, stop)
247
78
  end
248
79
  end
249
80
  end
250
81
 
251
- def self.uv2(i)
252
- case i
253
- when 0x20000..0x2A6D6 then 3.1 # Extension B
254
- when 0x2A6D7..0x2A6DD then 13.0 # Extension B
255
- when 0x2A700..0x2B734 then 5.2 # extension C
256
- when 0x2B740..0x2B81D then 6.0 # extension D
257
- when 0x2B820..0x2CEA1 then 8.0 # extension E
258
- when 0x2CEB0..0x2EBE0 then 10.0 # extension F
259
- when 0x2F800..0x2FA1D then 3.1 # Unicode 3.1: CJK Compatibility Supplement
260
- end
261
- end
262
-
263
- private_class_method :uv0, :uv00, :uv02, :uv03, :uv2
264
-
265
82
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unihan2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ray Chou
@@ -17,6 +17,7 @@ extensions: []
17
17
  extra_rdoc_files: []
18
18
  files:
19
19
  - data/Unihan_DictionaryLikeData.txt
20
+ - data/unicode-chars-ver.csv
20
21
  - lib/unihan2.rb
21
22
  homepage: https://github.com/RayCHOU/unihan2
22
23
  licenses: