unihan2 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/data/unicode-chars-ver.csv +1145 -0
  3. data/lib/unihan2.rb +50 -233
  4. metadata +2 -1
@@ -1,24 +1,25 @@
1
+ require 'csv'
2
+
1
3
  class Unihan2
2
4
  DATA_DIR = File.join(File.dirname(__FILE__), '../data')
3
5
 
4
6
  def initialize
5
- fn = File.join(DATA_DIR, 'Unihan_DictionaryLikeData.txt')
6
- @strokes = {}
7
- File.foreach(fn) do |line|
8
- next if line.start_with? '#'
9
- line.chomp!
10
- cells = line.split("\t")
11
- if cells[1] == 'kTotalStrokes'
12
- c = [cells[0].sub(/^U\+(.*)$/, '\1').hex].pack('U')
13
- i = cells[2].to_i
14
- @strokes[c] = i
15
- end
16
- end
7
+ read_strokes
8
+ read_version
9
+ end
10
+
11
+
12
+ # return total strokes of the character char
13
+ # @param char [String] the character
14
+ # @return [Integer] the total strokes
15
+ def strokes(char)
16
+ @strokes[char]
17
17
  end
18
18
 
19
- # Listing of Characters Covered by the Unihan Database
20
- # https://www.unicode.org/reports/tr38/tr38-29.html#BlockListing
21
- def self.ver(code)
19
+ # return unicode version of specific character
20
+ # @param code [String, Integer] character or codepoint
21
+ # @return [Float] unicode version
22
+ def ver(code)
22
23
  return nil if code.nil?
23
24
 
24
25
  if code.is_a? Integer
@@ -29,237 +30,53 @@ class Unihan2
29
30
  i = code.hex
30
31
  end
31
32
 
32
- case i
33
- when 0..0xFFFF then uv0(i)
34
- when 0x20000..0x2FFFF then uv2(i)
35
- when 0x30000..0x3134A then 13.0
36
- end
37
- end
38
-
39
- # return total strokes of the character char
40
- # @param char [String] the character
41
- # @return [Integer] the total strokes
42
- def strokes(char)
43
- @strokes[char]
33
+ ver_bsearch(i, 0, @vers.size-1)
44
34
  end
45
35
 
46
36
  private
47
37
 
48
- def self.uv0(i)
49
- case i
50
- when 0..0x0FFF then uv00(i)
51
- when 0x1E00..0x1EFF
52
- case i
53
- when 0x1E00..0x1E9A then 1.1
54
- when 0x1E9B then 2.0
55
- when 0x1E9C..0x1E9F then 5.1
56
- when 0x1EA0..0x1EF9 then 1.1
57
- when 0x1EFA..0x1EFF then 5.1
58
- end
59
- when 0x2000..0x2FFF then uv02(i)
60
- when 0x3000..0x33FF then uv03(i)
61
- when 0x3400..0x9FA5
62
- case i
63
- when 0x3400..0x4DB5 then 3.0 # CJK Extension A 中日韓統一表意文字擴充 A 區
64
- when 0x4DB6..0x4DBF then 13.0 # Extension A
65
- when 0x4E00..0x9FA5 then 1.1 # CJK Unified Ideographs
66
- end
67
- when 0x9FA6..0x9FFF
68
- case i
69
- when 0x9FA6..0x9FBB then 4.1 # CJK Unified Ideographs
70
- when 0x9FBC..0x9FC3 then 5.1 # CJK Unified Ideographs
71
- when 0x9FC4..0x9FCB then 5.2 # CJK Unified Ideographs
72
- when 0x9FCC then 6.1 # CJK Unified Ideographs
73
- when 0x9FCD..0x9FD5 then 8.0 # CJK Unified Ideographs
74
- when 0x9FD6..0x9FEA then 10.0 # CJK Unified Ideographs
75
- when 0x9FEB..0x9FEF then 11.0 # CJK Unified Ideographs
76
- when 0x9FF0..0x9FFC then 13.0 # CJK Unified Ideographs
77
- end
78
- when 0xA000..0xDFFF
79
- case i
80
- when 0xA000..0xA48C then 3.0 # Yi Syllables 彝族文字區
81
- when 0xAC00..0xD7A3 then 2.0 # Hangul Syllables 韓文拼音
82
- end
83
- when 0xF000..0xFFFF
84
- case i
85
- when 0xF900..0xFA2D then 1.1 # CJK Compatibility Ideographs
86
- when 0xFA2E..0xFA2F then 6.1 # CJK Compatibility Ideographs
87
- when 0xFA30..0xFA6A then 3.2 # CJK Compatibility Ideographs
88
- when 0xFA6B..0xFA6D then 5.2 # CJK Compatibility Ideographs
89
- when 0xFA70..0xFAD9 then 4.1 # CJK Compatibility Ideographs
90
- when 0xFE10..0xFE19 then 4.1 # Vertical Forms 中文直排標點
91
- when 0xFE20..0xFE23 then 1.1 # Combining Half Marks
92
- when 0xFE24..0xFE26 then 5.1 # Combining Half Marks
93
- when 0xFE30..0xFE44 then 1.0 # CJK Compatibility Forms 兼容性表格
94
- when 0xFE45..0xFE46 then 3.2 # CJK Compatibility Forms 兼容性表格
95
- when 0xFE47..0xFE48 then 4.0 # CJK Compatibility Forms 兼容性表格
96
- when 0xFE49..0xFE4F then 1.0 # CJK Compatibility Forms 兼容性表格
97
- when 0xFE50..0xFE52 then 1.0 # Small Form Variants
98
- when 0xFE54..0xFE66 then 1.0 # Small Form Variants
99
- when 0xFE68..0xFE6B then 1.0 # Small Form Variants
100
- when 0xFF01..0xFF5E then 1.0 # Halfwidth and Fullwidth Forms
101
- when 0xFF5F..0xFF60 then 3.2 # Halfwidth and Fullwidth Forms
102
- when 0xFF61..0xFF9F then 1.0 # Halfwidth and Fullwidth Forms
103
- end
104
- end
105
- end
106
-
107
- def self.uv00(i)
108
- case i
109
- when 0..0x01FF
110
- case i
111
- when 0..0x017E then 1.0
112
- when 0x017F then 1.1
113
- when 0x0180..0x01F0 then 1.0
114
- when 0x01F1..0x01F5 then 1.1
115
- when 0x01F6..0x01F9 then 3.0
116
- when 0x01FA..0x0217 then 1.1
117
- end
118
- when 0x0200..0x02FF
119
- case i
120
- when 0x0218..0x021F then 3.0
121
- when 0x0220 then 3.2
122
- when 0x0221 then 4.0
123
- when 0x0222..0x0233 then 3.0
124
- when 0x0234..0x0236 then 4.0
125
- when 0x0237..0x0241 then 4.1
126
- when 0x0242..0x024F then 5.0
127
- when 0x0250..0x02A8 then 1.0
128
- when 0x02A9..0x02AD then 3.0
129
- when 0x02AE..0x02AF then 4.0
130
- when 0x02B0..0x02DE then 1.0
131
- when 0x02DF then 3.0
132
- when 0x02E0..0x02E9 then 1.0
133
- when 0x02EA..0x02EE then 3.0
134
- when 0x02EF..0x02FF then 4.0
135
- end
136
- when 0x0300..0x0341 then 1.0
137
- when 0x0400..0x04FF
138
- case i
139
- when 0x0401..0x040C then 1.0
140
- when 0x040D then 3.0
141
- when 0x040E..0x044F then 1.0
38
+ def read_strokes
39
+ fn = File.join(DATA_DIR, 'Unihan_DictionaryLikeData.txt')
40
+ @strokes = {}
41
+ File.foreach(fn) do |line|
42
+ next if line.start_with? '#'
43
+ line.chomp!
44
+ cells = line.split("\t")
45
+ if cells[1] == 'kTotalStrokes'
46
+ c = [cells[0].sub(/^U\+(.*)$/, '\1').hex].pack('U')
47
+ i = cells[2].to_i
48
+ @strokes[c] = i
142
49
  end
143
50
  end
144
51
  end
145
52
 
146
- def self.uv02(i)
147
- case i
148
- when 0x2000..0x20FF
149
- case i
150
- when 0x2000..0x202E then 1.0
151
- when 0x2045..0x2046 then 1.1
152
- when 0x2047 then 3.2
153
- when 0x2048..0x204F then 3.0
154
- end
155
- when 0x2100..0x21FF
156
- case i
157
- when 0x2100..0x2138 then 1.0
158
- when 0x2153..0x2182 then 1.0
159
- when 0x2190..0x21EA then 1.0
160
- end
161
- when 0x2200..0x22F1 then 1.0
162
- when 0x2400..0x24FF
163
- case i
164
- when 0x2460..0x24EA then 1.0
165
- when 0x24EB..0x24FE then 3.2
166
- when 0x24FF then 4.0
167
- end
168
- when 0x2500..0x25FF
169
- case i
170
- when 0x2500..0x2595 then 1.0
171
- when 0x2596..0x259F then 3.2
172
- when 0x25A0..0x25EE then 1.0
173
- when 0x25EF then 1.1
174
- end
175
- when 0x2600..0x26FF
176
- case i
177
- when 0x2600..0x2613 then 1.0
178
- when 0x261A..0x266F then 1.0
179
- end
180
- when 0x2E00..0x2FFF
181
- case i
182
- when 0x2E80..0x2EF3 then 3.0
183
- when 0x2F00..0x2FD5 then 3.0
184
- when 0x2FF0..0x2FFB then 3.0
185
- end
53
+ def read_version
54
+ @vers = []
55
+ fn = File.join(DATA_DIR, 'unicode-chars-ver.csv')
56
+ CSV.foreach(fn, headers: true) do |row|
57
+ @vers << {
58
+ range: (row['cp1'].hex..row['cp2'].hex),
59
+ age: row['age'].to_f
60
+ }
186
61
  end
187
62
  end
188
63
 
189
- def self.uv03(i)
190
- case i
191
- when 0x3000..0x30FF
192
- case i
193
- when 0x3000..0x3036 then 1.0 # CJK Symbols and Punctuation
194
- when 0x3037 then 1.1 # CJK Symbols and Punctuation
195
- when 0x3038..0x303A then 3.0 # CJK Symbols and Punctuation
196
- when 0x303B..0x303D then 3.2 # CJK Symbols and Punctuation
197
- when 0x303E then 3.0 # CJK Symbols and Punctuation
198
- when 0x303F then 1.0 # CJK Symbols and Punctuation
199
- when 0x3041..0x3094 then 1.0 # Hiragana 日文平假名
200
- when 0x3095..0x3096 then 3.2 # Hiragana 日文平假名
201
- when 0x3099..0x309E then 1.0 # Hiragana 日文平假名
202
- when 0x309F then 3.2 # Hiragana 日文平假名
203
- when 0x30A0 then 3.2 # Katakana 日文片假名
204
- when 0x30A1..0x30F6 then 1.0 # Katakana 日文片假名
205
- when 0x30F7..0x30FA then 1.1 # Katakana 日文片假名
206
- when 0x30FB..0x30FE then 1.0 # Katakana 日文片假名
207
- when 0x30FF then 3.2 # Katakana 日文片假名 (Unicode 3.2)
208
- end
209
- when 0x3100..0x31FF
210
- case i
211
- when 0x3105..0x312C then 1.0 # Bopomofo 注音符號
212
- when 0x312D then 5.1 # Bopomofo 上下顛倒的 'ㄓ'
213
- when 0x3131..0x318E then 1.0 # Hangul Compatibility Jamo 韓文
214
- when 0x3190..0x319F then 1.0 # Kanbun 在上方的小漢字
215
- when 0x31A0..0x31B7 then 3.0 # Bopomofo Extended 注音擴展
216
- when 0x31B8..0x31BA then 6.0 # Bopomofo Extended 注音擴展
217
- when 0x31C0..0x31CF then 4.1 # CJK Strokes 筆劃 (基本筆劃, 如撇, 勾, 點...)
218
- when 0x31D0..0x31E3 then 5.1 # CJK Strokes 筆劃 (基本筆劃, 如撇, 勾, 點...)
219
- when 0x31F0..0x31FF then 3.2 # Katakana Phonetic Extensions 日文片假名語音擴展
220
- end
221
- when 0x3200..0x32FF
222
- case i
223
- when 0x3200..0x321C then 1.0 # Enclosed CJK Letters and Months 括號韓文
224
- when 0x321D..0x321E then 4.0 # Enclosed CJK Letters and Months 括號韓文
225
- when 0x3220..0x3243 then 1.0 # Enclosed CJK Letters and Months 括號一~十及漢字
226
- when 0x3244..0x324F then 5.2 # Enclosed CJK Letters and Months 圓圈中有字及10~80
227
- when 0x3250 then 4.0 # Enclosed CJK Letters and Months 'PTE' 組成一字
228
- when 0x3251..0x325F then 3.2 # Enclosed CJK Letters and Months 圓圈 21~35
229
- when 0x3260..0x327B then 1.0 # Enclosed CJK Letters and Months 圓圈韓文
230
- when 0x327C..0x327D then 4.0 # Enclosed CJK Letters and Months 圓圈韓文
231
- when 0x327E then 4.1 # Enclosed CJK Letters and Months 圓圈韓文
232
- when 0x327F..0x32B0 then 1.0 # Enclosed CJK Letters and Months 圓圈一~十及漢字
233
- when 0x32B1..0x32BF then 3.2 # Enclosed CJK Letters and Months 圓圈 36~50
234
- when 0x32C0..0x32CB then 1.1 # Enclosed CJK Letters and Months 1月~12月
235
- when 0x32CC..0x32CF then 4.0 # Enclosed CJK Letters and Months 多英文組成一個字
236
- when 0x32D0..0x32FE then 1.0 # Enclosed CJK Letters and Months 圓圈日文
237
- end
238
- when 0x3300..0x33FF
239
- case i
240
- when 0x3300..0x3357 then 1.0 # CJK Compatibility 多個日文組成一字
241
- when 0x3358..0x3376 then 1.1 # CJK Compatibility 0点~24点 及多英文組成一字
242
- when 0x3377..0x337A then 4.0 # CJK Compatibility 多英文組成一字
243
- when 0x337B..0x33DD then 1.0 # CJK Compatibility 多日本漢字及多英文組成一字
244
- when 0x33DE..0x33DF then 4.0 # CJK Compatibility 多英文組成一字
245
- when 0x33E0..0x33FE then 1.1 # CJK Compatibility 1日~31日
246
- when 0x33FF then 4.0 # CJK Compatibility 'gal' 組成一字
64
+ def ver_bsearch(code, start, stop)
65
+ return nil if start > stop
66
+ middle = (stop - start) / 2 + start
67
+ h = @vers[middle]
68
+ if h[:range].include?(code)
69
+ return h[:age]
70
+ elsif middle == start
71
+ return nil if code < h[:range].begin
72
+ return ver_bsearch(code, middle+1, stop)
73
+ else
74
+ if code < h[:range].begin
75
+ return ver_bsearch(code, start, middle)
76
+ else
77
+ return ver_bsearch(code, middle, stop)
247
78
  end
248
79
  end
249
80
  end
250
81
 
251
- def self.uv2(i)
252
- case i
253
- when 0x20000..0x2A6D6 then 3.1 # Extension B
254
- when 0x2A6D7..0x2A6DD then 13.0 # Extension B
255
- when 0x2A700..0x2B734 then 5.2 # extension C
256
- when 0x2B740..0x2B81D then 6.0 # extension D
257
- when 0x2B820..0x2CEA1 then 8.0 # extension E
258
- when 0x2CEB0..0x2EBE0 then 10.0 # extension F
259
- when 0x2F800..0x2FA1D then 3.1 # Unicode 3.1: CJK Compatibility Supplement
260
- end
261
- end
262
-
263
- private_class_method :uv0, :uv00, :uv02, :uv03, :uv2
264
-
265
82
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unihan2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ray Chou
@@ -17,6 +17,7 @@ extensions: []
17
17
  extra_rdoc_files: []
18
18
  files:
19
19
  - data/Unihan_DictionaryLikeData.txt
20
+ - data/unicode-chars-ver.csv
20
21
  - lib/unihan2.rb
21
22
  homepage: https://github.com/RayCHOU/unihan2
22
23
  licenses: