unicode 0.4.4.3-x86-mswin32-60 → 0.4.4.4-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +3 -3
- data/lib/unicode.rb +116 -5
- data/unicode.gemspec +1 -1
- metadata +25 -15
- data/ext/unicode/extconf.rb +0 -3
- data/ext/unicode/unicode.c +0 -1325
- data/ext/unicode/unidata.map +0 -24555
- data/ext/unicode/ustring.c +0 -208
- data/ext/unicode/ustring.h +0 -48
- data/ext/unicode/wstring.c +0 -257
- data/ext/unicode/wstring.h +0 -43
- data/lib/unicode/1.9/unicode_native.so +0 -0
- data/lib/unicode/2.0/unicode_native.so +0 -0
- data/lib/unicode/2.1/unicode_native.so +0 -0
- data/lib/unicode/2.2/unicode_native.so +0 -0
data/ext/unicode/unicode.c
DELETED
@@ -1,1325 +0,0 @@
|
|
1
|
-
/*
|
2
|
-
* Unicode Library version 0.4.3
|
3
|
-
* Aug 8, 2012: version 0.4
|
4
|
-
* Oct 14, 2010: version 0.4
|
5
|
-
* Feb 26, 2010: version 0.3
|
6
|
-
* Dec 29, 2009: version 0.2
|
7
|
-
* Nov 23, 1999 yoshidam
|
8
|
-
*
|
9
|
-
*/
|
10
|
-
|
11
|
-
#define UNICODE_VERSION "0.4.3"
|
12
|
-
|
13
|
-
#include "ruby.h"
|
14
|
-
#ifdef HAVE_RUBY_IO_H
|
15
|
-
# include "ruby/io.h"
|
16
|
-
#else
|
17
|
-
# include "rubyio.h"
|
18
|
-
#endif
|
19
|
-
#include <stdio.h>
|
20
|
-
#include "wstring.h"
|
21
|
-
#include "unidata.map"
|
22
|
-
|
23
|
-
#ifndef RSTRING_PTR
|
24
|
-
# define RSTRING_PTR(s) (RSTRING(s)->ptr)
|
25
|
-
# define RSTRING_LEN(s) (RSTRING(s)->len)
|
26
|
-
#endif
|
27
|
-
|
28
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
29
|
-
static rb_encoding* enc_out;
|
30
|
-
# define ENC_(o) (rb_enc_associate(o, enc_out))
|
31
|
-
#else
|
32
|
-
# define ENC_(o) (o)
|
33
|
-
#endif
|
34
|
-
|
35
|
-
inline static VALUE
|
36
|
-
taintObject(VALUE src, VALUE obj) {
|
37
|
-
if (OBJ_TAINTED(src))
|
38
|
-
OBJ_TAINT(obj);
|
39
|
-
return obj;
|
40
|
-
}
|
41
|
-
#define TO_(src, obj) (taintObject(src, obj))
|
42
|
-
|
43
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
44
|
-
/*
 * Re-encode the string held in the lvalue `str` to enc_out unless it is
 * already tagged UTF-8 or US-ASCII. `str` is reassigned in place, so the
 * argument must be a plain variable.
 */
# define CONVERT_TO_UTF8(str) do { \
    int enc_idx = ENCODING_GET(str); \
    volatile VALUE target_enc; \
    if (!(enc_idx == rb_utf8_encindex() || \
          enc_idx == rb_usascii_encindex())) { \
      target_enc = rb_enc_from_encoding(enc_out); \
      str = rb_str_encode(str, target_enc, 0, Qnil); \
    } \
  } while (0)
|
53
|
-
#endif
|
54
|
-
|
55
|
-
static VALUE mUnicode;
|
56
|
-
static VALUE unicode_data;
|
57
|
-
static VALUE composition_table;
|
58
|
-
static VALUE catname_long[c_Cn+1];
|
59
|
-
static VALUE catname_abbr[c_Cn+1];
|
60
|
-
|
61
|
-
/* Hangul */
|
62
|
-
#define SBASE (0xac00)
|
63
|
-
#define LBASE (0x1100)
|
64
|
-
#define LCOUNT (19)
|
65
|
-
#define VBASE (0x1161)
|
66
|
-
#define VCOUNT (21)
|
67
|
-
#define TBASE (0x11a7)
|
68
|
-
#define TCOUNT (28)
|
69
|
-
#define NCOUNT (VCOUNT * TCOUNT) /* 588 */
|
70
|
-
#define SCOUNT (LCOUNT * NCOUNT) /* 11172 */
|
71
|
-
|
72
|
-
VALUE
|
73
|
-
get_unidata(int ucs) {
|
74
|
-
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
75
|
-
if (!NIL_P(ch))
|
76
|
-
return ch;
|
77
|
-
#ifdef CJK_IDEOGRAPH_EXTENSION_A_FIRST
|
78
|
-
else if (ucs >= CJK_IDEOGRAPH_EXTENSION_A_FIRST &&
|
79
|
-
ucs <= CJK_IDEOGRAPH_EXTENSION_A_LAST)
|
80
|
-
return rb_hash_aref(unicode_data,
|
81
|
-
INT2FIX(CJK_IDEOGRAPH_EXTENSION_A_FIRST));
|
82
|
-
#endif
|
83
|
-
#ifdef CJK_IDEOGRAPH_FIRST
|
84
|
-
else if (ucs >= CJK_IDEOGRAPH_FIRST &&
|
85
|
-
ucs <= CJK_IDEOGRAPH_LAST)
|
86
|
-
return rb_hash_aref(unicode_data,
|
87
|
-
INT2FIX(CJK_IDEOGRAPH_FIRST));
|
88
|
-
#endif
|
89
|
-
#ifdef HANGUL_SYLLABLE_FIRST
|
90
|
-
else if (ucs >= HANGUL_SYLLABLE_FIRST &&
|
91
|
-
ucs <= HANGUL_SYLLABLE_LAST)
|
92
|
-
return rb_hash_aref(unicode_data,
|
93
|
-
INT2FIX(HANGUL_SYLLABLE_FIRST));
|
94
|
-
#endif
|
95
|
-
#ifdef NON_PRIVATE_USE_HIGH_SURROGATE_FIRST
|
96
|
-
else if (ucs >= NON_PRIVATE_USE_HIGH_SURROGATE_FIRST &&
|
97
|
-
ucs <= NON_PRIVATE_USE_HIGH_SURROGATE_LAST)
|
98
|
-
return rb_hash_aref(unicode_data,
|
99
|
-
INT2FIX(NON_PRIVATE_USE_HIGH_SURROGATE_FIRST));
|
100
|
-
#endif
|
101
|
-
#ifdef PRIVATE_USE_HIGH_SURROGATE_FIRST
|
102
|
-
else if (ucs >= PRIVATE_USE_HIGH_SURROGATE_FIRST &&
|
103
|
-
ucs <= PRIVATE_USE_HIGH_SURROGATE_LAST)
|
104
|
-
return rb_hash_aref(unicode_data,
|
105
|
-
INT2FIX(PRIVATE_USE_HIGH_SURROGATE_FIRST));
|
106
|
-
#endif
|
107
|
-
#ifdef LOW_SURROGATE_FIRST
|
108
|
-
else if (ucs >= LOW_SURROGATE_FIRST &&
|
109
|
-
ucs <= LOW_SURROGATE_LAST)
|
110
|
-
return rb_hash_aref(unicode_data,
|
111
|
-
INT2FIX(LOW_SURROGATE_FIRST));
|
112
|
-
#endif
|
113
|
-
#ifdef PRIVATE_USE_FIRST
|
114
|
-
else if (ucs >= PRIVATE_USE_FIRST &&
|
115
|
-
ucs <= PRIVATE_USE_LAST)
|
116
|
-
return rb_hash_aref(unicode_data,
|
117
|
-
INT2FIX(PRIVATE_USE_FIRST));
|
118
|
-
#endif
|
119
|
-
#ifdef CJK_IDEOGRAPH_EXTENSION_B_FIRST
|
120
|
-
else if (ucs >= CJK_IDEOGRAPH_EXTENSION_B_FIRST &&
|
121
|
-
ucs <= CJK_IDEOGRAPH_EXTENSION_B_LAST)
|
122
|
-
return rb_hash_aref(unicode_data,
|
123
|
-
INT2FIX(CJK_IDEOGRAPH_EXTENSION_B_FIRST));
|
124
|
-
#endif
|
125
|
-
#ifdef CJK_IDEOGRAPH_EXTENSION_C_FIRST
|
126
|
-
else if (ucs >= CJK_IDEOGRAPH_EXTENSION_C_FIRST &&
|
127
|
-
ucs <= CJK_IDEOGRAPH_EXTENSION_C_LAST)
|
128
|
-
return rb_hash_aref(unicode_data,
|
129
|
-
INT2FIX(CJK_IDEOGRAPH_EXTENSION_C_FIRST));
|
130
|
-
#endif
|
131
|
-
#ifdef CJK_IDEOGRAPH_EXTENSION_D_FIRST
|
132
|
-
else if (ucs >= CJK_IDEOGRAPH_EXTENSION_D_FIRST &&
|
133
|
-
ucs <= CJK_IDEOGRAPH_EXTENSION_D_LAST)
|
134
|
-
return rb_hash_aref(unicode_data,
|
135
|
-
INT2FIX(CJK_IDEOGRAPH_EXTENSION_D_FIRST));
|
136
|
-
#endif
|
137
|
-
#ifdef PLANE_15_PRIVATE_USE_FIRST
|
138
|
-
else if (ucs >= PLANE_15_PRIVATE_USE_FIRST &&
|
139
|
-
ucs <= PLANE_15_PRIVATE_USE_LAST)
|
140
|
-
return rb_hash_aref(unicode_data,
|
141
|
-
INT2FIX(PLANE_15_PRIVATE_USE_FIRST));
|
142
|
-
#endif
|
143
|
-
#ifdef PLANE_16_PRIVATE_USE_FIRST
|
144
|
-
else if (ucs >= PLANE_16_PRIVATE_USE_FIRST &&
|
145
|
-
ucs <= PLANE_16_PRIVATE_USE_LAST)
|
146
|
-
return rb_hash_aref(unicode_data,
|
147
|
-
INT2FIX(PLANE_16_PRIVATE_USE_FIRST));
|
148
|
-
#endif
|
149
|
-
return Qnil;
|
150
|
-
}
|
151
|
-
|
152
|
-
static int
|
153
|
-
get_cc(int ucs)
|
154
|
-
{
|
155
|
-
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
156
|
-
|
157
|
-
if (!NIL_P(ch)) {
|
158
|
-
return unidata[FIX2INT(ch)].combining_class;
|
159
|
-
}
|
160
|
-
return 0;
|
161
|
-
}
|
162
|
-
|
163
|
-
static int
|
164
|
-
get_gencat(int ucs)
|
165
|
-
{
|
166
|
-
VALUE ch = get_unidata(ucs);
|
167
|
-
|
168
|
-
if (!NIL_P(ch)) {
|
169
|
-
return unidata[FIX2INT(ch)].general_category;
|
170
|
-
}
|
171
|
-
return c_Cn; /* Unassigned */
|
172
|
-
}
|
173
|
-
|
174
|
-
static int
|
175
|
-
get_eawidth(int ucs)
|
176
|
-
{
|
177
|
-
VALUE ch = get_unidata(ucs);
|
178
|
-
|
179
|
-
if (!NIL_P(ch)) {
|
180
|
-
return unidata[FIX2INT(ch)].east_asian_width;
|
181
|
-
}
|
182
|
-
return w_N; /* Neutral */
|
183
|
-
}
|
184
|
-
|
185
|
-
static const char*
|
186
|
-
get_canon(int ucs)
|
187
|
-
{
|
188
|
-
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
189
|
-
|
190
|
-
if (!NIL_P(ch)) {
|
191
|
-
return unidata[FIX2INT(ch)].canon;
|
192
|
-
}
|
193
|
-
return NULL;
|
194
|
-
}
|
195
|
-
|
196
|
-
static const char*
|
197
|
-
get_canon_ex(int ucs)
|
198
|
-
{
|
199
|
-
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
200
|
-
|
201
|
-
if (!NIL_P(ch)) {
|
202
|
-
int i = FIX2INT(ch);
|
203
|
-
if (!unidata[i].exclusion)
|
204
|
-
return unidata[i].canon;
|
205
|
-
}
|
206
|
-
return NULL;
|
207
|
-
}
|
208
|
-
|
209
|
-
static const char*
|
210
|
-
get_compat(int ucs)
|
211
|
-
{
|
212
|
-
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
213
|
-
|
214
|
-
if (!NIL_P(ch)) {
|
215
|
-
return unidata[FIX2INT(ch)].compat;
|
216
|
-
}
|
217
|
-
return NULL;
|
218
|
-
}
|
219
|
-
|
220
|
-
static const char*
|
221
|
-
get_uppercase(int ucs)
|
222
|
-
{
|
223
|
-
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
224
|
-
|
225
|
-
if (!NIL_P(ch)) {
|
226
|
-
return unidata[FIX2INT(ch)].uppercase;
|
227
|
-
}
|
228
|
-
return NULL;
|
229
|
-
}
|
230
|
-
|
231
|
-
static const char*
|
232
|
-
get_lowercase(int ucs)
|
233
|
-
{
|
234
|
-
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
235
|
-
|
236
|
-
if (!NIL_P(ch)) {
|
237
|
-
return unidata[FIX2INT(ch)].lowercase;
|
238
|
-
}
|
239
|
-
return NULL;
|
240
|
-
}
|
241
|
-
|
242
|
-
static const char*
|
243
|
-
get_titlecase(int ucs)
|
244
|
-
{
|
245
|
-
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
246
|
-
|
247
|
-
if (!NIL_P(ch)) {
|
248
|
-
return unidata[FIX2INT(ch)].titlecase;
|
249
|
-
}
|
250
|
-
return NULL;
|
251
|
-
}
|
252
|
-
|
253
|
-
static int
|
254
|
-
get_composition(const char* str)
|
255
|
-
{
|
256
|
-
VALUE ch = rb_hash_aref(composition_table, rb_str_new2(str));
|
257
|
-
|
258
|
-
if (!NIL_P(ch)) {
|
259
|
-
return FIX2INT(ch);
|
260
|
-
}
|
261
|
-
return -1;
|
262
|
-
}
|
263
|
-
|
264
|
-
/*
 * Reorder ustr in place so that adjacent combining marks (combining class
 * != 0) appear in non-decreasing combining-class order. Characters with
 * class 0 are never moved across. Returns ustr.
 */
static WString*
sort_canonical(WString* ustr)
{
  int pos;
  int total = ustr->len;

  if (total < 2) return ustr;

  for (pos = 1; pos < total; ) {
    int prev = ustr->str[pos - 1];
    int cur = ustr->str[pos];
    int prev_cc = get_cc(prev);
    int cur_cc = get_cc(cur);

    /* pair already in order, or one side is a starter: move on */
    if (cur_cc == 0 || prev_cc == 0 || prev_cc <= cur_cc) {
      pos++;
      continue;
    }
    /* swap, then step back to re-check the swapped mark against its new
       left neighbour */
    ustr->str[pos] = prev;
    ustr->str[pos - 1] = cur;
    if (pos > 1) pos--;
  }
  return ustr;
}
|
288
|
-
|
289
|
-
static void
|
290
|
-
decompose_hangul(int ucs, int* l, int* v, int* t)
|
291
|
-
{
|
292
|
-
int sindex = ucs - SBASE;
|
293
|
-
if (sindex < 0 || sindex >= SCOUNT) {
|
294
|
-
*l = ucs;
|
295
|
-
*v = *t = 0;
|
296
|
-
return;
|
297
|
-
}
|
298
|
-
*l = LBASE + sindex / NCOUNT;
|
299
|
-
*v = VBASE + (sindex % NCOUNT) / TCOUNT;
|
300
|
-
*t = TBASE + sindex % TCOUNT;
|
301
|
-
if (*t == TBASE) *t = 0;
|
302
|
-
}
|
303
|
-
|
304
|
-
/*
|
305
|
-
* push decomposed str into result
|
306
|
-
*/
|
307
|
-
static WString*
|
308
|
-
decompose_internal(WString* ustr, WString* result)
|
309
|
-
{
|
310
|
-
int i;
|
311
|
-
int len = ustr->len;
|
312
|
-
|
313
|
-
for (i = 0; i < len; i++) {
|
314
|
-
int ucs = ustr->str[i];
|
315
|
-
if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
|
316
|
-
int l, v, t;
|
317
|
-
decompose_hangul(ucs, &l, &v, &t);
|
318
|
-
WStr_addWChar(result, l);
|
319
|
-
if (v) WStr_addWChar(result, v);
|
320
|
-
if (t) WStr_addWChar(result, t);
|
321
|
-
}
|
322
|
-
else {
|
323
|
-
const char* dc = get_canon(ucs);
|
324
|
-
if (!dc) {
|
325
|
-
WStr_addWChar(result, ucs);
|
326
|
-
}
|
327
|
-
else {
|
328
|
-
WString wdc;
|
329
|
-
WStr_allocWithUTF8(&wdc, dc);
|
330
|
-
decompose_internal(&wdc, result);
|
331
|
-
WStr_free(&wdc);
|
332
|
-
}
|
333
|
-
}
|
334
|
-
}
|
335
|
-
return result;
|
336
|
-
}
|
337
|
-
|
338
|
-
/*
|
339
|
-
* push decomposed str into result
|
340
|
-
*/
|
341
|
-
static WString*
|
342
|
-
decompose_safe_internal(WString* ustr, WString* result)
|
343
|
-
{
|
344
|
-
int i;
|
345
|
-
int len = ustr->len;
|
346
|
-
|
347
|
-
for (i = 0; i < len; i++) {
|
348
|
-
int ucs = ustr->str[i];
|
349
|
-
if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
|
350
|
-
int l, v, t;
|
351
|
-
decompose_hangul(ucs, &l, &v, &t);
|
352
|
-
WStr_addWChar(result, l);
|
353
|
-
if (v) WStr_addWChar(result, v);
|
354
|
-
if (t) WStr_addWChar(result, t);
|
355
|
-
}
|
356
|
-
else {
|
357
|
-
const char* dc = get_canon_ex(ucs);
|
358
|
-
if (!dc) {
|
359
|
-
WStr_addWChar(result, ucs);
|
360
|
-
}
|
361
|
-
else {
|
362
|
-
WString wdc;
|
363
|
-
WStr_allocWithUTF8(&wdc, dc);
|
364
|
-
decompose_safe_internal(&wdc, result);
|
365
|
-
WStr_free(&wdc);
|
366
|
-
}
|
367
|
-
}
|
368
|
-
}
|
369
|
-
return result;
|
370
|
-
}
|
371
|
-
|
372
|
-
/*
|
373
|
-
* push compatibility decomposed str into result
|
374
|
-
*/
|
375
|
-
static WString*
|
376
|
-
decompose_compat_internal(WString* ustr, WString* result)
|
377
|
-
{
|
378
|
-
int i;
|
379
|
-
int len = ustr->len;
|
380
|
-
|
381
|
-
for (i = 0; i < len; i++) {
|
382
|
-
int ucs = ustr->str[i];
|
383
|
-
if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
|
384
|
-
int l, v, t;
|
385
|
-
decompose_hangul(ucs, &l, &v, &t);
|
386
|
-
WStr_addWChar(result, l);
|
387
|
-
if (v) WStr_addWChar(result, v);
|
388
|
-
if (t) WStr_addWChar(result, t);
|
389
|
-
}
|
390
|
-
else {
|
391
|
-
const char* dc = get_compat(ucs);
|
392
|
-
if (!dc) {
|
393
|
-
WStr_addWChar(result, ucs);
|
394
|
-
}
|
395
|
-
else {
|
396
|
-
WString wdc;
|
397
|
-
WStr_allocWithUTF8(&wdc, dc);
|
398
|
-
decompose_compat_internal(&wdc, result);
|
399
|
-
WStr_free(&wdc);
|
400
|
-
}
|
401
|
-
}
|
402
|
-
}
|
403
|
-
return result;
|
404
|
-
}
|
405
|
-
|
406
|
-
|
407
|
-
/*
 * Encode code point c as UTF-8 at p and advance p past the bytes written.
 *
 * p must be a modifiable byte-pointer lvalue with room for up to 6 bytes.
 * c is evaluated several times, so it must be free of side effects.
 * Values below 0x80000000 use the 1- to 6-byte forms; larger values write
 * nothing and leave p unchanged.
 *
 * Both arguments are parenthesized in the expansion so that an expression
 * argument (e.g. `a | b`) is encoded as a whole instead of being split by
 * operator precedence.
 */
#define UCS4toUTF8(p, c) \
  do { \
    if ((c) < 128) { \
      *(p)++ = (c); \
    } \
    else if ((c) < 2048) { \
      *(p)++ = ((c) >> 6) | 192; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x10000) { \
      *(p)++ = ((c) >> 12) | 224; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x200000) { \
      *(p)++ = ((c) >> 18) | 240; \
      *(p)++ = (((c) >> 12) & 63) | 128; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x4000000) { \
      *(p)++ = ((c) >> 24) | 248; \
      *(p)++ = (((c) >> 18) & 63) | 128; \
      *(p)++ = (((c) >> 12) & 63) | 128; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x80000000) { \
      *(p)++ = ((c) >> 30) | 252; \
      *(p)++ = (((c) >> 24) & 63) | 128; \
      *(p)++ = (((c) >> 18) & 63) | 128; \
      *(p)++ = (((c) >> 12) & 63) | 128; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
  } while (0)
|
443
|
-
|
444
|
-
static int
|
445
|
-
compose_pair(unsigned int c1, unsigned int c2)
|
446
|
-
{
|
447
|
-
int ret;
|
448
|
-
char ustr[13]; /* stored two UTF-8 chars */
|
449
|
-
char *p = ustr;
|
450
|
-
|
451
|
-
/* Hangul L + V */
|
452
|
-
if (c1 >= LBASE && c1 < LBASE + LCOUNT &&
|
453
|
-
c2 >= VBASE && c2 < VBASE + VCOUNT) {
|
454
|
-
return SBASE + ((c1 - LBASE) * VCOUNT + (c2 - VBASE)) * TCOUNT;
|
455
|
-
}
|
456
|
-
/* Hangul LV + T */
|
457
|
-
else if (c1 >= SBASE && c1 < SBASE + SCOUNT &&
|
458
|
-
(c1 - SBASE) % TCOUNT == 0 &&
|
459
|
-
c2 >= TBASE && c2 < TBASE + TCOUNT) {
|
460
|
-
return c1 + (c2 - TBASE);
|
461
|
-
}
|
462
|
-
UCS4toUTF8(p, c1);
|
463
|
-
UCS4toUTF8(p, c2);
|
464
|
-
*p = '\0';
|
465
|
-
ret = get_composition(ustr);
|
466
|
-
|
467
|
-
return ret;
|
468
|
-
}
|
469
|
-
|
470
|
-
/*
|
471
|
-
* push canonical composed str into result
|
472
|
-
*/
|
473
|
-
static WString*
|
474
|
-
compose_internal(WString* ustr, WString* result)
|
475
|
-
{
|
476
|
-
int starterPos = 0;
|
477
|
-
int starterCh = ustr->str[0];
|
478
|
-
int compPos = 1;
|
479
|
-
int lastClass = get_cc(starterCh);
|
480
|
-
int oldLen = ustr->len;
|
481
|
-
int decompPos;
|
482
|
-
|
483
|
-
if (oldLen == 0) return result;
|
484
|
-
if (lastClass != 0) lastClass = 256;
|
485
|
-
/* copy string */
|
486
|
-
result->len = 0;
|
487
|
-
WStr_pushWString(result, ustr);
|
488
|
-
|
489
|
-
for (decompPos = compPos; decompPos < result->len; decompPos++) {
|
490
|
-
int ch = result->str[decompPos];
|
491
|
-
int chClass = get_cc(ch);
|
492
|
-
int composite = compose_pair(starterCh, ch);
|
493
|
-
if (composite > 0 &&
|
494
|
-
(lastClass < chClass ||lastClass == 0)) {
|
495
|
-
result->str[starterPos] = composite;
|
496
|
-
starterCh = composite;
|
497
|
-
}
|
498
|
-
else {
|
499
|
-
if (chClass == 0) {
|
500
|
-
starterPos = compPos;
|
501
|
-
starterCh = ch;
|
502
|
-
}
|
503
|
-
lastClass = chClass;
|
504
|
-
result->str[compPos] = ch;
|
505
|
-
if (result->len != oldLen) {
|
506
|
-
decompPos += result->len - oldLen;
|
507
|
-
oldLen = result->len;
|
508
|
-
}
|
509
|
-
compPos++;
|
510
|
-
}
|
511
|
-
}
|
512
|
-
result->len = compPos;
|
513
|
-
return result;
|
514
|
-
}
|
515
|
-
#if 0
/*
 * Disabled alternative implementation of compose_internal(), kept for
 * reference. It only composes a character with the immediately preceding
 * one and appends to result instead of composing in place.
 */
static WString*
compose_internal(WString* ustr, WString* result)
{
  int total = ustr->len;
  int pending;
  int pending_cc;
  int pos;

  if (total == 0) return result;

  pending = ustr->str[0];
  pending_cc = get_cc(pending);
  if (pending_cc != 0) pending_cc = 256;
  for (pos = 1; pos < total; pos++) {
    int ch = ustr->str[pos];
    int cc = get_cc(ch);
    int composite;

    if (pending_cc == 0 &&
        (composite = compose_pair(pending, ch)) >= 0) {
      pending = composite;
      pending_cc = get_cc(composite);
      continue;
    }
    WStr_addWChar(result, pending);
    pending = ch;
    pending_cc = cc;
  }
  WStr_addWChar(result, pending);

  return result;
}
#endif
|
550
|
-
|
551
|
-
/*
 * Append the uppercase form of every character of str to result.
 * Characters without an uppercase mapping are copied unchanged; a mapping
 * may expand to more than one character. Returns result.
 */
static WString*
upcase_internal(WString* str, WString* result)
{
  int pos;
  int count = str->len;

  for (pos = 0; pos < count; pos++) {
    int ucs = str->str[pos];
    const char* mapped = get_uppercase(ucs);
    WString expanded;

    if (mapped == NULL) {
      WStr_addWChar(result, ucs);
      continue;
    }
    WStr_allocWithUTF8(&expanded, mapped);
    WStr_pushWString(result, &expanded);
    WStr_free(&expanded);
  }
  return result;
}
|
572
|
-
|
573
|
-
/*
 * Append the lowercase form of every character of str to result.
 * Characters without a lowercase mapping are copied unchanged; a mapping
 * may expand to more than one character. Returns result.
 */
static WString*
downcase_internal(WString* str, WString* result)
{
  int pos;
  int count = str->len;

  for (pos = 0; pos < count; pos++) {
    int ucs = str->str[pos];
    const char* mapped = get_lowercase(ucs);
    WString expanded;

    if (mapped == NULL) {
      WStr_addWChar(result, ucs);
      continue;
    }
    WStr_allocWithUTF8(&expanded, mapped);
    WStr_pushWString(result, &expanded);
    WStr_free(&expanded);
  }
  return result;
}
|
594
|
-
|
595
|
-
/*
 * Append a capitalized copy of str to result: the first character is
 * mapped to titlecase, every following character to lowercase. Characters
 * without the relevant mapping are copied unchanged. Returns result.
 */
static WString*
capitalize_internal(WString* str, WString* result)
{
  int pos;
  int count = str->len;

  for (pos = 0; pos < count; pos++) {
    int ucs = str->str[pos];
    const char* mapped;
    WString expanded;

    if (pos == 0)
      mapped = get_titlecase(ucs);
    else
      mapped = get_lowercase(ucs);

    if (mapped == NULL) {
      WStr_addWChar(result, ucs);
      continue;
    }
    WStr_allocWithUTF8(&expanded, mapped);
    WStr_pushWString(result, &expanded);
    WStr_free(&expanded);
  }
  return result;
}
|
628
|
-
|
629
|
-
static VALUE
|
630
|
-
unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
|
631
|
-
{
|
632
|
-
WString wstr1;
|
633
|
-
WString wstr2;
|
634
|
-
WString result1;
|
635
|
-
WString result2;
|
636
|
-
UString ustr1;
|
637
|
-
UString ustr2;
|
638
|
-
int ret;
|
639
|
-
|
640
|
-
Check_Type(str1, T_STRING);
|
641
|
-
Check_Type(str2, T_STRING);
|
642
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
643
|
-
CONVERT_TO_UTF8(str1);
|
644
|
-
CONVERT_TO_UTF8(str2);
|
645
|
-
#endif
|
646
|
-
WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
|
647
|
-
WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
|
648
|
-
WStr_alloc(&result1);
|
649
|
-
WStr_alloc(&result2);
|
650
|
-
decompose_internal(&wstr1, &result1);
|
651
|
-
decompose_internal(&wstr2, &result2);
|
652
|
-
WStr_free(&wstr1);
|
653
|
-
WStr_free(&wstr2);
|
654
|
-
sort_canonical(&result1);
|
655
|
-
sort_canonical(&result2);
|
656
|
-
UniStr_alloc(&ustr1);
|
657
|
-
UniStr_alloc(&ustr2);
|
658
|
-
WStr_convertIntoUString(&result1, &ustr1);
|
659
|
-
WStr_convertIntoUString(&result2, &ustr2);
|
660
|
-
WStr_free(&result1);
|
661
|
-
WStr_free(&result2);
|
662
|
-
UniStr_addChar(&ustr1, '\0');
|
663
|
-
UniStr_addChar(&ustr2, '\0');
|
664
|
-
ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
|
665
|
-
UniStr_free(&ustr1);
|
666
|
-
UniStr_free(&ustr2);
|
667
|
-
|
668
|
-
return INT2FIX(ret);
|
669
|
-
}
|
670
|
-
|
671
|
-
static VALUE
|
672
|
-
unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
|
673
|
-
{
|
674
|
-
WString wstr1;
|
675
|
-
WString wstr2;
|
676
|
-
WString result1;
|
677
|
-
WString result2;
|
678
|
-
UString ustr1;
|
679
|
-
UString ustr2;
|
680
|
-
int ret;
|
681
|
-
|
682
|
-
Check_Type(str1, T_STRING);
|
683
|
-
Check_Type(str2, T_STRING);
|
684
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
685
|
-
CONVERT_TO_UTF8(str1);
|
686
|
-
CONVERT_TO_UTF8(str2);
|
687
|
-
#endif
|
688
|
-
WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
|
689
|
-
WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
|
690
|
-
WStr_alloc(&result1);
|
691
|
-
WStr_alloc(&result2);
|
692
|
-
decompose_compat_internal(&wstr1, &result1);
|
693
|
-
decompose_compat_internal(&wstr2, &result2);
|
694
|
-
WStr_free(&wstr1);
|
695
|
-
WStr_free(&wstr2);
|
696
|
-
sort_canonical(&result1);
|
697
|
-
sort_canonical(&result2);
|
698
|
-
UniStr_alloc(&ustr1);
|
699
|
-
UniStr_alloc(&ustr2);
|
700
|
-
WStr_convertIntoUString(&result1, &ustr1);
|
701
|
-
WStr_convertIntoUString(&result2, &ustr2);
|
702
|
-
WStr_free(&result1);
|
703
|
-
WStr_free(&result2);
|
704
|
-
UniStr_addChar(&ustr1, '\0');
|
705
|
-
UniStr_addChar(&ustr2, '\0');
|
706
|
-
ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
|
707
|
-
UniStr_free(&ustr1);
|
708
|
-
UniStr_free(&ustr2);
|
709
|
-
|
710
|
-
return INT2FIX(ret);
|
711
|
-
}
|
712
|
-
|
713
|
-
static VALUE
|
714
|
-
unicode_decompose(VALUE obj, VALUE str)
|
715
|
-
{
|
716
|
-
WString ustr;
|
717
|
-
WString result;
|
718
|
-
UString ret;
|
719
|
-
VALUE vret;
|
720
|
-
|
721
|
-
Check_Type(str, T_STRING);
|
722
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
723
|
-
CONVERT_TO_UTF8(str);
|
724
|
-
#endif
|
725
|
-
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
726
|
-
WStr_alloc(&result);
|
727
|
-
decompose_internal(&ustr, &result);
|
728
|
-
WStr_free(&ustr);
|
729
|
-
sort_canonical(&result);
|
730
|
-
UniStr_alloc(&ret);
|
731
|
-
WStr_convertIntoUString(&result, &ret);
|
732
|
-
WStr_free(&result);
|
733
|
-
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
734
|
-
UniStr_free(&ret);
|
735
|
-
|
736
|
-
return vret;
|
737
|
-
}
|
738
|
-
|
739
|
-
static VALUE
|
740
|
-
unicode_decompose_safe(VALUE obj, VALUE str)
|
741
|
-
{
|
742
|
-
WString ustr;
|
743
|
-
WString result;
|
744
|
-
UString ret;
|
745
|
-
VALUE vret;
|
746
|
-
|
747
|
-
Check_Type(str, T_STRING);
|
748
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
749
|
-
CONVERT_TO_UTF8(str);
|
750
|
-
#endif
|
751
|
-
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
752
|
-
WStr_alloc(&result);
|
753
|
-
decompose_safe_internal(&ustr, &result);
|
754
|
-
WStr_free(&ustr);
|
755
|
-
sort_canonical(&result);
|
756
|
-
UniStr_alloc(&ret);
|
757
|
-
WStr_convertIntoUString(&result, &ret);
|
758
|
-
WStr_free(&result);
|
759
|
-
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
760
|
-
UniStr_free(&ret);
|
761
|
-
|
762
|
-
return vret;
|
763
|
-
}
|
764
|
-
|
765
|
-
static VALUE
|
766
|
-
unicode_decompose_compat(VALUE obj, VALUE str)
|
767
|
-
{
|
768
|
-
WString ustr;
|
769
|
-
WString result;
|
770
|
-
UString ret;
|
771
|
-
VALUE vret;
|
772
|
-
|
773
|
-
Check_Type(str, T_STRING);
|
774
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
775
|
-
CONVERT_TO_UTF8(str);
|
776
|
-
#endif
|
777
|
-
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
778
|
-
WStr_alloc(&result);
|
779
|
-
decompose_compat_internal(&ustr, &result);
|
780
|
-
WStr_free(&ustr);
|
781
|
-
sort_canonical(&result);
|
782
|
-
UniStr_alloc(&ret);
|
783
|
-
WStr_convertIntoUString(&result, &ret);
|
784
|
-
WStr_free(&result);
|
785
|
-
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
786
|
-
UniStr_free(&ret);
|
787
|
-
|
788
|
-
return vret;
|
789
|
-
}
|
790
|
-
|
791
|
-
static VALUE
|
792
|
-
unicode_compose(VALUE obj, VALUE str)
|
793
|
-
{
|
794
|
-
WString ustr;
|
795
|
-
WString result;
|
796
|
-
UString ret;
|
797
|
-
VALUE vret;
|
798
|
-
|
799
|
-
Check_Type(str, T_STRING);
|
800
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
801
|
-
CONVERT_TO_UTF8(str);
|
802
|
-
#endif
|
803
|
-
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
804
|
-
sort_canonical(&ustr);
|
805
|
-
WStr_alloc(&result);
|
806
|
-
compose_internal(&ustr, &result);
|
807
|
-
WStr_free(&ustr);
|
808
|
-
UniStr_alloc(&ret);
|
809
|
-
WStr_convertIntoUString(&result, &ret);
|
810
|
-
WStr_free(&result);
|
811
|
-
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
812
|
-
UniStr_free(&ret);
|
813
|
-
|
814
|
-
return vret;
|
815
|
-
}
|
816
|
-
|
817
|
-
static VALUE
|
818
|
-
unicode_normalize_C(VALUE obj, VALUE str)
|
819
|
-
{
|
820
|
-
WString ustr1;
|
821
|
-
WString ustr2;
|
822
|
-
WString result;
|
823
|
-
UString ret;
|
824
|
-
VALUE vret;
|
825
|
-
|
826
|
-
Check_Type(str, T_STRING);
|
827
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
828
|
-
CONVERT_TO_UTF8(str);
|
829
|
-
#endif
|
830
|
-
WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
|
831
|
-
WStr_alloc(&ustr2);
|
832
|
-
decompose_internal(&ustr1, &ustr2);
|
833
|
-
WStr_free(&ustr1);
|
834
|
-
sort_canonical(&ustr2);
|
835
|
-
WStr_alloc(&result);
|
836
|
-
compose_internal(&ustr2, &result);
|
837
|
-
WStr_free(&ustr2);
|
838
|
-
UniStr_alloc(&ret);
|
839
|
-
WStr_convertIntoUString(&result, &ret);
|
840
|
-
WStr_free(&result);
|
841
|
-
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
842
|
-
UniStr_free(&ret);
|
843
|
-
|
844
|
-
return vret;
|
845
|
-
}
|
846
|
-
|
847
|
-
static VALUE
|
848
|
-
unicode_normalize_safe(VALUE obj, VALUE str)
|
849
|
-
{
|
850
|
-
WString ustr1;
|
851
|
-
WString ustr2;
|
852
|
-
WString result;
|
853
|
-
UString ret;
|
854
|
-
VALUE vret;
|
855
|
-
|
856
|
-
Check_Type(str, T_STRING);
|
857
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
858
|
-
CONVERT_TO_UTF8(str);
|
859
|
-
#endif
|
860
|
-
WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
|
861
|
-
WStr_alloc(&ustr2);
|
862
|
-
decompose_safe_internal(&ustr1, &ustr2);
|
863
|
-
WStr_free(&ustr1);
|
864
|
-
sort_canonical(&ustr2);
|
865
|
-
WStr_alloc(&result);
|
866
|
-
compose_internal(&ustr2, &result);
|
867
|
-
WStr_free(&ustr2);
|
868
|
-
UniStr_alloc(&ret);
|
869
|
-
WStr_convertIntoUString(&result, &ret);
|
870
|
-
WStr_free(&result);
|
871
|
-
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
872
|
-
UniStr_free(&ret);
|
873
|
-
|
874
|
-
return vret;
|
875
|
-
}
|
876
|
-
|
877
|
-
static VALUE
|
878
|
-
unicode_normalize_KC(VALUE obj, VALUE str)
|
879
|
-
{
|
880
|
-
WString ustr1;
|
881
|
-
WString ustr2;
|
882
|
-
WString result;
|
883
|
-
UString ret;
|
884
|
-
VALUE vret;
|
885
|
-
|
886
|
-
Check_Type(str, T_STRING);
|
887
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
888
|
-
CONVERT_TO_UTF8(str);
|
889
|
-
#endif
|
890
|
-
WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
|
891
|
-
WStr_alloc(&ustr2);
|
892
|
-
decompose_compat_internal(&ustr1, &ustr2);
|
893
|
-
WStr_free(&ustr1);
|
894
|
-
sort_canonical(&ustr2);
|
895
|
-
WStr_alloc(&result);
|
896
|
-
compose_internal(&ustr2, &result);
|
897
|
-
WStr_free(&ustr2);
|
898
|
-
UniStr_alloc(&ret);
|
899
|
-
WStr_convertIntoUString(&result, &ret);
|
900
|
-
WStr_free(&result);
|
901
|
-
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
902
|
-
UniStr_free(&ret);
|
903
|
-
|
904
|
-
return vret;
|
905
|
-
}
|
906
|
-
|
907
|
-
static VALUE
|
908
|
-
unicode_upcase(VALUE obj, VALUE str)
|
909
|
-
{
|
910
|
-
WString ustr;
|
911
|
-
WString result;
|
912
|
-
UString ret;
|
913
|
-
VALUE vret;
|
914
|
-
|
915
|
-
Check_Type(str, T_STRING);
|
916
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
917
|
-
CONVERT_TO_UTF8(str);
|
918
|
-
#endif
|
919
|
-
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
920
|
-
WStr_alloc(&result);
|
921
|
-
upcase_internal(&ustr, &result);
|
922
|
-
//sort_canonical(&result);
|
923
|
-
WStr_free(&ustr);
|
924
|
-
UniStr_alloc(&ret);
|
925
|
-
WStr_convertIntoUString(&result, &ret);
|
926
|
-
WStr_free(&result);
|
927
|
-
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
928
|
-
UniStr_free(&ret);
|
929
|
-
|
930
|
-
return vret;
|
931
|
-
}
|
932
|
-
|
933
|
-
static VALUE
|
934
|
-
unicode_downcase(VALUE obj, VALUE str)
|
935
|
-
{
|
936
|
-
WString ustr;
|
937
|
-
WString result;
|
938
|
-
UString ret;
|
939
|
-
VALUE vret;
|
940
|
-
|
941
|
-
Check_Type(str, T_STRING);
|
942
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
943
|
-
CONVERT_TO_UTF8(str);
|
944
|
-
#endif
|
945
|
-
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
946
|
-
WStr_alloc(&result);
|
947
|
-
downcase_internal(&ustr, &result);
|
948
|
-
//sort_canonical(&result);
|
949
|
-
WStr_free(&ustr);
|
950
|
-
UniStr_alloc(&ret);
|
951
|
-
WStr_convertIntoUString(&result, &ret);
|
952
|
-
WStr_free(&result);
|
953
|
-
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
954
|
-
UniStr_free(&ret);
|
955
|
-
|
956
|
-
return vret;
|
957
|
-
}
|
958
|
-
|
959
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
960
|
-
|
961
|
-
|
962
|
-
#endif
|
963
|
-
|
964
|
-
static VALUE
|
965
|
-
unicode_capitalize(VALUE obj, VALUE str)
|
966
|
-
{
|
967
|
-
WString ustr;
|
968
|
-
WString result;
|
969
|
-
UString ret;
|
970
|
-
VALUE vret;
|
971
|
-
|
972
|
-
Check_Type(str, T_STRING);
|
973
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
974
|
-
CONVERT_TO_UTF8(str);
|
975
|
-
#endif
|
976
|
-
WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
|
977
|
-
WStr_alloc(&result);
|
978
|
-
capitalize_internal(&ustr, &result);
|
979
|
-
//sort_canonical(&result);
|
980
|
-
WStr_free(&ustr);
|
981
|
-
UniStr_alloc(&ret);
|
982
|
-
WStr_convertIntoUString(&result, &ret);
|
983
|
-
WStr_free(&result);
|
984
|
-
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
985
|
-
UniStr_free(&ret);
|
986
|
-
|
987
|
-
return vret;
|
988
|
-
}
|
989
|
-
|
990
|
-
typedef struct _get_categories_param {
|
991
|
-
WString* wstr;
|
992
|
-
VALUE str;
|
993
|
-
VALUE* catname;
|
994
|
-
} get_categories_param;
|
995
|
-
|
996
|
-
static VALUE
|
997
|
-
get_categories_internal(get_categories_param* param)
|
998
|
-
{
|
999
|
-
WString* wstr = param->wstr;
|
1000
|
-
VALUE str = param->str;
|
1001
|
-
VALUE* catname = param->catname;
|
1002
|
-
int pos;
|
1003
|
-
int block_p = rb_block_given_p();
|
1004
|
-
volatile VALUE ret = str;
|
1005
|
-
|
1006
|
-
if (!block_p)
|
1007
|
-
ret = rb_ary_new();
|
1008
|
-
for (pos = 0; pos < wstr->len; pos++) {
|
1009
|
-
int gencat = get_gencat(wstr->str[pos]);
|
1010
|
-
if (!block_p)
|
1011
|
-
rb_ary_push(ret, catname[gencat]);
|
1012
|
-
else {
|
1013
|
-
rb_yield(catname[gencat]);
|
1014
|
-
}
|
1015
|
-
}
|
1016
|
-
|
1017
|
-
return ret;
|
1018
|
-
}
|
1019
|
-
|
1020
|
-
VALUE
|
1021
|
-
get_categories_ensure(WString* wstr)
|
1022
|
-
{
|
1023
|
-
WStr_free(wstr);
|
1024
|
-
return Qnil;
|
1025
|
-
}
|
1026
|
-
|
1027
|
-
VALUE
|
1028
|
-
unicode_get_categories(VALUE obj, VALUE str)
|
1029
|
-
{
|
1030
|
-
WString wstr;
|
1031
|
-
get_categories_param param = { &wstr, str, catname_long };
|
1032
|
-
|
1033
|
-
Check_Type(str, T_STRING);
|
1034
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
1035
|
-
CONVERT_TO_UTF8(str);
|
1036
|
-
#endif
|
1037
|
-
WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
|
1038
|
-
|
1039
|
-
return rb_ensure(get_categories_internal, (VALUE)¶m,
|
1040
|
-
get_categories_ensure, (VALUE)&wstr);
|
1041
|
-
/* wstr will be freed in get_text_elements_ensure() */
|
1042
|
-
}
|
1043
|
-
|
1044
|
-
|
1045
|
-
VALUE
|
1046
|
-
unicode_get_abbr_categories(VALUE obj, VALUE str)
|
1047
|
-
{
|
1048
|
-
WString wstr;
|
1049
|
-
get_categories_param param = { &wstr, str, catname_abbr };
|
1050
|
-
|
1051
|
-
Check_Type(str, T_STRING);
|
1052
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
1053
|
-
CONVERT_TO_UTF8(str);
|
1054
|
-
#endif
|
1055
|
-
WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
|
1056
|
-
|
1057
|
-
return rb_ensure(get_categories_internal, (VALUE)¶m,
|
1058
|
-
get_categories_ensure, (VALUE)&wstr);
|
1059
|
-
/* wstr will be freed in get_text_elements_ensure() */
|
1060
|
-
}
|
1061
|
-
|
1062
|
-
VALUE
|
1063
|
-
unicode_wcswidth(int argc, VALUE* argv, VALUE obj)
|
1064
|
-
{
|
1065
|
-
WString wstr;
|
1066
|
-
int i, count;
|
1067
|
-
int width = 0;
|
1068
|
-
int cjk_p = 0;
|
1069
|
-
VALUE str;
|
1070
|
-
VALUE cjk;
|
1071
|
-
|
1072
|
-
count = rb_scan_args(argc, argv, "11", &str, &cjk);
|
1073
|
-
if (count > 1)
|
1074
|
-
cjk_p = RTEST(cjk);
|
1075
|
-
Check_Type(str, T_STRING);
|
1076
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
1077
|
-
CONVERT_TO_UTF8(str);
|
1078
|
-
#endif
|
1079
|
-
WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
|
1080
|
-
for (i = 0; i <wstr.len; i++) {
|
1081
|
-
int c = wstr.str[i];
|
1082
|
-
int cat = get_gencat(c);
|
1083
|
-
int eaw = get_eawidth(c);
|
1084
|
-
if ((c > 0 && c < 32) || (c >= 0x7f && c < 0xa0)) {
|
1085
|
-
/* Control Characters */
|
1086
|
-
width = -1;
|
1087
|
-
break;
|
1088
|
-
}
|
1089
|
-
else if (c != 0x00ad && /* SOFT HYPHEN */
|
1090
|
-
(cat == c_Mn || cat == c_Me || /* Non-spacing Marks */
|
1091
|
-
cat == c_Cf || /* Format */
|
1092
|
-
c == 0 || /* NUL */
|
1093
|
-
(c >= 0x1160 && c <= 0x11ff))) /* HANGUL JUNGSEONG/JONGSEONG */
|
1094
|
-
/* zero width */ ;
|
1095
|
-
else if (eaw == w_F || eaw == w_W || /* Fullwidth or Wide */
|
1096
|
-
(c >= 0x4db6 && c <= 0x4dbf) || /* CJK Reserved */
|
1097
|
-
(c >= 0x9fcd && c <= 0x9fff) || /* CJK Reserved */
|
1098
|
-
(c >= 0xfa6e && c <= 0xfa6f) || /* CJK Reserved */
|
1099
|
-
(c >= 0xfada && c <= 0xfaff) || /* CJK Reserved */
|
1100
|
-
(c >= 0x2a6d7 && c <= 0x2a6ff) || /* CJK Reserved */
|
1101
|
-
(c >= 0x2b735 && c <= 0x2b73f) || /* CJK Reserved */
|
1102
|
-
(c >= 0x2b81e && c <= 0x2f7ff) || /* CJK Reserved */
|
1103
|
-
(c >= 0x2fa1e && c <= 0x2fffd) || /* CJK Reserved */
|
1104
|
-
(c >= 0x30000 && c <= 0x3fffd) || /* CJK Reserved */
|
1105
|
-
(cjk_p && eaw == w_A)) /* East Asian Ambiguous */
|
1106
|
-
width += 2;
|
1107
|
-
else
|
1108
|
-
width++; /* Halfwidth or Neutral */
|
1109
|
-
}
|
1110
|
-
WStr_free(&wstr);
|
1111
|
-
|
1112
|
-
return INT2FIX(width);
|
1113
|
-
}
|
1114
|
-
|
1115
|
-
VALUE
|
1116
|
-
wstring_to_rstring(WString* wstr, int start, int len) {
|
1117
|
-
UString ret;
|
1118
|
-
volatile VALUE vret;
|
1119
|
-
|
1120
|
-
UniStr_alloc(&ret);
|
1121
|
-
WStr_convertIntoUString2(wstr, start, len, &ret);
|
1122
|
-
vret = ENC_(rb_str_new((char*)ret.str, ret.len));
|
1123
|
-
UniStr_free(&ret);
|
1124
|
-
|
1125
|
-
return vret;
|
1126
|
-
}
|
1127
|
-
|
1128
|
-
typedef struct _get_text_elements_param {
|
1129
|
-
WString* wstr;
|
1130
|
-
VALUE str;
|
1131
|
-
} get_text_elements_param;
|
1132
|
-
|
1133
|
-
VALUE
|
1134
|
-
get_text_elements_internal(get_text_elements_param* param)
|
1135
|
-
{
|
1136
|
-
WString* wstr = param->wstr;
|
1137
|
-
VALUE str = param->str;
|
1138
|
-
int start_pos;
|
1139
|
-
int block_p = rb_block_given_p();
|
1140
|
-
volatile VALUE ret = str;
|
1141
|
-
|
1142
|
-
if (!block_p)
|
1143
|
-
ret = rb_ary_new();
|
1144
|
-
for (start_pos = 0; start_pos < wstr->len;) {
|
1145
|
-
int c0 = wstr->str[start_pos];
|
1146
|
-
int cat = get_gencat(c0);
|
1147
|
-
int length = 1;
|
1148
|
-
int j;
|
1149
|
-
|
1150
|
-
if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
|
1151
|
-
volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
|
1152
|
-
if (!block_p)
|
1153
|
-
rb_ary_push(ret, rstr);
|
1154
|
-
else
|
1155
|
-
rb_yield(rstr);
|
1156
|
-
start_pos++;
|
1157
|
-
continue;
|
1158
|
-
}
|
1159
|
-
|
1160
|
-
for (j = start_pos + 1; j < wstr->len; j++) {
|
1161
|
-
int c1 = wstr->str[j];
|
1162
|
-
int cat = get_gencat(c1);
|
1163
|
-
if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
|
1164
|
-
j + 1 < wstr->len &&
|
1165
|
-
c1 >= VBASE && c1 < VBASE + VCOUNT &&
|
1166
|
-
wstr->str[j+1] >= TBASE && wstr->str[j+1] < TBASE + TCOUNT) {
|
1167
|
-
/* Hangul L+V+T */
|
1168
|
-
length += 2;
|
1169
|
-
j++;
|
1170
|
-
}
|
1171
|
-
else if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
|
1172
|
-
c1 >= VBASE && c1< VBASE + VCOUNT) {
|
1173
|
-
/* Hangul L+V */
|
1174
|
-
length++;
|
1175
|
-
}
|
1176
|
-
else if (c0 >= SBASE && c0 < SBASE + SCOUNT &&
|
1177
|
-
(c0 - SBASE) % TCOUNT == 0 &&
|
1178
|
-
c1 >= TBASE && c1 < TBASE + TCOUNT) {
|
1179
|
-
/* Hangul LV+T */
|
1180
|
-
length++;
|
1181
|
-
}
|
1182
|
-
else if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
|
1183
|
-
/* Mark */
|
1184
|
-
length++;
|
1185
|
-
}
|
1186
|
-
else {
|
1187
|
-
volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
|
1188
|
-
if (!block_p)
|
1189
|
-
rb_ary_push(ret, rstr);
|
1190
|
-
else
|
1191
|
-
rb_yield(rstr);
|
1192
|
-
length = 0;
|
1193
|
-
break;
|
1194
|
-
}
|
1195
|
-
}
|
1196
|
-
if (length > 0) {
|
1197
|
-
volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
|
1198
|
-
if (!block_p)
|
1199
|
-
rb_ary_push(ret, rstr);
|
1200
|
-
else
|
1201
|
-
rb_yield(rstr);
|
1202
|
-
}
|
1203
|
-
start_pos = j;
|
1204
|
-
}
|
1205
|
-
return ret;
|
1206
|
-
}
|
1207
|
-
|
1208
|
-
VALUE
|
1209
|
-
get_text_elements_ensure(WString* wstr)
|
1210
|
-
{
|
1211
|
-
WStr_free(wstr);
|
1212
|
-
return Qnil;
|
1213
|
-
}
|
1214
|
-
|
1215
|
-
VALUE
|
1216
|
-
unicode_get_text_elements(VALUE obj, VALUE str)
|
1217
|
-
{
|
1218
|
-
WString wstr;
|
1219
|
-
get_text_elements_param param = { &wstr, str };
|
1220
|
-
|
1221
|
-
Check_Type(str, T_STRING);
|
1222
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
1223
|
-
CONVERT_TO_UTF8(str);
|
1224
|
-
#endif
|
1225
|
-
WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
|
1226
|
-
|
1227
|
-
return rb_ensure(get_text_elements_internal, (VALUE)¶m,
|
1228
|
-
get_text_elements_ensure, (VALUE)&wstr);
|
1229
|
-
/* wstr will be freed in get_text_elements_ensure() */
|
1230
|
-
}
|
1231
|
-
|
1232
|
-
void
|
1233
|
-
Init_unicode_native()
|
1234
|
-
{
|
1235
|
-
int i;
|
1236
|
-
|
1237
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
1238
|
-
enc_out = rb_utf8_encoding();
|
1239
|
-
#endif
|
1240
|
-
|
1241
|
-
mUnicode = rb_define_module("Unicode");
|
1242
|
-
unicode_data = rb_hash_new();
|
1243
|
-
composition_table = rb_hash_new();
|
1244
|
-
|
1245
|
-
rb_global_variable(&unicode_data);
|
1246
|
-
rb_global_variable(&composition_table);
|
1247
|
-
|
1248
|
-
for (i = 0; unidata[i].code != -1; i++) {
|
1249
|
-
int code = unidata[i].code;
|
1250
|
-
const char* canon = unidata[i].canon;
|
1251
|
-
int exclusion = unidata[i].exclusion;
|
1252
|
-
|
1253
|
-
rb_hash_aset(unicode_data, INT2FIX(code), INT2FIX(i));
|
1254
|
-
if (canon && exclusion == 0) {
|
1255
|
-
rb_hash_aset(composition_table, rb_str_new2(canon), INT2FIX(code));
|
1256
|
-
}
|
1257
|
-
}
|
1258
|
-
|
1259
|
-
for (i = 0; i < c_Cn + 1; i++) {
|
1260
|
-
catname_abbr[i] = ID2SYM(rb_intern(gencat_abbr[i]));
|
1261
|
-
catname_long[i] = ID2SYM(rb_intern(gencat_long[i]));
|
1262
|
-
rb_global_variable(&catname_abbr[i]);
|
1263
|
-
rb_global_variable(&catname_long[i]);
|
1264
|
-
}
|
1265
|
-
|
1266
|
-
rb_define_module_function(mUnicode, "strcmp",
|
1267
|
-
unicode_strcmp, 2);
|
1268
|
-
rb_define_module_function(mUnicode, "strcmp_compat",
|
1269
|
-
unicode_strcmp_compat, 2);
|
1270
|
-
|
1271
|
-
rb_define_module_function(mUnicode, "decompose",
|
1272
|
-
unicode_decompose, 1);
|
1273
|
-
rb_define_module_function(mUnicode, "decompose_safe",
|
1274
|
-
unicode_decompose_safe, 1);
|
1275
|
-
rb_define_module_function(mUnicode, "decompose_compat",
|
1276
|
-
unicode_decompose_compat, 1);
|
1277
|
-
rb_define_module_function(mUnicode, "compose",
|
1278
|
-
unicode_compose, 1);
|
1279
|
-
|
1280
|
-
rb_define_module_function(mUnicode, "normalize_D",
|
1281
|
-
unicode_decompose, 1);
|
1282
|
-
rb_define_module_function(mUnicode, "normalize_D_safe",
|
1283
|
-
unicode_decompose_safe, 1);
|
1284
|
-
rb_define_module_function(mUnicode, "normalize_KD",
|
1285
|
-
unicode_decompose_compat, 1);
|
1286
|
-
rb_define_module_function(mUnicode, "normalize_C",
|
1287
|
-
unicode_normalize_C, 1);
|
1288
|
-
rb_define_module_function(mUnicode, "normalize_C_safe",
|
1289
|
-
unicode_normalize_safe, 1);
|
1290
|
-
rb_define_module_function(mUnicode, "normalize_KC",
|
1291
|
-
unicode_normalize_KC, 1);
|
1292
|
-
|
1293
|
-
/* aliases */
|
1294
|
-
rb_define_module_function(mUnicode, "nfd",
|
1295
|
-
unicode_decompose, 1);
|
1296
|
-
rb_define_module_function(mUnicode, "nfd_safe",
|
1297
|
-
unicode_decompose_safe, 1);
|
1298
|
-
rb_define_module_function(mUnicode, "nfkd",
|
1299
|
-
unicode_decompose_compat, 1);
|
1300
|
-
rb_define_module_function(mUnicode, "nfc",
|
1301
|
-
unicode_normalize_C, 1);
|
1302
|
-
rb_define_module_function(mUnicode, "nfc_safe",
|
1303
|
-
unicode_normalize_safe, 1);
|
1304
|
-
rb_define_module_function(mUnicode, "nfkc",
|
1305
|
-
unicode_normalize_KC, 1);
|
1306
|
-
|
1307
|
-
rb_define_module_function(mUnicode, "upcase",
|
1308
|
-
unicode_upcase, 1);
|
1309
|
-
rb_define_module_function(mUnicode, "downcase",
|
1310
|
-
unicode_downcase, 1);
|
1311
|
-
rb_define_module_function(mUnicode, "capitalize",
|
1312
|
-
unicode_capitalize, 1);
|
1313
|
-
|
1314
|
-
rb_define_module_function(mUnicode, "categories",
|
1315
|
-
unicode_get_categories, 1);
|
1316
|
-
rb_define_module_function(mUnicode, "abbr_categories",
|
1317
|
-
unicode_get_abbr_categories, 1);
|
1318
|
-
rb_define_module_function(mUnicode, "width",
|
1319
|
-
unicode_wcswidth, -1);
|
1320
|
-
rb_define_module_function(mUnicode, "text_elements",
|
1321
|
-
unicode_get_text_elements, 1);
|
1322
|
-
|
1323
|
-
rb_define_const(mUnicode, "VERSION",
|
1324
|
-
rb_str_new2(UNICODE_VERSION));
|
1325
|
-
}
|