unicode 0.1

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,107 @@
1
+ Unicode Library for Ruby
2
+ Version 0.1
3
+
4
+ Yoshida Masato
5
+
6
+
7
+ - Introduction
8
+
9
+ Unicode string manipulation library for Ruby.
10
+ This library is based on UTR #15 Unicode Normalization Forms(*1).
11
+
12
+ *1 <URL:http://www.unicode.org/unicode/reports/tr15/>
13
+
14
+
15
+ - Install
16
+
17
+ This works with ruby-1.4 or later. I recommend that you
18
+ use ruby-1.4.2 or later.
19
+
20
+ Build and install in the usual way.
21
+ For example, when Ruby supports dynamic linking on your OS,
22
+
23
+ ruby extconf.rb
24
+ make
25
+ make install
26
+
27
+
28
+ - Usage
29
+
30
+ If you do not link this module with Ruby statically,
31
+
32
+ require "unicode"
33
+
34
+ before using.
35
+
36
+
37
+ - Module Functions
38
+
39
+ All parameters of functions must be UTF-8.
40
+
41
+ Unicode::strcmp(str1, str2)
42
+ Unicode::strcmp_compat(str1, str2)
43
+ Compares Unicode strings with normalization.
44
+ strcmp uses Normalization Form D, strcmp_compat uses
45
+ Normalization Form KD.
46
+
47
+ Unicode::decompose(str)
48
+ Unicode::decompose_compat(str)
49
+ Decompose Unicode string. Then the trailing characters
50
+ are sorted in canonical order.
51
+ decompose uses the canonical decomposition,
52
+ decompose_compat uses the compatibility decomposition.
53
+ The decomposition is based on the character decomposition
54
+ mapping in UnicodeData.txt and the Hangul decomposition
55
+ algorithm.
56
+
57
+ Unicode::compose(str)
58
+ Compose Unicode string. Before composing, the trailing
59
+ characters are sorted in canonical order.
60
+ The parameter must be decomposed.
61
+ The composition is based on the reverse of the
62
+ character decomposition mapping in UnicodeData.txt,
63
+ CompositionExclusions.txt and the Hangul composition
64
+ algorithm.
65
+
66
+ Unicode::normalize_D(str)
67
+ Unicode::normalize_KD(str)
68
+ Normalizes Unicode string in form D or form KD.
69
+ These are aliases of decompose/decompose_compat.
70
+
71
+ Unicode::normalize_C(str)
72
+ Unicode::normalize_KC(str)
73
+ Normalizes Unicode string in form C or form KC.
74
+ normalize_C = decompose + compose
75
+ normalize_KC = decompose_compat + compose
76
+
77
+ Unicode::upcase(str)
78
+ Unicode::downcase(str)
79
+ Unicode::capitalize(str)
80
+ Case conversion functions.
81
+ The mappings which these functions use are not normative
82
+ in UnicodeData.txt.
83
+
84
+ - Bugs
85
+
86
+ UTR #15 suggests that the look up for Normalization Form C
87
+ should not be implemented with a hash of string for better
88
+ performance.
89
+
90
+
91
+ - Copying
92
+
93
+ This extension module is copyrighted free software by
94
+ Yoshida Masato.
95
+
96
+ You can redistribute it and/or modify it under the same
97
+ terms as Ruby.
98
+
99
+
100
+ - Author
101
+
102
+ Yoshida Masato <yoshidam@yoshidam.net>
103
+
104
+
105
+ - History
106
+
107
+ Nov 23, 1999 version 0.1
data/bin/mkunidata.rb ADDED
@@ -0,0 +1,128 @@
1
+ #! /usr/local/bin/ruby -KU
2
+
3
+ if $KCODE != 'UTF8'
4
+ raise "$KCODE must be UTF8"
5
+ end
6
+
7
+ HEAD=<<EOS
8
+ /*
9
+ * UnicodeData
10
+ * 1999 by yoshidam
11
+ *
12
+ */
13
+
14
+ #ifndef _UNIDATA_MAP
15
+ #define _UNIDATA_MAP
16
+
17
+ struct unicode_data {
18
+ const int code;
19
+ const int combining_class;
20
+ const int exclusion;
21
+ const char* const canon;
22
+ const char* const compat;
23
+ const int uppercase;
24
+ const int lowercase;
25
+ const int titlecase;
26
+ };
27
+
28
+ const static struct unicode_data unidata[] = {
29
+ EOS
30
+
31
+ TAIL=<<EOS
32
+ };
33
+
34
+ #endif
35
+ EOS
36
+
37
# Converts a UnicodeData.txt decomposition field into UTF-8 strings.
#
# hex:: the raw field, e.g. "0041 0300" (canonical mapping) or
#       "<compat> 0020 0308" (compatibility mapping, introduced by a
#       "<tag>" token); may be nil or empty when there is no mapping.
#
# Returns a two-element array [canon, compat]:
# * [nil, nil]     when the field is nil or empty,
# * [str, str]     for a canonical mapping (two distinct String objects
#                  with equal content, so mutating one cannot affect
#                  the other),
# * [nil, str]     for a tagged (compatibility) mapping.
#
# Raises RuntimeError ("unknown value: ...") when the first token is
# neither a hexadecimal code point nor a "<tag>".
def hex2str(hex)
  return [nil, nil] if hex.nil? || hex == ''

  fields = hex.split(" ")
  # Accept 4 to 6 hex digits so that code points beyond U+FFFF are not
  # rejected as "unknown value".
  if fields[0] =~ /\A[0-9A-Fa-f]{4,6}\z/
    canon = fields.map { |c| c.hex }.pack("U*")
    [canon, canon.dup]
  elsif fields[0] =~ /\A<.+>\z/
    # Drop the "<tag>" token; the remaining tokens are the mapping.
    [nil, fields[1..-1].map { |c| c.hex }.pack("U*")]
  else
    raise "unknown value: " + hex
  end
end
60
+
61
# Formats a hexadecimal case-mapping field as a C integer literal.
#
# str:: a hexadecimal code point such as "0041", or nil / an empty
#       string when the character has no such mapping.
#
# Returns the literal as a String ("0x0041"), or "-1" when there is no
# mapping.  An empty field is treated like nil: "".hex is 0, which
# would otherwise be emitted as a mapping to U+0000.
def hex_or_nil(str)
  return "-1" if str.nil? || str.strip.empty?

  format("0x%04x", str.hex)
end
65
+
66
# Renders +str+ as the text of a C string literal.
#
# Returns "NULL" when +str+ is nil or false.  Otherwise every byte is
# emitted either verbatim (printable ASCII other than the double quote
# and the backslash) or as a three-digit octal escape, and the whole
# result is wrapped in double quotes.
def printstr(str)
  return "NULL" unless str

  pieces = str.unpack("C*").map do |byte|
    verbatim = byte >= 32 && byte < 127 && byte != 34 && byte != 92
    verbatim ? byte.chr : format("\\%03o", byte)
  end
  '"' + pieces.join + '"'
end
78
+
79
## Usage: mkunidata.rb UnicodeData.txt CompositionExclusions.txt
##
## Reads both data files and prints a C source fragment (HEAD, one
## initializer line per code point, a terminating sentinel entry, TAIL)
## to standard output.

## scan Composition Exclusions
exclusion = {}
# File.open rather than Kernel#open: an argument beginning with "|"
# must be treated as a file name, not as a command to run.
File.open(ARGV[1]) do |f|
  while l = f.gets
    # skip comment lines and blank / whitespace-only lines
    next if l =~ /^\s*\#/ || l =~ /^\s*$/
    code, = l.split
    exclusion[code.hex] = true
  end
end

## scan UnicodeData
udata = {}
File.open(ARGV[0]) do |f|
  while l = f.gets
    l.chomp!
    next if l.empty?
    # Fields are positional; the unused names document the layout.
    code, charname, gencat, ccclass, bidicat, decomp,
      dec, digit, num, mirror, uni1_0, comment, upcase,
      lowcase, titlecase = l.split(";")
    code = code.hex
    ccclass = ccclass.to_i
    canon, compat = hex2str(decomp)
    upcase = hex_or_nil(upcase)
    lowcase = hex_or_nil(lowcase)
    titlecase = hex_or_nil(titlecase)
    udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase]
  end
end

print HEAD
udata.sort.each do |code, data|
  ccclass, canon, compat, upcase, lowcase, titlecase = data
  ## Exclusions
  ex = 0
  if exclusion[code]      ## Script-specifics or Post Composition Version
    ex = 1
  elsif canon =~ /^.$/    ## Singletons
    ex = 2
  elsif !canon.nil?
    starter = canon.unpack("U*")[0]
    # A starter that has no entry of its own is treated as combining
    # class 0 instead of raising NoMethodError on nil.
    starter_data = udata[starter]
    if starter_data && starter_data[0] != 0   ## Non-starter decompositions
      ex = 3
    end
  end
  printf(" { 0x%04x, %d, %d, %s, %s, %s, %s, %s }, \n",
         code, ccclass, ex, printstr(canon),
         printstr(compat), upcase, lowcase, titlecase)
end
printf(" { -1, 0, 0, NULL, NULL, -1, -1, -1 }\n")
print TAIL
data/ext/extconf.rb ADDED
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
+
3
+ create_makefile("unicode")
data/ext/unicode.c ADDED
@@ -0,0 +1,666 @@
1
+ /*
2
+ * Unicode Library version 0.1
3
+ * Nov 23, 1999 yoshidam
4
+ *
5
+ */
6
+
7
+ #include "ruby.h"
8
+ #include "rubyio.h"
9
+ #include <stdio.h>
10
+ #include "wstring.h"
11
+ #include "unidata.map"
12
+
13
+ static VALUE mUnicode;
14
+ static VALUE unicode_data;
15
+ static VALUE composition_table;
16
+
17
+ /* Hangul */
18
+ #define SBASE (0xac00)
19
+ #define LBASE (0x1100)
20
+ #define LCOUNT (19)
21
+ #define VBASE (0x1161)
22
+ #define VCOUNT (21)
23
+ #define TBASE (0x11a7)
24
+ #define TCOUNT (28)
25
+ #define NCOUNT (VCOUNT * TCOUNT) /* 588 */
26
+ #define SCOUNT (LCOUNT * NCOUNT) /* 11172 */
27
+
28
+ static int
29
+ get_cc(int ucs)
30
+ {
31
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
32
+
33
+ if (!NIL_P(ch)) {
34
+ return unidata[FIX2INT(ch)].combining_class;
35
+ }
36
+ return 0;
37
+ }
38
+
39
+ static const char*
40
+ get_canon(int ucs)
41
+ {
42
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
43
+
44
+ if (!NIL_P(ch)) {
45
+ return unidata[FIX2INT(ch)].canon;
46
+ }
47
+ return NULL;
48
+ }
49
+
50
+ static const char*
51
+ get_compat(int ucs)
52
+ {
53
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
54
+
55
+ if (!NIL_P(ch)) {
56
+ return unidata[FIX2INT(ch)].compat;
57
+ }
58
+ return NULL;
59
+ }
60
+
61
+ static const int
62
+ get_uppercase(int ucs)
63
+ {
64
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
65
+
66
+ if (!NIL_P(ch)) {
67
+ int uc = unidata[FIX2INT(ch)].uppercase;
68
+ if (uc > 0) return uc;
69
+ }
70
+ return ucs;
71
+ }
72
+
73
+ static int
74
+ get_lowercase(int ucs)
75
+ {
76
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
77
+
78
+ if (!NIL_P(ch)) {
79
+ int lc = unidata[FIX2INT(ch)].lowercase;
80
+ if (lc > 0) return lc;
81
+ }
82
+ return ucs;
83
+ }
84
+
85
+ static int
86
+ get_titlecase(int ucs)
87
+ {
88
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
89
+
90
+ if (!NIL_P(ch)) {
91
+ int tc = unidata[FIX2INT(ch)].titlecase;
92
+ if (tc > 0) return tc;
93
+ }
94
+ return ucs;
95
+ }
96
+
97
+ static int
98
+ get_composition(const char* str)
99
+ {
100
+ VALUE ch = rb_hash_aref(composition_table, rb_str_new2(str));
101
+
102
+ if (!NIL_P(ch)) {
103
+ return FIX2INT(ch);
104
+ }
105
+ return -1;
106
+ }
107
+
108
+ static WString*
109
+ sort_canonical(WString* ustr)
110
+ {
111
+ int i = 1;
112
+ int len = ustr->len;
113
+
114
+ if (len < 2) return ustr;
115
+
116
+ while (i < len) {
117
+ int last = ustr->str[i - 1];
118
+ int ch = ustr->str[i];
119
+ int last_cc = get_cc(last);
120
+ int cc = get_cc(ch);
121
+ if (cc != 0 && last_cc != 0 && last_cc > cc) {
122
+ ustr->str[i] = last;
123
+ ustr->str[i-1] = ch;
124
+ if (i > 1) i--;
125
+ }
126
+ else {
127
+ i++;
128
+ }
129
+ }
130
+ return ustr;
131
+ }
132
+
133
+ static void
134
+ decompose_hangul(int ucs, int* l, int* v, int* t)
135
+ {
136
+ int sindex = ucs - SBASE;
137
+ if (sindex < 0 || sindex >= SCOUNT) {
138
+ *l = ucs;
139
+ *v = *t = 0;
140
+ return;
141
+ }
142
+ *l = LBASE + sindex / NCOUNT;
143
+ *v = VBASE + (sindex % NCOUNT) / TCOUNT;
144
+ *t = TBASE + sindex % TCOUNT;
145
+ if (*t == TBASE) *t = 0;
146
+ }
147
+
148
+ /*
149
+ * push decomposed str into result
150
+ */
151
+ static WString*
152
+ decompose_internal(WString* ustr, WString* result)
153
+ {
154
+ int i;
155
+ int len = ustr->len;
156
+
157
+ for (i = 0; i < len; i++) {
158
+ int ucs = ustr->str[i];
159
+ if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
160
+ int l, v, t;
161
+ decompose_hangul(ucs, &l, &v, &t);
162
+ WStr_addWChar(result, l);
163
+ if (v) WStr_addWChar(result, v);
164
+ if (t) WStr_addWChar(result, t);
165
+ }
166
+ else {
167
+ const char* dc = get_canon(ucs);
168
+ if (!dc) {
169
+ WStr_addWChar(result, ucs);
170
+ }
171
+ else {
172
+ WString wdc;
173
+ WStr_allocWithUTF8(&wdc, dc);
174
+ decompose_internal(&wdc, result);
175
+ WStr_free(&wdc);
176
+ }
177
+ }
178
+ }
179
+ return result;
180
+ }
181
+
182
+ /*
183
+ * push compatibility decomposed str into result
184
+ */
185
+ static WString*
186
+ decompose_compat_internal(WString* ustr, WString* result)
187
+ {
188
+ int i;
189
+ int len = ustr->len;
190
+
191
+ for (i = 0; i < len; i++) {
192
+ int ucs = ustr->str[i];
193
+ if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
194
+ int l, v, t;
195
+ decompose_hangul(ucs, &l, &v, &t);
196
+ WStr_addWChar(result, l);
197
+ if (v) WStr_addWChar(result, v);
198
+ if (t) WStr_addWChar(result, t);
199
+ }
200
+ else {
201
+ const char* dc = get_compat(ucs);
202
+ if (!dc) {
203
+ WStr_addWChar(result, ucs);
204
+ }
205
+ else {
206
+ WString wdc;
207
+ WStr_allocWithUTF8(&wdc, dc);
208
+ decompose_compat_internal(&wdc, result);
209
+ WStr_free(&wdc);
210
+ }
211
+ }
212
+ }
213
+ return result;
214
+ }
215
+
216
+
217
/*
 * UCS4toUTF8(p, c): stores the UTF-8 byte sequence (1 to 6 bytes) for
 * code point `c` through pointer `p`, advancing `p` past the bytes
 * written.
 *
 * `p` must be a modifiable pointer lvalue.  `c` is evaluated several
 * times, so it must not have side effects.  Both arguments are
 * parenthesized in the expansion so that an expression argument such as
 * `a | b` is not re-associated with the surrounding operators.
 */
#define UCS4toUTF8(p, c) \
  do { \
    if ((c) < 128) { \
      *(p)++ = (c); \
    } \
    else if ((c) < 2048) { \
      *(p)++ = ((c) >> 6) | 192; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x10000) { \
      *(p)++ = ((c) >> 12) | 224; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x200000) { \
      *(p)++ = ((c) >> 18) | 240; \
      *(p)++ = (((c) >> 12) & 63) | 128; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x4000000) { \
      *(p)++ = ((c) >> 24) | 248; \
      *(p)++ = (((c) >> 18) & 63) | 128; \
      *(p)++ = (((c) >> 12) & 63) | 128; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x80000000) { \
      *(p)++ = ((c) >> 30) | 252; \
      *(p)++ = (((c) >> 24) & 63) | 128; \
      *(p)++ = (((c) >> 18) & 63) | 128; \
      *(p)++ = (((c) >> 12) & 63) | 128; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
  } while (0)
253
+
254
+ static int
255
+ compose_pair(int c1, int c2)
256
+ {
257
+ int ret;
258
+ char ustr[13]; /* stored two UTF-8 chars */
259
+ char *p = ustr;
260
+
261
+ /* Hangul L + V */
262
+ if (c1 >= LBASE && c1 < LBASE + LCOUNT &&
263
+ c2 >= VBASE && c2 < VBASE + VCOUNT) {
264
+ return SBASE + ((c1 - LBASE) * VCOUNT + (c2 - VBASE)) * TCOUNT;
265
+ }
266
+ /* Hangul LV + T */
267
+ else if (c1 >= SBASE && c1 < SBASE + SCOUNT &&
268
+ (c1 - SBASE) % TCOUNT == 0 &&
269
+ c2 >= TBASE && c2 < TBASE + TCOUNT) {
270
+ return c1 + (c2 - TBASE);
271
+ }
272
+ UCS4toUTF8(p, c1);
273
+ UCS4toUTF8(p, c2);
274
+ *p = '\0';
275
+ ret = get_composition(ustr);
276
+
277
+ return ret;
278
+ }
279
+
280
+ /*
281
+ * push canonical composed str into result
282
+ */
283
+ static WString*
284
+ compose_internal(WString* ustr, WString* result)
285
+ {
286
+ int len = ustr->len;
287
+ int starter;
288
+ int startercc;
289
+ int i;
290
+
291
+ if (len == 0) return result;
292
+
293
+ starter = ustr->str[0];
294
+ startercc = get_cc(starter);
295
+ if (startercc != 0) startercc = 256;
296
+ for (i = 1; i < len; i++) {
297
+ int ch = ustr->str[i];
298
+ int cc = get_cc(ch);
299
+ int composite;
300
+
301
+ if (startercc == 0 &&
302
+ (composite = compose_pair(starter, ch)) >= 0) {
303
+ starter = composite;
304
+ startercc = get_cc(composite);
305
+ }
306
+ else {
307
+ WStr_addWChar(result, starter);
308
+ starter = ch;
309
+ startercc = cc;
310
+ }
311
+ }
312
+ WStr_addWChar(result, starter);
313
+
314
+ return result;
315
+ }
316
+
317
+ static WString*
318
+ upcase_internal(WString* str)
319
+ {
320
+ int i;
321
+
322
+ for (i = 0; i < str->len; i++) {
323
+ int uc = get_uppercase(str->str[i]);
324
+ if (uc > 0) str->str[i] = uc;
325
+ }
326
+
327
+ return str;
328
+ }
329
+
330
+ static WString*
331
+ downcase_internal(WString* str)
332
+ {
333
+ int i;
334
+
335
+ for (i = 0; i < str->len; i++) {
336
+ int lc = get_lowercase(str->str[i]);
337
+ if (lc > 0) str->str[i] = lc;
338
+ }
339
+
340
+ return str;
341
+ }
342
+
343
+ static WString*
344
+ capitalize_internal(WString* str)
345
+ {
346
+ int i;
347
+
348
+ if (str->len > 1) {
349
+ int tc = get_titlecase(str->str[0]);
350
+ if (tc > 0) str->str[0] = tc;
351
+ }
352
+ for (i = 1; i < str->len; i++) {
353
+ int lc = get_lowercase(str->str[i]);
354
+ if (lc > 0) str->str[i] = lc;
355
+ }
356
+
357
+ return str;
358
+ }
359
+
360
+ static VALUE
361
+ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
362
+ {
363
+ WString wstr1;
364
+ WString wstr2;
365
+ WString result1;
366
+ WString result2;
367
+ UString ustr1;
368
+ UString ustr2;
369
+ int ret;
370
+
371
+ Check_Type(str1, T_STRING);
372
+ Check_Type(str2, T_STRING);
373
+ WStr_allocWithUTF8(&wstr1, RSTRING(str1)->ptr);
374
+ WStr_allocWithUTF8(&wstr2, RSTRING(str2)->ptr);
375
+ WStr_alloc(&result1);
376
+ WStr_alloc(&result2);
377
+ decompose_internal(&wstr1, &result1);
378
+ decompose_internal(&wstr2, &result2);
379
+ WStr_free(&wstr1);
380
+ WStr_free(&wstr2);
381
+ sort_canonical(&result1);
382
+ sort_canonical(&result2);
383
+ UStr_alloc(&ustr1);
384
+ UStr_alloc(&ustr2);
385
+ WStr_convertIntoUString(&result1, &ustr1);
386
+ WStr_convertIntoUString(&result2, &ustr2);
387
+ WStr_free(&result1);
388
+ WStr_free(&result2);
389
+ UStr_addChar(&ustr1, '\0');
390
+ UStr_addChar(&ustr2, '\0');
391
+ ret = strcmp(ustr1.str, ustr2.str);
392
+ UStr_free(&ustr1);
393
+ UStr_free(&ustr2);
394
+
395
+ return INT2FIX(ret);
396
+ }
397
+
398
+ static VALUE
399
+ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
400
+ {
401
+ WString wstr1;
402
+ WString wstr2;
403
+ WString result1;
404
+ WString result2;
405
+ UString ustr1;
406
+ UString ustr2;
407
+ int ret;
408
+
409
+ Check_Type(str1, T_STRING);
410
+ Check_Type(str2, T_STRING);
411
+ WStr_allocWithUTF8(&wstr1, RSTRING(str1)->ptr);
412
+ WStr_allocWithUTF8(&wstr2, RSTRING(str2)->ptr);
413
+ WStr_alloc(&result1);
414
+ WStr_alloc(&result2);
415
+ decompose_compat_internal(&wstr1, &result1);
416
+ decompose_compat_internal(&wstr2, &result2);
417
+ WStr_free(&wstr1);
418
+ WStr_free(&wstr2);
419
+ sort_canonical(&result1);
420
+ sort_canonical(&result2);
421
+ UStr_alloc(&ustr1);
422
+ UStr_alloc(&ustr2);
423
+ WStr_convertIntoUString(&result1, &ustr1);
424
+ WStr_convertIntoUString(&result2, &ustr2);
425
+ WStr_free(&result1);
426
+ WStr_free(&result2);
427
+ UStr_addChar(&ustr1, '\0');
428
+ UStr_addChar(&ustr2, '\0');
429
+ ret = strcmp(ustr1.str, ustr2.str);
430
+ UStr_free(&ustr1);
431
+ UStr_free(&ustr2);
432
+
433
+ return INT2FIX(ret);
434
+ }
435
+
436
+ static VALUE
437
+ unicode_decompose(VALUE obj, VALUE str)
438
+ {
439
+ WString ustr;
440
+ WString result;
441
+ UString ret;
442
+ VALUE vret;
443
+
444
+ Check_Type(str, T_STRING);
445
+ WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
446
+ WStr_alloc(&result);
447
+ decompose_internal(&ustr, &result);
448
+ WStr_free(&ustr);
449
+ sort_canonical(&result);
450
+ UStr_alloc(&ret);
451
+ WStr_convertIntoUString(&result, &ret);
452
+ WStr_free(&result);
453
+ vret = rb_str_new(ret.str, ret.len);
454
+ UStr_free(&ret);
455
+
456
+ return vret;
457
+ }
458
+
459
+ static VALUE
460
+ unicode_decompose_compat(VALUE obj, VALUE str)
461
+ {
462
+ WString ustr;
463
+ WString result;
464
+ UString ret;
465
+ VALUE vret;
466
+
467
+ Check_Type(str, T_STRING);
468
+ WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
469
+ WStr_alloc(&result);
470
+ decompose_compat_internal(&ustr, &result);
471
+ WStr_free(&ustr);
472
+ sort_canonical(&result);
473
+ UStr_alloc(&ret);
474
+ WStr_convertIntoUString(&result, &ret);
475
+ WStr_free(&result);
476
+ vret = rb_str_new(ret.str, ret.len);
477
+ UStr_free(&ret);
478
+
479
+ return vret;
480
+ }
481
+
482
+ static VALUE
483
+ unicode_compose(VALUE obj, VALUE str)
484
+ {
485
+ WString ustr;
486
+ WString result;
487
+ UString ret;
488
+ VALUE vret;
489
+
490
+ Check_Type(str, T_STRING);
491
+ WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
492
+ sort_canonical(&ustr);
493
+ WStr_alloc(&result);
494
+ compose_internal(&ustr, &result);
495
+ WStr_free(&ustr);
496
+ UStr_alloc(&ret);
497
+ WStr_convertIntoUString(&result, &ret);
498
+ WStr_free(&result);
499
+ vret = rb_str_new(ret.str, ret.len);
500
+ UStr_free(&ret);
501
+
502
+ return vret;
503
+ }
504
+
505
+ static VALUE
506
+ unicode_normalize_C(VALUE obj, VALUE str)
507
+ {
508
+ WString ustr1;
509
+ WString ustr2;
510
+ WString result;
511
+ UString ret;
512
+ VALUE vret;
513
+
514
+ Check_Type(str, T_STRING);
515
+ WStr_allocWithUTF8(&ustr1, RSTRING(str)->ptr);
516
+ WStr_alloc(&ustr2);
517
+ decompose_internal(&ustr1, &ustr2);
518
+ WStr_free(&ustr1);
519
+ sort_canonical(&ustr2);
520
+ WStr_alloc(&result);
521
+ compose_internal(&ustr2, &result);
522
+ WStr_free(&ustr2);
523
+ UStr_alloc(&ret);
524
+ WStr_convertIntoUString(&result, &ret);
525
+ WStr_free(&result);
526
+ vret = rb_str_new(ret.str, ret.len);
527
+ UStr_free(&ret);
528
+
529
+ return vret;
530
+ }
531
+
532
+ static VALUE
533
+ unicode_normalize_KC(VALUE obj, VALUE str)
534
+ {
535
+ WString ustr1;
536
+ WString ustr2;
537
+ WString result;
538
+ UString ret;
539
+ VALUE vret;
540
+
541
+ Check_Type(str, T_STRING);
542
+ WStr_allocWithUTF8(&ustr1, RSTRING(str)->ptr);
543
+ WStr_alloc(&ustr2);
544
+ decompose_compat_internal(&ustr1, &ustr2);
545
+ WStr_free(&ustr1);
546
+ sort_canonical(&ustr2);
547
+ WStr_alloc(&result);
548
+ compose_internal(&ustr2, &result);
549
+ WStr_free(&ustr2);
550
+ UStr_alloc(&ret);
551
+ WStr_convertIntoUString(&result, &ret);
552
+ WStr_free(&result);
553
+ vret = rb_str_new(ret.str, ret.len);
554
+ UStr_free(&ret);
555
+
556
+ return vret;
557
+ }
558
+
559
+ static VALUE
560
+ unicode_upcase(VALUE obj, VALUE str)
561
+ {
562
+ WString ustr;
563
+ UString ret;
564
+ VALUE vret;
565
+
566
+ Check_Type(str, T_STRING);
567
+ WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
568
+ upcase_internal(&ustr);
569
+ UStr_alloc(&ret);
570
+ WStr_convertIntoUString(&ustr, &ret);
571
+ WStr_free(&ustr);
572
+ vret = rb_str_new(ret.str, ret.len);
573
+ UStr_free(&ret);
574
+
575
+ return vret;
576
+ }
577
+
578
+ static VALUE
579
+ unicode_downcase(VALUE obj, VALUE str)
580
+ {
581
+ WString ustr;
582
+ UString ret;
583
+ VALUE vret;
584
+
585
+ Check_Type(str, T_STRING);
586
+ WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
587
+ downcase_internal(&ustr);
588
+ UStr_alloc(&ret);
589
+ WStr_convertIntoUString(&ustr, &ret);
590
+ WStr_free(&ustr);
591
+ vret = rb_str_new(ret.str, ret.len);
592
+ UStr_free(&ret);
593
+
594
+ return vret;
595
+ }
596
+
597
+ static VALUE
598
+ unicode_capitalize(VALUE obj, VALUE str)
599
+ {
600
+ WString ustr;
601
+ UString ret;
602
+ VALUE vret;
603
+
604
+ Check_Type(str, T_STRING);
605
+ WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
606
+ capitalize_internal(&ustr);
607
+ UStr_alloc(&ret);
608
+ WStr_convertIntoUString(&ustr, &ret);
609
+ WStr_free(&ustr);
610
+ vret = rb_str_new(ret.str, ret.len);
611
+ UStr_free(&ret);
612
+
613
+ return vret;
614
+ }
615
+
616
+ void
617
+ Init_unicode()
618
+ {
619
+ int i;
620
+
621
+ mUnicode = rb_define_module("Unicode");
622
+ unicode_data = rb_hash_new();
623
+ composition_table = rb_hash_new();
624
+
625
+ rb_global_variable(&unicode_data);
626
+ rb_global_variable(&composition_table);
627
+
628
+ for (i = 0; unidata[i].code != -1; i++) {
629
+ int code = unidata[i].code;
630
+ const char* canon = unidata[i].canon;
631
+ int exclusion = unidata[i].exclusion;
632
+
633
+ rb_hash_aset(unicode_data, INT2FIX(code), INT2FIX(i));
634
+ if (canon && exclusion == 0) {
635
+ rb_hash_aset(composition_table, rb_str_new2(canon), INT2FIX(code));
636
+ }
637
+ }
638
+
639
+ rb_define_module_function(mUnicode, "strcmp",
640
+ unicode_strcmp, 2);
641
+ rb_define_module_function(mUnicode, "strcmp_compat",
642
+ unicode_strcmp_compat, 2);
643
+
644
+ rb_define_module_function(mUnicode, "decompose",
645
+ unicode_decompose, 1);
646
+ rb_define_module_function(mUnicode, "decompose_compat",
647
+ unicode_decompose_compat, 1);
648
+ rb_define_module_function(mUnicode, "compose",
649
+ unicode_compose, 1);
650
+
651
+ rb_define_module_function(mUnicode, "normalize_D",
652
+ unicode_decompose, 1);
653
+ rb_define_module_function(mUnicode, "normalize_KD",
654
+ unicode_decompose_compat, 1);
655
+ rb_define_module_function(mUnicode, "normalize_C",
656
+ unicode_normalize_C, 1);
657
+ rb_define_module_function(mUnicode, "normalize_KC",
658
+ unicode_normalize_KC, 1);
659
+
660
+ rb_define_module_function(mUnicode, "upcase",
661
+ unicode_upcase, 1);
662
+ rb_define_module_function(mUnicode, "downcase",
663
+ unicode_downcase, 1);
664
+ rb_define_module_function(mUnicode, "capitalize",
665
+ unicode_capitalize, 1);
666
+ }