unicode 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,107 @@
1
+ Unicode Library for Ruby
2
+ Version 0.1
3
+
4
+ Yoshida Masato
5
+
6
+
7
+ - Introduction
8
+
9
+ Unicode string manipulation library for Ruby.
10
+ This library is based on UTR #15 Unicode Normalization Forms(*1).
11
+
12
+ *1 <URL:http://www.unicode.org/unicode/reports/tr15/>
13
+
14
+
15
+ - Install
16
+
17
+ This works with ruby-1.4 or later; ruby-1.4.2 or later is
18
+ recommended.
19
+
20
+ Build and install in the usual way.
21
+ For example, when Ruby supports dynamic linking on your OS,
22
+
23
+ ruby extconf.rb
24
+ make
25
+ make install
26
+
27
+
28
+ - Usage
29
+
30
+ If you do not link this module with Ruby statically,
31
+
32
+ require "unicode"
33
+
34
+ before using.
35
+
36
+
37
+ - Module Functions
38
+
39
+ All parameters of functions must be UTF-8.
40
+
41
+ Unicode::strcmp(str1, str2)
42
+ Unicode::strcmp_compat(str1, str2)
43
+ Compares Unicode strings with normalization.
44
+ strcmp uses Normalization Form D, strcmp_compat uses
45
+ Normalization Form KD.
46
+
47
+ Unicode::decompose(str)
48
+ Unicode::decompose_compat(str)
49
+ Decompose Unicode string. Then the trailing characters
50
+ are sorted in canonical order.
51
+ decompose uses the canonical decomposition,
52
+ decompose_compat uses the compatibility decomposition.
53
+ The decomposition is based on the character decomposition
54
+ mapping in UnicodeData.txt and the Hangul decomposition
55
+ algorithm.
56
+
57
+ Unicode::compose(str)
58
+ Compose Unicode string. Before composing, the trailing
59
+ characters are sorted in canonical order.
60
+ The parameter must be decomposed.
61
+ The composition is based on the reverse of the
62
+ character decomposition mapping in UnicodeData.txt,
63
+ CompositionExclusions.txt and the Hangul composition
64
+ algorithm.
65
+
66
+ Unicode::normalize_D(str)
67
+ Unicode::normalize_KD(str)
68
+ Normalizes Unicode string in form D or form KD.
69
+ These are aliases of decompose/decompose_compat.
70
+
71
+ Unicode::normalize_C(str)
72
+ Unicode::normalize_KC(str)
73
+ Normalizes Unicode string in form C or form KC.
74
+ normalize_C = decompose + compose
75
+ normalize_KC = decompose_compat + compose
76
+
77
+ Unicode::upcase(str)
78
+ Unicode::downcase(str)
79
+ Unicode::capitalize(str)
80
+ Case conversion functions.
81
+ The mappings which these functions use are not normative
82
+ in UnicodeData.txt.
83
+
84
+ - Bugs
85
+
86
+ UTR #15 suggests that the look up for Normalization Form C
87
+ should not be implemented with a hash of string for better
88
+ performance.
89
+
90
+
91
+ - Copying
92
+
93
+ This extension module is copyrighted free software by
94
+ Yoshida Masato.
95
+
96
+ You can redistribute it and/or modify it under the same
97
+ terms as Ruby.
98
+
99
+
100
+ - Author
101
+
102
+ Yoshida Masato <yoshidam@yoshidam.net>
103
+
104
+
105
+ - History
106
+
107
+ Nov 23, 1999 version 0.1
data/bin/mkunidata.rb ADDED
@@ -0,0 +1,128 @@
1
#! /usr/local/bin/ruby -KU

## The script packs and unpacks code points with "U" and matches
## single characters with regexps, so it refuses to run unless the
## interpreter is in UTF-8 mode.
raise "$KCODE must be UTF8" unless $KCODE == 'UTF8'

## Text printed before the generated table rows.
HEAD=<<EOS
/*
* UnicodeData
* 1999 by yoshidam
*
*/

#ifndef _UNIDATA_MAP
#define _UNIDATA_MAP

struct unicode_data {
const int code;
const int combining_class;
const int exclusion;
const char* const canon;
const char* const compat;
const int uppercase;
const int lowercase;
const int titlecase;
};

const static struct unicode_data unidata[] = {
EOS

## Text printed after the generated table rows.
TAIL=<<EOS
};

#endif
EOS
36
+
37
## Convert a decomposition-mapping field into UTF-8 strings.
##
## hex:: the raw field, e.g. "0041 0301" or "<compat> 0020"; may be
##       nil or empty when the character has no decomposition.
##
## Returns [canon, compat]:
## * [nil, nil]   when there is no mapping,
## * [str, str]   for an untagged (canonical) mapping,
## * [nil, str]   for a mapping that starts with a <tag>.
## Raises RuntimeError when the first token is neither a code point
## nor a <tag>.
def hex2str(hex)
  return [nil, nil] if hex.nil? || hex == ''

  chars = hex.split(" ")
  ## Code points are written with 4 to 6 hex digits; accepting only 4
  ## would reject mappings that involve supplementary-plane characters.
  if chars[0] =~ /^[0-9A-F]{4,6}$/
    canon = ""
    chars.each do |c|
      canon << [c.hex].pack("U")
    end
    compat = canon
  elsif chars[0] =~ /^<.+>$/
    chars.shift                 ## drop the <tag>
    compat = ""
    chars.each do |c|
      compat << [c.hex].pack("U")
    end
    canon = nil
  else
    raise "unknown value: " + hex
  end
  [canon, compat]
end
60
+
61
## Format a case-mapping field as a C integer literal.
##
## str:: hexadecimal code point, or nil/"" when the character has no
##       such mapping.
##
## Returns "-1" for a missing mapping, otherwise "0x%04x" of the value.
## An empty string must be treated like nil: split(";") yields "" for
## a blank column that is followed by a non-blank one, and "".hex
## would otherwise be emitted as the bogus mapping 0x0000.
def hex_or_nil(str)
  return "-1" if str.nil? || str.empty?
  format("0x%04x", str.hex)
end
65
+
66
## Render a string as a C string literal, or NULL when it is absent.
##
## Printable ASCII bytes are copied as-is, except the double quote
## (34) and the backslash (92); every other byte is written as a
## three-digit octal escape.
def printstr(str)
  return "NULL" unless str

  escaped = ""
  str.each_byte do |byte|
    literal = byte >= 32 && byte < 127 && byte != 34 && byte != 92
    if literal
      escaped << byte
    else
      escaped << format("\\%03o", byte)
    end
  end
  '"' + escaped + '"'
end
78
+
79
## Read the composition exclusion list (second command-line argument).
## Comment lines and empty lines are skipped; the first
## whitespace-separated token of every other line is a hex code point.
exclusion = {}
open(ARGV[1]) do |f|
  f.each_line do |line|
    next if line =~ /^\#/ || line =~ /^$/
    exclusion[line.split(/\s/)[0].hex] = true
  end
end
89
+
90
## Read UnicodeData (first command-line argument).
## Each record is a ';'-separated line.  Only these columns are used:
##   0 code point, 3 combining class, 5 decomposition mapping,
##   12 uppercase, 13 lowercase, 14 titlecase.
## udata maps code point => [ccclass, canon, compat, upcase, lowcase,
## titlecase], with the case mappings already formatted as C literals.
udata = {}
open(ARGV[0]) do |f|
  f.each_line do |line|
    line.chomp!
    fields = line.split(";")
    code = fields[0].hex
    ccclass = fields[3].to_i
    canon, compat = hex2str(fields[5])
    upcase = hex_or_nil(fields[12])
    lowcase = hex_or_nil(fields[13])
    titlecase = hex_or_nil(fields[14])
    udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase]
  end
end
107
+
108
## Emit the C table: header, one row per character sorted by code
## point, a terminating sentinel row (code -1), then the trailer.
print HEAD
udata.sort.each do |code, data|
  ccclass, canon, compat, upcase, lowcase, titlecase = data
  ## Exclusions
  ##   0: may be recomposed
  ##   1: listed in the composition exclusion file
  ##   2: singleton decomposition
  ##   3: non-starter decomposition
  ex = 0
  if exclusion[code]  ## Script-specifics or Post Composition Version
    ex = 1
  elsif canon =~ /^.$/  ## Singletons
    ex = 2
  elsif !canon.nil?
    starter = canon.unpack("U*")[0]
    ## A first character with no record of its own is treated as
    ## combining class 0 instead of crashing on nil.
    starter_data = udata[starter]
    if starter_data && starter_data[0] != 0  ## Non-starter decompositions
      ex = 3
    end
  end
  printf(" { 0x%04x, %d, %d, %s, %s, %s, %s, %s }, \n",
         code, ccclass, ex, printstr(canon),
         printstr(compat), upcase, lowcase, titlecase)
end
printf(" { -1, 0, 0, NULL, NULL, -1, -1, -1 }\n")
print TAIL
data/ext/extconf.rb ADDED
@@ -0,0 +1,3 @@
1
## Build configuration for the "unicode" extension: load mkmf and
## generate the Makefile.
require 'mkmf'

create_makefile("unicode")
data/ext/unicode.c ADDED
@@ -0,0 +1,666 @@
1
+ /*
2
+ * Unicode Library version 0.1
3
+ * Nov 23, 1999 yoshidam
4
+ *
5
+ */
6
+
7
+ #include "ruby.h"
8
+ #include "rubyio.h"
9
+ #include <stdio.h>
10
+ #include "wstring.h"
11
+ #include "unidata.map"
12
+
13
+ static VALUE mUnicode;
14
+ static VALUE unicode_data;
15
+ static VALUE composition_table;
16
+
17
/*
 * Hangul
 *
 * Constants for the arithmetic Hangul syllable (de)composition used by
 * decompose_hangul() and compose_pair().
 */
#define SBASE (0xac00)  /* first precomposed syllable */
#define LBASE (0x1100)  /* first leading consonant (L) */
#define LCOUNT (19)     /* number of leading consonants */
#define VBASE (0x1161)  /* first vowel (V) */
#define VCOUNT (21)     /* number of vowels */
#define TBASE (0x11a7)  /* trailing consonant (T) base; index 0 means "no T" */
#define TCOUNT (28)     /* number of T indexes, including "no T" */
#define NCOUNT (VCOUNT * TCOUNT) /* 588 */
#define SCOUNT (LCOUNT * NCOUNT) /* 11172 */
27
+
28
+ static int
29
+ get_cc(int ucs)
30
+ {
31
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
32
+
33
+ if (!NIL_P(ch)) {
34
+ return unidata[FIX2INT(ch)].combining_class;
35
+ }
36
+ return 0;
37
+ }
38
+
39
+ static const char*
40
+ get_canon(int ucs)
41
+ {
42
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
43
+
44
+ if (!NIL_P(ch)) {
45
+ return unidata[FIX2INT(ch)].canon;
46
+ }
47
+ return NULL;
48
+ }
49
+
50
+ static const char*
51
+ get_compat(int ucs)
52
+ {
53
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
54
+
55
+ if (!NIL_P(ch)) {
56
+ return unidata[FIX2INT(ch)].compat;
57
+ }
58
+ return NULL;
59
+ }
60
+
61
+ static const int
62
+ get_uppercase(int ucs)
63
+ {
64
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
65
+
66
+ if (!NIL_P(ch)) {
67
+ int uc = unidata[FIX2INT(ch)].uppercase;
68
+ if (uc > 0) return uc;
69
+ }
70
+ return ucs;
71
+ }
72
+
73
+ static int
74
+ get_lowercase(int ucs)
75
+ {
76
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
77
+
78
+ if (!NIL_P(ch)) {
79
+ int lc = unidata[FIX2INT(ch)].lowercase;
80
+ if (lc > 0) return lc;
81
+ }
82
+ return ucs;
83
+ }
84
+
85
+ static int
86
+ get_titlecase(int ucs)
87
+ {
88
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
89
+
90
+ if (!NIL_P(ch)) {
91
+ int tc = unidata[FIX2INT(ch)].titlecase;
92
+ if (tc > 0) return tc;
93
+ }
94
+ return ucs;
95
+ }
96
+
97
+ static int
98
+ get_composition(const char* str)
99
+ {
100
+ VALUE ch = rb_hash_aref(composition_table, rb_str_new2(str));
101
+
102
+ if (!NIL_P(ch)) {
103
+ return FIX2INT(ch);
104
+ }
105
+ return -1;
106
+ }
107
+
108
+ static WString*
109
+ sort_canonical(WString* ustr)
110
+ {
111
+ int i = 1;
112
+ int len = ustr->len;
113
+
114
+ if (len < 2) return ustr;
115
+
116
+ while (i < len) {
117
+ int last = ustr->str[i - 1];
118
+ int ch = ustr->str[i];
119
+ int last_cc = get_cc(last);
120
+ int cc = get_cc(ch);
121
+ if (cc != 0 && last_cc != 0 && last_cc > cc) {
122
+ ustr->str[i] = last;
123
+ ustr->str[i-1] = ch;
124
+ if (i > 1) i--;
125
+ }
126
+ else {
127
+ i++;
128
+ }
129
+ }
130
+ return ustr;
131
+ }
132
+
133
+ static void
134
+ decompose_hangul(int ucs, int* l, int* v, int* t)
135
+ {
136
+ int sindex = ucs - SBASE;
137
+ if (sindex < 0 || sindex >= SCOUNT) {
138
+ *l = ucs;
139
+ *v = *t = 0;
140
+ return;
141
+ }
142
+ *l = LBASE + sindex / NCOUNT;
143
+ *v = VBASE + (sindex % NCOUNT) / TCOUNT;
144
+ *t = TBASE + sindex % TCOUNT;
145
+ if (*t == TBASE) *t = 0;
146
+ }
147
+
148
+ /*
149
+ * push decomposed str into result
150
+ */
151
+ static WString*
152
+ decompose_internal(WString* ustr, WString* result)
153
+ {
154
+ int i;
155
+ int len = ustr->len;
156
+
157
+ for (i = 0; i < len; i++) {
158
+ int ucs = ustr->str[i];
159
+ if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
160
+ int l, v, t;
161
+ decompose_hangul(ucs, &l, &v, &t);
162
+ WStr_addWChar(result, l);
163
+ if (v) WStr_addWChar(result, v);
164
+ if (t) WStr_addWChar(result, t);
165
+ }
166
+ else {
167
+ const char* dc = get_canon(ucs);
168
+ if (!dc) {
169
+ WStr_addWChar(result, ucs);
170
+ }
171
+ else {
172
+ WString wdc;
173
+ WStr_allocWithUTF8(&wdc, dc);
174
+ decompose_internal(&wdc, result);
175
+ WStr_free(&wdc);
176
+ }
177
+ }
178
+ }
179
+ return result;
180
+ }
181
+
182
+ /*
183
+ * push compatibility decomposed str into result
184
+ */
185
+ static WString*
186
+ decompose_compat_internal(WString* ustr, WString* result)
187
+ {
188
+ int i;
189
+ int len = ustr->len;
190
+
191
+ for (i = 0; i < len; i++) {
192
+ int ucs = ustr->str[i];
193
+ if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
194
+ int l, v, t;
195
+ decompose_hangul(ucs, &l, &v, &t);
196
+ WStr_addWChar(result, l);
197
+ if (v) WStr_addWChar(result, v);
198
+ if (t) WStr_addWChar(result, t);
199
+ }
200
+ else {
201
+ const char* dc = get_compat(ucs);
202
+ if (!dc) {
203
+ WStr_addWChar(result, ucs);
204
+ }
205
+ else {
206
+ WString wdc;
207
+ WStr_allocWithUTF8(&wdc, dc);
208
+ decompose_compat_internal(&wdc, result);
209
+ WStr_free(&wdc);
210
+ }
211
+ }
212
+ }
213
+ return result;
214
+ }
215
+
216
+
217
/*
 * Encode the code point `c` as UTF-8 at `p` and advance `p` past the
 * bytes written (1 to 6 bytes).
 *
 * p: modifiable pointer lvalue; it is incremented once per byte.
 * c: value to encode; it is evaluated several times, so it must have
 *    no side effects.  Values that match none of the ranges write
 *    nothing.
 *
 * Both arguments are parenthesized so that an expression such as
 * `hi | lo` is encoded as a whole.
 */
#define UCS4toUTF8(p, c) \
  do { \
    if ((c) < 128) { \
      *(p)++ = (c); \
    } \
    else if ((c) < 2048) { \
      *(p)++ = ((c) >> 6) | 192; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x10000) { \
      *(p)++ = ((c) >> 12) | 224; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x200000) { \
      *(p)++ = ((c) >> 18) | 240; \
      *(p)++ = (((c) >> 12) & 63) | 128; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x4000000) { \
      *(p)++ = ((c) >> 24) | 248; \
      *(p)++ = (((c) >> 18) & 63) | 128; \
      *(p)++ = (((c) >> 12) & 63) | 128; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x80000000) { \
      *(p)++ = ((c) >> 30) | 252; \
      *(p)++ = (((c) >> 24) & 63) | 128; \
      *(p)++ = (((c) >> 18) & 63) | 128; \
      *(p)++ = (((c) >> 12) & 63) | 128; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
  } while (0)
253
+
254
+ static int
255
+ compose_pair(int c1, int c2)
256
+ {
257
+ int ret;
258
+ char ustr[13]; /* stored two UTF-8 chars */
259
+ char *p = ustr;
260
+
261
+ /* Hangul L + V */
262
+ if (c1 >= LBASE && c1 < LBASE + LCOUNT &&
263
+ c2 >= VBASE && c2 < VBASE + VCOUNT) {
264
+ return SBASE + ((c1 - LBASE) * VCOUNT + (c2 - VBASE)) * TCOUNT;
265
+ }
266
+ /* Hangul LV + T */
267
+ else if (c1 >= SBASE && c1 < SBASE + SCOUNT &&
268
+ (c1 - SBASE) % TCOUNT == 0 &&
269
+ c2 >= TBASE && c2 < TBASE + TCOUNT) {
270
+ return c1 + (c2 - TBASE);
271
+ }
272
+ UCS4toUTF8(p, c1);
273
+ UCS4toUTF8(p, c2);
274
+ *p = '\0';
275
+ ret = get_composition(ustr);
276
+
277
+ return ret;
278
+ }
279
+
280
+ /*
281
+ * push canonical composed str into result
282
+ */
283
+ static WString*
284
+ compose_internal(WString* ustr, WString* result)
285
+ {
286
+ int len = ustr->len;
287
+ int starter;
288
+ int startercc;
289
+ int i;
290
+
291
+ if (len == 0) return result;
292
+
293
+ starter = ustr->str[0];
294
+ startercc = get_cc(starter);
295
+ if (startercc != 0) startercc = 256;
296
+ for (i = 1; i < len; i++) {
297
+ int ch = ustr->str[i];
298
+ int cc = get_cc(ch);
299
+ int composite;
300
+
301
+ if (startercc == 0 &&
302
+ (composite = compose_pair(starter, ch)) >= 0) {
303
+ starter = composite;
304
+ startercc = get_cc(composite);
305
+ }
306
+ else {
307
+ WStr_addWChar(result, starter);
308
+ starter = ch;
309
+ startercc = cc;
310
+ }
311
+ }
312
+ WStr_addWChar(result, starter);
313
+
314
+ return result;
315
+ }
316
+
317
+ static WString*
318
+ upcase_internal(WString* str)
319
+ {
320
+ int i;
321
+
322
+ for (i = 0; i < str->len; i++) {
323
+ int uc = get_uppercase(str->str[i]);
324
+ if (uc > 0) str->str[i] = uc;
325
+ }
326
+
327
+ return str;
328
+ }
329
+
330
+ static WString*
331
+ downcase_internal(WString* str)
332
+ {
333
+ int i;
334
+
335
+ for (i = 0; i < str->len; i++) {
336
+ int lc = get_lowercase(str->str[i]);
337
+ if (lc > 0) str->str[i] = lc;
338
+ }
339
+
340
+ return str;
341
+ }
342
+
343
+ static WString*
344
+ capitalize_internal(WString* str)
345
+ {
346
+ int i;
347
+
348
+ if (str->len > 1) {
349
+ int tc = get_titlecase(str->str[0]);
350
+ if (tc > 0) str->str[0] = tc;
351
+ }
352
+ for (i = 1; i < str->len; i++) {
353
+ int lc = get_lowercase(str->str[i]);
354
+ if (lc > 0) str->str[i] = lc;
355
+ }
356
+
357
+ return str;
358
+ }
359
+
360
+ static VALUE
361
+ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
362
+ {
363
+ WString wstr1;
364
+ WString wstr2;
365
+ WString result1;
366
+ WString result2;
367
+ UString ustr1;
368
+ UString ustr2;
369
+ int ret;
370
+
371
+ Check_Type(str1, T_STRING);
372
+ Check_Type(str2, T_STRING);
373
+ WStr_allocWithUTF8(&wstr1, RSTRING(str1)->ptr);
374
+ WStr_allocWithUTF8(&wstr2, RSTRING(str2)->ptr);
375
+ WStr_alloc(&result1);
376
+ WStr_alloc(&result2);
377
+ decompose_internal(&wstr1, &result1);
378
+ decompose_internal(&wstr2, &result2);
379
+ WStr_free(&wstr1);
380
+ WStr_free(&wstr2);
381
+ sort_canonical(&result1);
382
+ sort_canonical(&result2);
383
+ UStr_alloc(&ustr1);
384
+ UStr_alloc(&ustr2);
385
+ WStr_convertIntoUString(&result1, &ustr1);
386
+ WStr_convertIntoUString(&result2, &ustr2);
387
+ WStr_free(&result1);
388
+ WStr_free(&result2);
389
+ UStr_addChar(&ustr1, '\0');
390
+ UStr_addChar(&ustr2, '\0');
391
+ ret = strcmp(ustr1.str, ustr2.str);
392
+ UStr_free(&ustr1);
393
+ UStr_free(&ustr2);
394
+
395
+ return INT2FIX(ret);
396
+ }
397
+
398
+ static VALUE
399
+ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
400
+ {
401
+ WString wstr1;
402
+ WString wstr2;
403
+ WString result1;
404
+ WString result2;
405
+ UString ustr1;
406
+ UString ustr2;
407
+ int ret;
408
+
409
+ Check_Type(str1, T_STRING);
410
+ Check_Type(str2, T_STRING);
411
+ WStr_allocWithUTF8(&wstr1, RSTRING(str1)->ptr);
412
+ WStr_allocWithUTF8(&wstr2, RSTRING(str2)->ptr);
413
+ WStr_alloc(&result1);
414
+ WStr_alloc(&result2);
415
+ decompose_compat_internal(&wstr1, &result1);
416
+ decompose_compat_internal(&wstr2, &result2);
417
+ WStr_free(&wstr1);
418
+ WStr_free(&wstr2);
419
+ sort_canonical(&result1);
420
+ sort_canonical(&result2);
421
+ UStr_alloc(&ustr1);
422
+ UStr_alloc(&ustr2);
423
+ WStr_convertIntoUString(&result1, &ustr1);
424
+ WStr_convertIntoUString(&result2, &ustr2);
425
+ WStr_free(&result1);
426
+ WStr_free(&result2);
427
+ UStr_addChar(&ustr1, '\0');
428
+ UStr_addChar(&ustr2, '\0');
429
+ ret = strcmp(ustr1.str, ustr2.str);
430
+ UStr_free(&ustr1);
431
+ UStr_free(&ustr2);
432
+
433
+ return INT2FIX(ret);
434
+ }
435
+
436
+ static VALUE
437
+ unicode_decompose(VALUE obj, VALUE str)
438
+ {
439
+ WString ustr;
440
+ WString result;
441
+ UString ret;
442
+ VALUE vret;
443
+
444
+ Check_Type(str, T_STRING);
445
+ WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
446
+ WStr_alloc(&result);
447
+ decompose_internal(&ustr, &result);
448
+ WStr_free(&ustr);
449
+ sort_canonical(&result);
450
+ UStr_alloc(&ret);
451
+ WStr_convertIntoUString(&result, &ret);
452
+ WStr_free(&result);
453
+ vret = rb_str_new(ret.str, ret.len);
454
+ UStr_free(&ret);
455
+
456
+ return vret;
457
+ }
458
+
459
+ static VALUE
460
+ unicode_decompose_compat(VALUE obj, VALUE str)
461
+ {
462
+ WString ustr;
463
+ WString result;
464
+ UString ret;
465
+ VALUE vret;
466
+
467
+ Check_Type(str, T_STRING);
468
+ WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
469
+ WStr_alloc(&result);
470
+ decompose_compat_internal(&ustr, &result);
471
+ WStr_free(&ustr);
472
+ sort_canonical(&result);
473
+ UStr_alloc(&ret);
474
+ WStr_convertIntoUString(&result, &ret);
475
+ WStr_free(&result);
476
+ vret = rb_str_new(ret.str, ret.len);
477
+ UStr_free(&ret);
478
+
479
+ return vret;
480
+ }
481
+
482
+ static VALUE
483
+ unicode_compose(VALUE obj, VALUE str)
484
+ {
485
+ WString ustr;
486
+ WString result;
487
+ UString ret;
488
+ VALUE vret;
489
+
490
+ Check_Type(str, T_STRING);
491
+ WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
492
+ sort_canonical(&ustr);
493
+ WStr_alloc(&result);
494
+ compose_internal(&ustr, &result);
495
+ WStr_free(&ustr);
496
+ UStr_alloc(&ret);
497
+ WStr_convertIntoUString(&result, &ret);
498
+ WStr_free(&result);
499
+ vret = rb_str_new(ret.str, ret.len);
500
+ UStr_free(&ret);
501
+
502
+ return vret;
503
+ }
504
+
505
+ static VALUE
506
+ unicode_normalize_C(VALUE obj, VALUE str)
507
+ {
508
+ WString ustr1;
509
+ WString ustr2;
510
+ WString result;
511
+ UString ret;
512
+ VALUE vret;
513
+
514
+ Check_Type(str, T_STRING);
515
+ WStr_allocWithUTF8(&ustr1, RSTRING(str)->ptr);
516
+ WStr_alloc(&ustr2);
517
+ decompose_internal(&ustr1, &ustr2);
518
+ WStr_free(&ustr1);
519
+ sort_canonical(&ustr2);
520
+ WStr_alloc(&result);
521
+ compose_internal(&ustr2, &result);
522
+ WStr_free(&ustr2);
523
+ UStr_alloc(&ret);
524
+ WStr_convertIntoUString(&result, &ret);
525
+ WStr_free(&result);
526
+ vret = rb_str_new(ret.str, ret.len);
527
+ UStr_free(&ret);
528
+
529
+ return vret;
530
+ }
531
+
532
+ static VALUE
533
+ unicode_normalize_KC(VALUE obj, VALUE str)
534
+ {
535
+ WString ustr1;
536
+ WString ustr2;
537
+ WString result;
538
+ UString ret;
539
+ VALUE vret;
540
+
541
+ Check_Type(str, T_STRING);
542
+ WStr_allocWithUTF8(&ustr1, RSTRING(str)->ptr);
543
+ WStr_alloc(&ustr2);
544
+ decompose_compat_internal(&ustr1, &ustr2);
545
+ WStr_free(&ustr1);
546
+ sort_canonical(&ustr2);
547
+ WStr_alloc(&result);
548
+ compose_internal(&ustr2, &result);
549
+ WStr_free(&ustr2);
550
+ UStr_alloc(&ret);
551
+ WStr_convertIntoUString(&result, &ret);
552
+ WStr_free(&result);
553
+ vret = rb_str_new(ret.str, ret.len);
554
+ UStr_free(&ret);
555
+
556
+ return vret;
557
+ }
558
+
559
+ static VALUE
560
+ unicode_upcase(VALUE obj, VALUE str)
561
+ {
562
+ WString ustr;
563
+ UString ret;
564
+ VALUE vret;
565
+
566
+ Check_Type(str, T_STRING);
567
+ WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
568
+ upcase_internal(&ustr);
569
+ UStr_alloc(&ret);
570
+ WStr_convertIntoUString(&ustr, &ret);
571
+ WStr_free(&ustr);
572
+ vret = rb_str_new(ret.str, ret.len);
573
+ UStr_free(&ret);
574
+
575
+ return vret;
576
+ }
577
+
578
+ static VALUE
579
+ unicode_downcase(VALUE obj, VALUE str)
580
+ {
581
+ WString ustr;
582
+ UString ret;
583
+ VALUE vret;
584
+
585
+ Check_Type(str, T_STRING);
586
+ WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
587
+ downcase_internal(&ustr);
588
+ UStr_alloc(&ret);
589
+ WStr_convertIntoUString(&ustr, &ret);
590
+ WStr_free(&ustr);
591
+ vret = rb_str_new(ret.str, ret.len);
592
+ UStr_free(&ret);
593
+
594
+ return vret;
595
+ }
596
+
597
+ static VALUE
598
+ unicode_capitalize(VALUE obj, VALUE str)
599
+ {
600
+ WString ustr;
601
+ UString ret;
602
+ VALUE vret;
603
+
604
+ Check_Type(str, T_STRING);
605
+ WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
606
+ capitalize_internal(&ustr);
607
+ UStr_alloc(&ret);
608
+ WStr_convertIntoUString(&ustr, &ret);
609
+ WStr_free(&ustr);
610
+ vret = rb_str_new(ret.str, ret.len);
611
+ UStr_free(&ret);
612
+
613
+ return vret;
614
+ }
615
+
616
+ void
617
+ Init_unicode()
618
+ {
619
+ int i;
620
+
621
+ mUnicode = rb_define_module("Unicode");
622
+ unicode_data = rb_hash_new();
623
+ composition_table = rb_hash_new();
624
+
625
+ rb_global_variable(&unicode_data);
626
+ rb_global_variable(&composition_table);
627
+
628
+ for (i = 0; unidata[i].code != -1; i++) {
629
+ int code = unidata[i].code;
630
+ const char* canon = unidata[i].canon;
631
+ int exclusion = unidata[i].exclusion;
632
+
633
+ rb_hash_aset(unicode_data, INT2FIX(code), INT2FIX(i));
634
+ if (canon && exclusion == 0) {
635
+ rb_hash_aset(composition_table, rb_str_new2(canon), INT2FIX(code));
636
+ }
637
+ }
638
+
639
+ rb_define_module_function(mUnicode, "strcmp",
640
+ unicode_strcmp, 2);
641
+ rb_define_module_function(mUnicode, "strcmp_compat",
642
+ unicode_strcmp_compat, 2);
643
+
644
+ rb_define_module_function(mUnicode, "decompose",
645
+ unicode_decompose, 1);
646
+ rb_define_module_function(mUnicode, "decompose_compat",
647
+ unicode_decompose_compat, 1);
648
+ rb_define_module_function(mUnicode, "compose",
649
+ unicode_compose, 1);
650
+
651
+ rb_define_module_function(mUnicode, "normalize_D",
652
+ unicode_decompose, 1);
653
+ rb_define_module_function(mUnicode, "normalize_KD",
654
+ unicode_decompose_compat, 1);
655
+ rb_define_module_function(mUnicode, "normalize_C",
656
+ unicode_normalize_C, 1);
657
+ rb_define_module_function(mUnicode, "normalize_KC",
658
+ unicode_normalize_KC, 1);
659
+
660
+ rb_define_module_function(mUnicode, "upcase",
661
+ unicode_upcase, 1);
662
+ rb_define_module_function(mUnicode, "downcase",
663
+ unicode_downcase, 1);
664
+ rb_define_module_function(mUnicode, "capitalize",
665
+ unicode_capitalize, 1);
666
+ }