unicode 0.3.1-x86-mswin32-60

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,113 @@
1
+ Unicode Library for Ruby
2
+ Version 0.3.0
3
+
4
+ Yoshida Masato
5
+
6
+
7
+ - Introduction
8
+
9
+ Unicode string manipulation library for Ruby.
10
+ This library is based on UTR #15 Unicode Normalization Forms(*1).
11
+
12
+ *1 <URL:http://www.unicode.org/unicode/reports/tr15/>
13
+
14
+
15
+ - Install
16
+
17
+ This works with ruby-1.8 or later; ruby-1.8.1 or later
18
+ is recommended.
19
+
20
+ Build and install in the usual way.
21
+ For example, when Ruby supports dynamic linking on your OS,
22
+
23
+ ruby extconf.rb
24
+ make
25
+ make install
26
+
27
+
28
+ - Usage
29
+
30
+ If you do not link this module with Ruby statically,
31
+
32
+ require "unicode"
33
+
34
+ before using.
35
+
36
+
37
+ - Module Functions
38
+
39
+ All parameters of functions must be UTF-8 strings.
40
+
41
+ Unicode::strcmp(str1, str2)
42
+ Unicode::strcmp_compat(str1, str2)
43
+ Compare Unicode strings with a normalization.
44
+ strcmp uses the Normalization Form D, strcmp_compat uses
45
+ Normalization Form KD.
46
+
47
+ Unicode::decompose(str)
48
+ Unicode::decompose_compat(str)
49
+ Decompose Unicode string. Then the trailing characters
50
+ are sorted in canonical order.
51
+ decompose uses the canonical decomposition,
52
+ decompose_compat uses the compatibility decomposition.
53
+ The decomposition is based on the character decomposition
54
+ mapping in UnicodeData.txt and the Hangul decomposition
55
+ algorithm.
56
+
57
+ Unicode::compose(str)
58
+ Compose Unicode string. Before composing, the trailing
59
+ characters are sorted in canonical order.
60
+ The parameter must be decomposed.
61
+ The composition is based on the reverse of the
62
+ character decomposition mapping in UnicodeData.txt,
63
+ CompositionExclusions.txt and the Hangul composition
64
+ algorithm.
65
+
66
+ Unicode::normalize_D(str)
67
+ Unicode::normalize_KD(str)
68
+ Normalize Unicode string in form D or form KD.
69
+ These are aliases of decompose/decompose_compat.
70
+
71
+ Unicode::normalize_C(str)
72
+ Unicode::normalize_KC(str)
73
+ Normalize Unicode string in form C or form KC.
74
+ normalize_C = decompose + compose
75
+ normalize_KC = decompose_compat + compose
76
+
77
+ Unicode::upcase(str)
78
+ Unicode::downcase(str)
79
+ Unicode::capitalize(str)
80
+ Case conversion functions.
81
+ The mappings that are used by these functions are not normative
82
+ in UnicodeData.txt.
83
+
84
+ - Bugs
85
+
86
+ UTR #15 suggests that the look up for Normalization Form C
87
+ should not be implemented with a hash of string for better
88
+ performance.
89
+
90
+ Case conversion functions should reflect UTR #21.
91
+
92
+
93
+ - Copying
94
+
95
+ This extension module is copyrighted free software by
96
+ Yoshida Masato.
97
+
98
+ You can redistribute it and/or modify it under the same
99
+ terms as Ruby.
100
+
101
+
102
+ - Author
103
+
104
+ Yoshida Masato <yoshidam@yoshidam.net>
105
+
106
+
107
+ - History
108
+
109
+ Feb 26, 2010 version 0.3.0 fix a capitalize bug and support SpecialCasing
110
+ Dec 29, 2009 version 0.2.0 update for Ruby 1.9.1 and Unicode 5.2
111
+ Sep 10, 2005 version 0.1.2 update unidata.map for Unicode 4.1.0
112
+ Aug 26, 2004 version 0.1.1 update unidata.map for Unicode 4.0.1
113
+ Nov 23, 1999 version 0.1
data/Rakefile ADDED
@@ -0,0 +1,16 @@
1
# Build tasks for the unicode gem: packaging plus (cross-)compilation of
# the native extension.
require "rake/clean"
require "rake/extensiontask"
require "rubygems/package_task"

# `rake clean` also removes the packaging and build directories.
CLEAN << "pkg" << "tmp"

gem_spec = eval(File.read(File.expand_path("../unicode.gemspec", __FILE__)))

# "rubygems/package_task" (required above) defines Gem::PackageTask.
# Rake::GemPackageTask lives in a different file that is not required here.
Gem::PackageTask.new(gem_spec) { |pkg| }

Rake::ExtensionTask.new('unicode_native', gem_spec) do |ext|
  ext.cross_compile = true
  ext.cross_platform = ['x86-mingw32', 'x86-mswin32-60']
  ext.ext_dir = "ext/unicode"
  ext.lib_dir = "lib/unicode"
end
@@ -0,0 +1,3 @@
1
# Generate the Makefile for the extension target unicode/unicode_native.
require "mkmf"

create_makefile "unicode/unicode_native"
@@ -0,0 +1,789 @@
1
+ /*
2
+ * Unicode Library version 0.3
3
+ * Feb 26, 2010: version 0.3
4
+ * Dec 29, 2009: version 0.2
5
+ * Nov 23, 1999 yoshidam
6
+ *
7
+ */
8
+
9
+ #include "ruby.h"
10
+ #ifdef HAVE_RUBY_IO_H
11
+ # include "ruby/io.h"
12
+ #else
13
+ # include "rubyio.h"
14
+ #endif
15
+ #include <stdio.h>
16
+ #include "wstring.h"
17
+ #include "unidata.map"
18
+
19
+ #ifndef RSTRING_PTR
20
+ # define RSTRING_PTR(s) (RSTRING(s)->ptr)
21
+ # define RSTRING_LEN(s) (RSTRING(s)->len)
22
+ #endif
23
+
24
+ #ifdef HAVE_RUBY_ENCODING_H
25
+ static rb_encoding* enc_out;
26
+ # define ENC_(o) (rb_enc_associate(o, enc_out))
27
+ #else
28
+ # define ENC_(o) (o)
29
+ #endif
30
+
31
+ inline static VALUE
32
+ taintObject(VALUE src, VALUE obj) {
33
+ if (OBJ_TAINTED(src))
34
+ OBJ_TAINT(obj);
35
+ return obj;
36
+ }
37
+ #define TO_(src, obj) (taintObject(src, obj))
38
+
39
#ifdef HAVE_RUBY_ENCODING_H
/*
 * Replace `str` with a copy transcoded to the `enc_out` encoding, unless
 * its encoding index is already the UTF-8 or US-ASCII one.
 * `str` is assigned to, so it must be a modifiable VALUE lvalue.
 */
# define CONVERT_TO_UTF8(str) do { \
    const int encindex = ENCODING_GET(str); \
    if (!(encindex == rb_utf8_encindex() || \
          encindex == rb_usascii_encindex())) { \
      volatile VALUE encobj = rb_enc_from_encoding(enc_out); \
      str = rb_str_encode(str, encobj, 0, Qnil); \
    } \
  } while (0)
#endif
50
+
51
+ static VALUE mUnicode;
52
+ static VALUE unicode_data;
53
+ static VALUE composition_table;
54
+
55
+ /* Hangul */
56
+ #define SBASE (0xac00)
57
+ #define LBASE (0x1100)
58
+ #define LCOUNT (19)
59
+ #define VBASE (0x1161)
60
+ #define VCOUNT (21)
61
+ #define TBASE (0x11a7)
62
+ #define TCOUNT (28)
63
+ #define NCOUNT (VCOUNT * TCOUNT) /* 588 */
64
+ #define SCOUNT (LCOUNT * NCOUNT) /* 11172 */
65
+
66
+ static int
67
+ get_cc(int ucs)
68
+ {
69
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
70
+
71
+ if (!NIL_P(ch)) {
72
+ return unidata[FIX2INT(ch)].combining_class;
73
+ }
74
+ return 0;
75
+ }
76
+
77
+ static const char*
78
+ get_canon(int ucs)
79
+ {
80
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
81
+
82
+ if (!NIL_P(ch)) {
83
+ return unidata[FIX2INT(ch)].canon;
84
+ }
85
+ return NULL;
86
+ }
87
+
88
+ static const char*
89
+ get_compat(int ucs)
90
+ {
91
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
92
+
93
+ if (!NIL_P(ch)) {
94
+ return unidata[FIX2INT(ch)].compat;
95
+ }
96
+ return NULL;
97
+ }
98
+
99
+ static const char*
100
+ get_uppercase(int ucs)
101
+ {
102
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
103
+
104
+ if (!NIL_P(ch)) {
105
+ return unidata[FIX2INT(ch)].uppercase;
106
+ }
107
+ return NULL;
108
+ }
109
+
110
+ static const char*
111
+ get_lowercase(int ucs)
112
+ {
113
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
114
+
115
+ if (!NIL_P(ch)) {
116
+ return unidata[FIX2INT(ch)].lowercase;
117
+ }
118
+ return NULL;
119
+ }
120
+
121
+ static const char*
122
+ get_titlecase(int ucs)
123
+ {
124
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
125
+
126
+ if (!NIL_P(ch)) {
127
+ return unidata[FIX2INT(ch)].titlecase;
128
+ }
129
+ return NULL;
130
+ }
131
+
132
+ static int
133
+ get_composition(const char* str)
134
+ {
135
+ VALUE ch = rb_hash_aref(composition_table, rb_str_new2(str));
136
+
137
+ if (!NIL_P(ch)) {
138
+ return FIX2INT(ch);
139
+ }
140
+ return -1;
141
+ }
142
+
143
+ static WString*
144
+ sort_canonical(WString* ustr)
145
+ {
146
+ int i = 1;
147
+ int len = ustr->len;
148
+
149
+ if (len < 2) return ustr;
150
+
151
+ while (i < len) {
152
+ int last = ustr->str[i - 1];
153
+ int ch = ustr->str[i];
154
+ int last_cc = get_cc(last);
155
+ int cc = get_cc(ch);
156
+ if (cc != 0 && last_cc != 0 && last_cc > cc) {
157
+ ustr->str[i] = last;
158
+ ustr->str[i-1] = ch;
159
+ if (i > 1) i--;
160
+ }
161
+ else {
162
+ i++;
163
+ }
164
+ }
165
+ return ustr;
166
+ }
167
+
168
+ static void
169
+ decompose_hangul(int ucs, int* l, int* v, int* t)
170
+ {
171
+ int sindex = ucs - SBASE;
172
+ if (sindex < 0 || sindex >= SCOUNT) {
173
+ *l = ucs;
174
+ *v = *t = 0;
175
+ return;
176
+ }
177
+ *l = LBASE + sindex / NCOUNT;
178
+ *v = VBASE + (sindex % NCOUNT) / TCOUNT;
179
+ *t = TBASE + sindex % TCOUNT;
180
+ if (*t == TBASE) *t = 0;
181
+ }
182
+
183
+ /*
184
+ * push decomposed str into result
185
+ */
186
+ static WString*
187
+ decompose_internal(WString* ustr, WString* result)
188
+ {
189
+ int i;
190
+ int len = ustr->len;
191
+
192
+ for (i = 0; i < len; i++) {
193
+ int ucs = ustr->str[i];
194
+ if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
195
+ int l, v, t;
196
+ decompose_hangul(ucs, &l, &v, &t);
197
+ WStr_addWChar(result, l);
198
+ if (v) WStr_addWChar(result, v);
199
+ if (t) WStr_addWChar(result, t);
200
+ }
201
+ else {
202
+ const char* dc = get_canon(ucs);
203
+ if (!dc) {
204
+ WStr_addWChar(result, ucs);
205
+ }
206
+ else {
207
+ WString wdc;
208
+ WStr_allocWithUTF8(&wdc, dc);
209
+ decompose_internal(&wdc, result);
210
+ WStr_free(&wdc);
211
+ }
212
+ }
213
+ }
214
+ return result;
215
+ }
216
+
217
+ /*
218
+ * push compatibility decomposed str into result
219
+ */
220
+ static WString*
221
+ decompose_compat_internal(WString* ustr, WString* result)
222
+ {
223
+ int i;
224
+ int len = ustr->len;
225
+
226
+ for (i = 0; i < len; i++) {
227
+ int ucs = ustr->str[i];
228
+ if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
229
+ int l, v, t;
230
+ decompose_hangul(ucs, &l, &v, &t);
231
+ WStr_addWChar(result, l);
232
+ if (v) WStr_addWChar(result, v);
233
+ if (t) WStr_addWChar(result, t);
234
+ }
235
+ else {
236
+ const char* dc = get_compat(ucs);
237
+ if (!dc) {
238
+ WStr_addWChar(result, ucs);
239
+ }
240
+ else {
241
+ WString wdc;
242
+ WStr_allocWithUTF8(&wdc, dc);
243
+ decompose_compat_internal(&wdc, result);
244
+ WStr_free(&wdc);
245
+ }
246
+ }
247
+ }
248
+ return result;
249
+ }
250
+
251
+
252
/*
 * Write the UTF-8 encoding of code point `c` at `p` and advance `p` past
 * the bytes written (1 to 6 bytes; nothing is written for values of
 * 0x80000000 and above).
 *
 * `p` must be a modifiable pointer lvalue with room for the sequence.
 * `c` is converted to unsigned int and evaluated exactly once, so
 * expressions such as `hi | lo` or `cp++` are safe arguments.
 */
#define UCS4toUTF8(p, c)                                              \
  do {                                                                \
    const unsigned int ucs4_cp_ = (unsigned int)(c);                  \
    if (ucs4_cp_ < 0x80U) {                                           \
      *(p)++ = (char)ucs4_cp_;                                        \
    }                                                                 \
    else if (ucs4_cp_ < 0x800U) {                                     \
      *(p)++ = (char)((ucs4_cp_ >> 6) | 192);                         \
      *(p)++ = (char)((ucs4_cp_ & 63) | 128);                         \
    }                                                                 \
    else if (ucs4_cp_ < 0x10000U) {                                   \
      *(p)++ = (char)((ucs4_cp_ >> 12) | 224);                        \
      *(p)++ = (char)(((ucs4_cp_ >> 6) & 63) | 128);                  \
      *(p)++ = (char)((ucs4_cp_ & 63) | 128);                         \
    }                                                                 \
    else if (ucs4_cp_ < 0x200000U) {                                  \
      *(p)++ = (char)((ucs4_cp_ >> 18) | 240);                        \
      *(p)++ = (char)(((ucs4_cp_ >> 12) & 63) | 128);                 \
      *(p)++ = (char)(((ucs4_cp_ >> 6) & 63) | 128);                  \
      *(p)++ = (char)((ucs4_cp_ & 63) | 128);                         \
    }                                                                 \
    else if (ucs4_cp_ < 0x4000000U) {                                 \
      *(p)++ = (char)((ucs4_cp_ >> 24) | 248);                        \
      *(p)++ = (char)(((ucs4_cp_ >> 18) & 63) | 128);                 \
      *(p)++ = (char)(((ucs4_cp_ >> 12) & 63) | 128);                 \
      *(p)++ = (char)(((ucs4_cp_ >> 6) & 63) | 128);                  \
      *(p)++ = (char)((ucs4_cp_ & 63) | 128);                         \
    }                                                                 \
    else if (ucs4_cp_ < 0x80000000U) {                                \
      *(p)++ = (char)((ucs4_cp_ >> 30) | 252);                        \
      *(p)++ = (char)(((ucs4_cp_ >> 24) & 63) | 128);                 \
      *(p)++ = (char)(((ucs4_cp_ >> 18) & 63) | 128);                 \
      *(p)++ = (char)(((ucs4_cp_ >> 12) & 63) | 128);                 \
      *(p)++ = (char)(((ucs4_cp_ >> 6) & 63) | 128);                  \
      *(p)++ = (char)((ucs4_cp_ & 63) | 128);                         \
    }                                                                 \
  } while (0)
288
+
289
+ static int
290
+ compose_pair(unsigned int c1, unsigned int c2)
291
+ {
292
+ int ret;
293
+ char ustr[13]; /* stored two UTF-8 chars */
294
+ char *p = ustr;
295
+
296
+ /* Hangul L + V */
297
+ if (c1 >= LBASE && c1 < LBASE + LCOUNT &&
298
+ c2 >= VBASE && c2 < VBASE + VCOUNT) {
299
+ return SBASE + ((c1 - LBASE) * VCOUNT + (c2 - VBASE)) * TCOUNT;
300
+ }
301
+ /* Hangul LV + T */
302
+ else if (c1 >= SBASE && c1 < SBASE + SCOUNT &&
303
+ (c1 - SBASE) % TCOUNT == 0 &&
304
+ c2 >= TBASE && c2 < TBASE + TCOUNT) {
305
+ return c1 + (c2 - TBASE);
306
+ }
307
+ UCS4toUTF8(p, c1);
308
+ UCS4toUTF8(p, c2);
309
+ *p = '\0';
310
+ ret = get_composition(ustr);
311
+
312
+ return ret;
313
+ }
314
+
315
+ /*
316
+ * push canonical composed str into result
317
+ */
318
+ static WString*
319
+ compose_internal(WString* ustr, WString* result)
320
+ {
321
+ int len = ustr->len;
322
+ int starter;
323
+ int startercc;
324
+ int i;
325
+
326
+ if (len == 0) return result;
327
+
328
+ starter = ustr->str[0];
329
+ startercc = get_cc(starter);
330
+ if (startercc != 0) startercc = 256;
331
+ for (i = 1; i < len; i++) {
332
+ int ch = ustr->str[i];
333
+ int cc = get_cc(ch);
334
+ int composite;
335
+
336
+ if (startercc == 0 &&
337
+ (composite = compose_pair(starter, ch)) >= 0) {
338
+ starter = composite;
339
+ startercc = get_cc(composite);
340
+ }
341
+ else {
342
+ WStr_addWChar(result, starter);
343
+ starter = ch;
344
+ startercc = cc;
345
+ }
346
+ }
347
+ WStr_addWChar(result, starter);
348
+
349
+ return result;
350
+ }
351
+
352
+ static WString*
353
+ upcase_internal(WString* str, WString* result)
354
+ {
355
+ int i;
356
+ int len = str->len;
357
+
358
+ for (i = 0; i < len; i++) {
359
+ int ucs = str->str[i];
360
+ const char* c = get_uppercase(ucs);
361
+ if (!c) {
362
+ WStr_addWChar(result, ucs);
363
+ }
364
+ else {
365
+ WString wc;
366
+ WStr_allocWithUTF8(&wc, c);
367
+ WStr_pushWString(result, &wc);
368
+ WStr_free(&wc);
369
+ }
370
+ }
371
+ return result;
372
+ }
373
+
374
+ static WString*
375
+ downcase_internal(WString* str, WString* result)
376
+ {
377
+ int i;
378
+ int len = str->len;
379
+
380
+ for (i = 0; i < len; i++) {
381
+ int ucs = str->str[i];
382
+ const char* c = get_lowercase(ucs);
383
+ if (!c) {
384
+ WStr_addWChar(result, ucs);
385
+ }
386
+ else {
387
+ WString wc;
388
+ WStr_allocWithUTF8(&wc, c);
389
+ WStr_pushWString(result, &wc);
390
+ WStr_free(&wc);
391
+ }
392
+ }
393
+ return result;
394
+ }
395
+
396
+ static WString*
397
+ capitalize_internal(WString* str, WString* result)
398
+ {
399
+ int i;
400
+ int len = str->len;
401
+
402
+ if (len > 0) {
403
+ const char* c = get_titlecase(str->str[0]);
404
+ if (!c) {
405
+ WStr_addWChar(result, str->str[0]);
406
+ }
407
+ else {
408
+ WString wc;
409
+ WStr_allocWithUTF8(&wc, c);
410
+ WStr_pushWString(result, &wc);
411
+ WStr_free(&wc);
412
+ }
413
+ }
414
+ for (i = 1; i < len; i++) {
415
+ int ucs = str->str[i];
416
+ const char* c = get_lowercase(ucs);
417
+ if (!c) {
418
+ WStr_addWChar(result, ucs);
419
+ }
420
+ else {
421
+ WString wc;
422
+ WStr_allocWithUTF8(&wc, c);
423
+ WStr_pushWString(result, &wc);
424
+ WStr_free(&wc);
425
+ }
426
+ }
427
+ return result;
428
+ }
429
+
430
+ static VALUE
431
+ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
432
+ {
433
+ WString wstr1;
434
+ WString wstr2;
435
+ WString result1;
436
+ WString result2;
437
+ UString ustr1;
438
+ UString ustr2;
439
+ int ret;
440
+
441
+ Check_Type(str1, T_STRING);
442
+ Check_Type(str2, T_STRING);
443
+ #ifdef HAVE_RUBY_ENCODING_H
444
+ CONVERT_TO_UTF8(str1);
445
+ CONVERT_TO_UTF8(str2);
446
+ #endif
447
+ WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
448
+ WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
449
+ WStr_alloc(&result1);
450
+ WStr_alloc(&result2);
451
+ decompose_internal(&wstr1, &result1);
452
+ decompose_internal(&wstr2, &result2);
453
+ WStr_free(&wstr1);
454
+ WStr_free(&wstr2);
455
+ sort_canonical(&result1);
456
+ sort_canonical(&result2);
457
+ UniStr_alloc(&ustr1);
458
+ UniStr_alloc(&ustr2);
459
+ WStr_convertIntoUString(&result1, &ustr1);
460
+ WStr_convertIntoUString(&result2, &ustr2);
461
+ WStr_free(&result1);
462
+ WStr_free(&result2);
463
+ UniStr_addChar(&ustr1, '\0');
464
+ UniStr_addChar(&ustr2, '\0');
465
+ ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
466
+ UniStr_free(&ustr1);
467
+ UniStr_free(&ustr2);
468
+
469
+ return INT2FIX(ret);
470
+ }
471
+
472
+ static VALUE
473
+ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
474
+ {
475
+ WString wstr1;
476
+ WString wstr2;
477
+ WString result1;
478
+ WString result2;
479
+ UString ustr1;
480
+ UString ustr2;
481
+ int ret;
482
+
483
+ Check_Type(str1, T_STRING);
484
+ Check_Type(str2, T_STRING);
485
+ #ifdef HAVE_RUBY_ENCODING_H
486
+ CONVERT_TO_UTF8(str1);
487
+ CONVERT_TO_UTF8(str2);
488
+ #endif
489
+ WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
490
+ WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
491
+ WStr_alloc(&result1);
492
+ WStr_alloc(&result2);
493
+ decompose_compat_internal(&wstr1, &result1);
494
+ decompose_compat_internal(&wstr2, &result2);
495
+ WStr_free(&wstr1);
496
+ WStr_free(&wstr2);
497
+ sort_canonical(&result1);
498
+ sort_canonical(&result2);
499
+ UniStr_alloc(&ustr1);
500
+ UniStr_alloc(&ustr2);
501
+ WStr_convertIntoUString(&result1, &ustr1);
502
+ WStr_convertIntoUString(&result2, &ustr2);
503
+ WStr_free(&result1);
504
+ WStr_free(&result2);
505
+ UniStr_addChar(&ustr1, '\0');
506
+ UniStr_addChar(&ustr2, '\0');
507
+ ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
508
+ UniStr_free(&ustr1);
509
+ UniStr_free(&ustr2);
510
+
511
+ return INT2FIX(ret);
512
+ }
513
+
514
+ static VALUE
515
+ unicode_decompose(VALUE obj, VALUE str)
516
+ {
517
+ WString ustr;
518
+ WString result;
519
+ UString ret;
520
+ VALUE vret;
521
+
522
+ Check_Type(str, T_STRING);
523
+ #ifdef HAVE_RUBY_ENCODING_H
524
+ CONVERT_TO_UTF8(str);
525
+ #endif
526
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
527
+ WStr_alloc(&result);
528
+ decompose_internal(&ustr, &result);
529
+ WStr_free(&ustr);
530
+ sort_canonical(&result);
531
+ UniStr_alloc(&ret);
532
+ WStr_convertIntoUString(&result, &ret);
533
+ WStr_free(&result);
534
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
535
+ UniStr_free(&ret);
536
+
537
+ return vret;
538
+ }
539
+
540
+ static VALUE
541
+ unicode_decompose_compat(VALUE obj, VALUE str)
542
+ {
543
+ WString ustr;
544
+ WString result;
545
+ UString ret;
546
+ VALUE vret;
547
+
548
+ Check_Type(str, T_STRING);
549
+ #ifdef HAVE_RUBY_ENCODING_H
550
+ CONVERT_TO_UTF8(str);
551
+ #endif
552
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
553
+ WStr_alloc(&result);
554
+ decompose_compat_internal(&ustr, &result);
555
+ WStr_free(&ustr);
556
+ sort_canonical(&result);
557
+ UniStr_alloc(&ret);
558
+ WStr_convertIntoUString(&result, &ret);
559
+ WStr_free(&result);
560
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
561
+ UniStr_free(&ret);
562
+
563
+ return vret;
564
+ }
565
+
566
+ static VALUE
567
+ unicode_compose(VALUE obj, VALUE str)
568
+ {
569
+ WString ustr;
570
+ WString result;
571
+ UString ret;
572
+ VALUE vret;
573
+
574
+ Check_Type(str, T_STRING);
575
+ #ifdef HAVE_RUBY_ENCODING_H
576
+ CONVERT_TO_UTF8(str);
577
+ #endif
578
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
579
+ sort_canonical(&ustr);
580
+ WStr_alloc(&result);
581
+ compose_internal(&ustr, &result);
582
+ WStr_free(&ustr);
583
+ UniStr_alloc(&ret);
584
+ WStr_convertIntoUString(&result, &ret);
585
+ WStr_free(&result);
586
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
587
+ UniStr_free(&ret);
588
+
589
+ return vret;
590
+ }
591
+
592
+ static VALUE
593
+ unicode_normalize_C(VALUE obj, VALUE str)
594
+ {
595
+ WString ustr1;
596
+ WString ustr2;
597
+ WString result;
598
+ UString ret;
599
+ VALUE vret;
600
+
601
+ Check_Type(str, T_STRING);
602
+ #ifdef HAVE_RUBY_ENCODING_H
603
+ CONVERT_TO_UTF8(str);
604
+ #endif
605
+ WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
606
+ WStr_alloc(&ustr2);
607
+ decompose_internal(&ustr1, &ustr2);
608
+ WStr_free(&ustr1);
609
+ sort_canonical(&ustr2);
610
+ WStr_alloc(&result);
611
+ compose_internal(&ustr2, &result);
612
+ WStr_free(&ustr2);
613
+ UniStr_alloc(&ret);
614
+ WStr_convertIntoUString(&result, &ret);
615
+ WStr_free(&result);
616
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
617
+ UniStr_free(&ret);
618
+
619
+ return vret;
620
+ }
621
+
622
+ static VALUE
623
+ unicode_normalize_KC(VALUE obj, VALUE str)
624
+ {
625
+ WString ustr1;
626
+ WString ustr2;
627
+ WString result;
628
+ UString ret;
629
+ VALUE vret;
630
+
631
+ Check_Type(str, T_STRING);
632
+ #ifdef HAVE_RUBY_ENCODING_H
633
+ CONVERT_TO_UTF8(str);
634
+ #endif
635
+ WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
636
+ WStr_alloc(&ustr2);
637
+ decompose_compat_internal(&ustr1, &ustr2);
638
+ WStr_free(&ustr1);
639
+ sort_canonical(&ustr2);
640
+ WStr_alloc(&result);
641
+ compose_internal(&ustr2, &result);
642
+ WStr_free(&ustr2);
643
+ UniStr_alloc(&ret);
644
+ WStr_convertIntoUString(&result, &ret);
645
+ WStr_free(&result);
646
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
647
+ UniStr_free(&ret);
648
+
649
+ return vret;
650
+ }
651
+
652
+ static VALUE
653
+ unicode_upcase(VALUE obj, VALUE str)
654
+ {
655
+ WString ustr;
656
+ WString result;
657
+ UString ret;
658
+ VALUE vret;
659
+
660
+ Check_Type(str, T_STRING);
661
+ #ifdef HAVE_RUBY_ENCODING_H
662
+ CONVERT_TO_UTF8(str);
663
+ #endif
664
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
665
+ WStr_alloc(&result);
666
+ upcase_internal(&ustr, &result);
667
+ //sort_canonical(&result);
668
+ WStr_free(&ustr);
669
+ UniStr_alloc(&ret);
670
+ WStr_convertIntoUString(&result, &ret);
671
+ WStr_free(&result);
672
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
673
+ UniStr_free(&ret);
674
+
675
+ return vret;
676
+ }
677
+
678
+ static VALUE
679
+ unicode_downcase(VALUE obj, VALUE str)
680
+ {
681
+ WString ustr;
682
+ WString result;
683
+ UString ret;
684
+ VALUE vret;
685
+
686
+ Check_Type(str, T_STRING);
687
+ #ifdef HAVE_RUBY_ENCODING_H
688
+ CONVERT_TO_UTF8(str);
689
+ #endif
690
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
691
+ WStr_alloc(&result);
692
+ downcase_internal(&ustr, &result);
693
+ //sort_canonical(&result);
694
+ WStr_free(&ustr);
695
+ UniStr_alloc(&ret);
696
+ WStr_convertIntoUString(&result, &ret);
697
+ WStr_free(&result);
698
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
699
+ UniStr_free(&ret);
700
+
701
+ return vret;
702
+ }
703
+
704
+ #ifdef HAVE_RUBY_ENCODING_H
705
+
706
+
707
+ #endif
708
+
709
+ static VALUE
710
+ unicode_capitalize(VALUE obj, VALUE str)
711
+ {
712
+ WString ustr;
713
+ WString result;
714
+ UString ret;
715
+ VALUE vret;
716
+
717
+ Check_Type(str, T_STRING);
718
+ #ifdef HAVE_RUBY_ENCODING_H
719
+ CONVERT_TO_UTF8(str);
720
+ #endif
721
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
722
+ WStr_alloc(&result);
723
+ capitalize_internal(&ustr, &result);
724
+ //sort_canonical(&result);
725
+ WStr_free(&ustr);
726
+ UniStr_alloc(&ret);
727
+ WStr_convertIntoUString(&result, &ret);
728
+ WStr_free(&result);
729
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
730
+ UniStr_free(&ret);
731
+
732
+ return vret;
733
+ }
734
+
735
+ void
736
+ Init_unicode_native()
737
+ {
738
+ int i;
739
+
740
+ #ifdef HAVE_RUBY_ENCODING_H
741
+ enc_out = rb_utf8_encoding();
742
+ #endif
743
+
744
+ mUnicode = rb_define_module("Unicode");
745
+ unicode_data = rb_hash_new();
746
+ composition_table = rb_hash_new();
747
+
748
+ rb_global_variable(&unicode_data);
749
+ rb_global_variable(&composition_table);
750
+
751
+ for (i = 0; unidata[i].code != -1; i++) {
752
+ int code = unidata[i].code;
753
+ const char* canon = unidata[i].canon;
754
+ int exclusion = unidata[i].exclusion;
755
+
756
+ rb_hash_aset(unicode_data, INT2FIX(code), INT2FIX(i));
757
+ if (canon && exclusion == 0) {
758
+ rb_hash_aset(composition_table, rb_str_new2(canon), INT2FIX(code));
759
+ }
760
+ }
761
+
762
+ rb_define_module_function(mUnicode, "strcmp",
763
+ unicode_strcmp, 2);
764
+ rb_define_module_function(mUnicode, "strcmp_compat",
765
+ unicode_strcmp_compat, 2);
766
+
767
+ rb_define_module_function(mUnicode, "decompose",
768
+ unicode_decompose, 1);
769
+ rb_define_module_function(mUnicode, "decompose_compat",
770
+ unicode_decompose_compat, 1);
771
+ rb_define_module_function(mUnicode, "compose",
772
+ unicode_compose, 1);
773
+
774
+ rb_define_module_function(mUnicode, "normalize_D",
775
+ unicode_decompose, 1);
776
+ rb_define_module_function(mUnicode, "normalize_KD",
777
+ unicode_decompose_compat, 1);
778
+ rb_define_module_function(mUnicode, "normalize_C",
779
+ unicode_normalize_C, 1);
780
+ rb_define_module_function(mUnicode, "normalize_KC",
781
+ unicode_normalize_KC, 1);
782
+
783
+ rb_define_module_function(mUnicode, "upcase",
784
+ unicode_upcase, 1);
785
+ rb_define_module_function(mUnicode, "downcase",
786
+ unicode_downcase, 1);
787
+ rb_define_module_function(mUnicode, "capitalize",
788
+ unicode_capitalize, 1);
789
+ }