unicode 0.3.1-x86-mswin32-60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,113 @@
1
+ Unicode Library for Ruby
2
+ Version 0.3.0
3
+
4
+ Yoshida Masato
5
+
6
+
7
+ - Introduction
8
+
9
+ Unicode string manipulation library for Ruby.
10
+ This library is based on UTR #15 Unicode Normalization Forms(*1).
11
+
12
+ *1 <URL:http://www.unicode.org/unicode/reports/tr15/>
13
+
14
+
15
+ - Install
16
+
17
+ This works with ruby-1.8 or later; ruby-1.8.1 or later is
18
+ recommended.
19
+
20
+ Build and install in the usual way.
21
+ For example, when Ruby supports dynamic linking on your OS,
22
+
23
+ ruby extconf.rb
24
+ make
25
+ make install
26
+
27
+
28
+ - Usage
29
+
30
+ If you do not link this module with Ruby statically,
31
+
32
+ require "unicode"
33
+
34
+ before using.
35
+
36
+
37
+ - Module Functions
38
+
39
+ All parameters of functions must be UTF-8 strings.
40
+
41
+ Unicode::strcmp(str1, str2)
42
+ Unicode::strcmp_compat(str1, str2)
43
+ Compare Unicode strings with a normalization.
44
+ strcmp uses the Normalization Form D, strcmp_compat uses
45
+ Normalization Form KD.
46
+
47
+ Unicode::decompose(str)
48
+ Unicode::decompose_compat(str)
49
+ Decompose Unicode string. Then the trailing characters
50
+ are sorted in canonical order.
51
+ decompose uses the canonical decomposition,
52
+ decompose_compat uses the compatibility decomposition.
53
+ The decomposition is based on the character decomposition
54
+ mapping in UnicodeData.txt and the Hangul decomposition
55
+ algorithm.
56
+
57
+ Unicode::compose(str)
58
+ Compose Unicode string. Before composing, the trailing
59
+ characters are sorted in canonical order.
60
+ The parameter must be decomposed.
61
+ The composition is based on the reverse of the
62
+ character decomposition mapping in UnicodeData.txt,
63
+ CompositionExclusions.txt and the Hangul composition
64
+ algorithm.
65
+
66
+ Unicode::normalize_D(str)
67
+ Unicode::normalize_KD(str)
68
+ Normalize Unicode string in form D or form KD.
69
+ These are aliases of decompose/decompose_compat.
70
+
71
+ Unicode::normalize_C(str)
72
+ Unicode::normalize_KC(str)
73
+ Normalize Unicode string in form C or form KC.
74
+ normalize_C = decompose + compose
75
+ normalize_KC = decompose_compat + compose
76
+
77
+ Unicode::upcase(str)
78
+ Unicode::downcase(str)
79
+ Unicode::capitalize(str)
80
+ Case conversion functions.
81
+ The mappings that are used by these functions are not normative
82
+ in UnicodeData.txt.
83
+
84
+ - Bugs
85
+
86
+ UTR #15 suggests that the look up for Normalization Form C
87
+ should not be implemented with a hash of string for better
88
+ performance.
89
+
90
+ Case conversion functions should reflect UTR #21.
91
+
92
+
93
+ - Copying
94
+
95
+ This extension module is copyrighted free software by
96
+ Yoshida Masato.
97
+
98
+ You can redistribute it and/or modify it under the same
99
+ terms as Ruby.
100
+
101
+
102
+ - Author
103
+
104
+ Yoshida Masato <yoshidam@yoshidam.net>
105
+
106
+
107
+ - History
108
+
109
+ Feb 26, 2010 version 0.3.0 fix a capitalize bug and support SpecialCasing
110
+ Dec 29, 2009 version 0.2.0 update for Ruby 1.9.1 and Unicode 5.2
111
+ Sep 10, 2005 version 0.1.2 update unidata.map for Unicode 4.1.0
112
+ Aug 26, 2004 version 0.1.1 update unidata.map for Unicode 4.0.1
113
+ Nov 23, 1999 version 0.1
data/Rakefile ADDED
@@ -0,0 +1,16 @@
1
# Build tasks for the unicode gem: packaging plus (cross-)compilation of
# the native extension via rake-compiler.
require "rake/clean"
require "rake/extensiontask"
require "rubygems/package_task"

# Directories removed by `rake clean`.
CLEAN << "pkg" << "tmp"

gem_spec = eval(File.read(File.expand_path("../unicode.gemspec", __FILE__)))

# rubygems/package_task defines Gem::PackageTask; Rake::GemPackageTask
# lives in the separate (deprecated) rake/gempackagetask file, which is
# not required here.
Gem::PackageTask.new(gem_spec) {|pkg|}

Rake::ExtensionTask.new('unicode_native', gem_spec) do |ext|
  ext.cross_compile = true
  ext.cross_platform = ['x86-mingw32', 'x86-mswin32-60']
  ext.ext_dir = "ext/unicode"
  ext.lib_dir = "lib/unicode"
end
@@ -0,0 +1,3 @@
1
# Generate the Makefile that builds the native extension, installed as
# unicode/unicode_native.
require "mkmf"

create_makefile "unicode/unicode_native"
@@ -0,0 +1,789 @@
1
+ /*
2
+ * Unicode Library version 0.3
3
+ * Feb 26, 2010: version 0.3
4
+ * Dec 29, 2009: version 0.2
5
+ * Nov 23, 1999 yoshidam
6
+ *
7
+ */
8
+
9
+ #include "ruby.h"
10
+ #ifdef HAVE_RUBY_IO_H
11
+ # include "ruby/io.h"
12
+ #else
13
+ # include "rubyio.h"
14
+ #endif
15
+ #include <stdio.h>
16
+ #include "wstring.h"
17
+ #include "unidata.map"
18
+
19
+ #ifndef RSTRING_PTR
20
+ # define RSTRING_PTR(s) (RSTRING(s)->ptr)
21
+ # define RSTRING_LEN(s) (RSTRING(s)->len)
22
+ #endif
23
+
24
+ #ifdef HAVE_RUBY_ENCODING_H
25
+ static rb_encoding* enc_out;
26
+ # define ENC_(o) (rb_enc_associate(o, enc_out))
27
+ #else
28
+ # define ENC_(o) (o)
29
+ #endif
30
+
31
+ inline static VALUE
32
+ taintObject(VALUE src, VALUE obj) {
33
+ if (OBJ_TAINTED(src))
34
+ OBJ_TAINT(obj);
35
+ return obj;
36
+ }
37
+ #define TO_(src, obj) (taintObject(src, obj))
38
+
39
#ifdef HAVE_RUBY_ENCODING_H
/*
 * Reassign str to a copy encoded in enc_out unless its encoding index
 * is already the UTF-8 or US-ASCII one.  str must be an lvalue.
 */
# define CONVERT_TO_UTF8(str) do { \
    int encindex = ENCODING_GET(str); \
    if (encindex != rb_utf8_encindex() && \
        encindex != rb_usascii_encindex()) { \
      volatile VALUE encobj = rb_enc_from_encoding(enc_out); \
      str = rb_str_encode(str, encobj, 0, Qnil); \
    } \
  } while (0)
#endif
50
+
51
+ static VALUE mUnicode;
52
+ static VALUE unicode_data;
53
+ static VALUE composition_table;
54
+
55
+ /* Hangul */
56
+ #define SBASE (0xac00)
57
+ #define LBASE (0x1100)
58
+ #define LCOUNT (19)
59
+ #define VBASE (0x1161)
60
+ #define VCOUNT (21)
61
+ #define TBASE (0x11a7)
62
+ #define TCOUNT (28)
63
+ #define NCOUNT (VCOUNT * TCOUNT) /* 588 */
64
+ #define SCOUNT (LCOUNT * NCOUNT) /* 11172 */
65
+
66
+ static int
67
+ get_cc(int ucs)
68
+ {
69
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
70
+
71
+ if (!NIL_P(ch)) {
72
+ return unidata[FIX2INT(ch)].combining_class;
73
+ }
74
+ return 0;
75
+ }
76
+
77
+ static const char*
78
+ get_canon(int ucs)
79
+ {
80
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
81
+
82
+ if (!NIL_P(ch)) {
83
+ return unidata[FIX2INT(ch)].canon;
84
+ }
85
+ return NULL;
86
+ }
87
+
88
+ static const char*
89
+ get_compat(int ucs)
90
+ {
91
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
92
+
93
+ if (!NIL_P(ch)) {
94
+ return unidata[FIX2INT(ch)].compat;
95
+ }
96
+ return NULL;
97
+ }
98
+
99
+ static const char*
100
+ get_uppercase(int ucs)
101
+ {
102
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
103
+
104
+ if (!NIL_P(ch)) {
105
+ return unidata[FIX2INT(ch)].uppercase;
106
+ }
107
+ return NULL;
108
+ }
109
+
110
+ static const char*
111
+ get_lowercase(int ucs)
112
+ {
113
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
114
+
115
+ if (!NIL_P(ch)) {
116
+ return unidata[FIX2INT(ch)].lowercase;
117
+ }
118
+ return NULL;
119
+ }
120
+
121
+ static const char*
122
+ get_titlecase(int ucs)
123
+ {
124
+ VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
125
+
126
+ if (!NIL_P(ch)) {
127
+ return unidata[FIX2INT(ch)].titlecase;
128
+ }
129
+ return NULL;
130
+ }
131
+
132
+ static int
133
+ get_composition(const char* str)
134
+ {
135
+ VALUE ch = rb_hash_aref(composition_table, rb_str_new2(str));
136
+
137
+ if (!NIL_P(ch)) {
138
+ return FIX2INT(ch);
139
+ }
140
+ return -1;
141
+ }
142
+
143
+ static WString*
144
+ sort_canonical(WString* ustr)
145
+ {
146
+ int i = 1;
147
+ int len = ustr->len;
148
+
149
+ if (len < 2) return ustr;
150
+
151
+ while (i < len) {
152
+ int last = ustr->str[i - 1];
153
+ int ch = ustr->str[i];
154
+ int last_cc = get_cc(last);
155
+ int cc = get_cc(ch);
156
+ if (cc != 0 && last_cc != 0 && last_cc > cc) {
157
+ ustr->str[i] = last;
158
+ ustr->str[i-1] = ch;
159
+ if (i > 1) i--;
160
+ }
161
+ else {
162
+ i++;
163
+ }
164
+ }
165
+ return ustr;
166
+ }
167
+
168
+ static void
169
+ decompose_hangul(int ucs, int* l, int* v, int* t)
170
+ {
171
+ int sindex = ucs - SBASE;
172
+ if (sindex < 0 || sindex >= SCOUNT) {
173
+ *l = ucs;
174
+ *v = *t = 0;
175
+ return;
176
+ }
177
+ *l = LBASE + sindex / NCOUNT;
178
+ *v = VBASE + (sindex % NCOUNT) / TCOUNT;
179
+ *t = TBASE + sindex % TCOUNT;
180
+ if (*t == TBASE) *t = 0;
181
+ }
182
+
183
+ /*
184
+ * push decomposed str into result
185
+ */
186
+ static WString*
187
+ decompose_internal(WString* ustr, WString* result)
188
+ {
189
+ int i;
190
+ int len = ustr->len;
191
+
192
+ for (i = 0; i < len; i++) {
193
+ int ucs = ustr->str[i];
194
+ if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
195
+ int l, v, t;
196
+ decompose_hangul(ucs, &l, &v, &t);
197
+ WStr_addWChar(result, l);
198
+ if (v) WStr_addWChar(result, v);
199
+ if (t) WStr_addWChar(result, t);
200
+ }
201
+ else {
202
+ const char* dc = get_canon(ucs);
203
+ if (!dc) {
204
+ WStr_addWChar(result, ucs);
205
+ }
206
+ else {
207
+ WString wdc;
208
+ WStr_allocWithUTF8(&wdc, dc);
209
+ decompose_internal(&wdc, result);
210
+ WStr_free(&wdc);
211
+ }
212
+ }
213
+ }
214
+ return result;
215
+ }
216
+
217
+ /*
218
+ * push compatibility decomposed str into result
219
+ */
220
+ static WString*
221
+ decompose_compat_internal(WString* ustr, WString* result)
222
+ {
223
+ int i;
224
+ int len = ustr->len;
225
+
226
+ for (i = 0; i < len; i++) {
227
+ int ucs = ustr->str[i];
228
+ if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
229
+ int l, v, t;
230
+ decompose_hangul(ucs, &l, &v, &t);
231
+ WStr_addWChar(result, l);
232
+ if (v) WStr_addWChar(result, v);
233
+ if (t) WStr_addWChar(result, t);
234
+ }
235
+ else {
236
+ const char* dc = get_compat(ucs);
237
+ if (!dc) {
238
+ WStr_addWChar(result, ucs);
239
+ }
240
+ else {
241
+ WString wdc;
242
+ WStr_allocWithUTF8(&wdc, dc);
243
+ decompose_compat_internal(&wdc, result);
244
+ WStr_free(&wdc);
245
+ }
246
+ }
247
+ }
248
+ return result;
249
+ }
250
+
251
+
252
/*
 * Encode the code point c as UTF-8 (1 to 6 bytes, original RFC 2279
 * ranges) at *p and advance p past the bytes written.
 *
 *   p: modifiable char pointer lvalue with room for up to 6 bytes.
 *   c: code point; evaluated exactly once, so any expression is safe.
 *
 * Values of 0x80000000 and above write nothing and leave p untouched.
 */
#define UCS4toUTF8(p, c) \
  do { \
    unsigned long ucs4_ = (unsigned long)(c); \
    if (ucs4_ < 0x80UL) { \
      *(p)++ = (char)ucs4_; \
    } \
    else if (ucs4_ < 0x800UL) { \
      *(p)++ = (char)((ucs4_ >> 6) | 192); \
      *(p)++ = (char)((ucs4_ & 63) | 128); \
    } \
    else if (ucs4_ < 0x10000UL) { \
      *(p)++ = (char)((ucs4_ >> 12) | 224); \
      *(p)++ = (char)(((ucs4_ >> 6) & 63) | 128); \
      *(p)++ = (char)((ucs4_ & 63) | 128); \
    } \
    else if (ucs4_ < 0x200000UL) { \
      *(p)++ = (char)((ucs4_ >> 18) | 240); \
      *(p)++ = (char)(((ucs4_ >> 12) & 63) | 128); \
      *(p)++ = (char)(((ucs4_ >> 6) & 63) | 128); \
      *(p)++ = (char)((ucs4_ & 63) | 128); \
    } \
    else if (ucs4_ < 0x4000000UL) { \
      *(p)++ = (char)((ucs4_ >> 24) | 248); \
      *(p)++ = (char)(((ucs4_ >> 18) & 63) | 128); \
      *(p)++ = (char)(((ucs4_ >> 12) & 63) | 128); \
      *(p)++ = (char)(((ucs4_ >> 6) & 63) | 128); \
      *(p)++ = (char)((ucs4_ & 63) | 128); \
    } \
    else if (ucs4_ < 0x80000000UL) { \
      *(p)++ = (char)((ucs4_ >> 30) | 252); \
      *(p)++ = (char)(((ucs4_ >> 24) & 63) | 128); \
      *(p)++ = (char)(((ucs4_ >> 18) & 63) | 128); \
      *(p)++ = (char)(((ucs4_ >> 12) & 63) | 128); \
      *(p)++ = (char)(((ucs4_ >> 6) & 63) | 128); \
      *(p)++ = (char)((ucs4_ & 63) | 128); \
    } \
  } while (0)
288
+
289
+ static int
290
+ compose_pair(unsigned int c1, unsigned int c2)
291
+ {
292
+ int ret;
293
+ char ustr[13]; /* stored two UTF-8 chars */
294
+ char *p = ustr;
295
+
296
+ /* Hangul L + V */
297
+ if (c1 >= LBASE && c1 < LBASE + LCOUNT &&
298
+ c2 >= VBASE && c2 < VBASE + VCOUNT) {
299
+ return SBASE + ((c1 - LBASE) * VCOUNT + (c2 - VBASE)) * TCOUNT;
300
+ }
301
+ /* Hangul LV + T */
302
+ else if (c1 >= SBASE && c1 < SBASE + SCOUNT &&
303
+ (c1 - SBASE) % TCOUNT == 0 &&
304
+ c2 >= TBASE && c2 < TBASE + TCOUNT) {
305
+ return c1 + (c2 - TBASE);
306
+ }
307
+ UCS4toUTF8(p, c1);
308
+ UCS4toUTF8(p, c2);
309
+ *p = '\0';
310
+ ret = get_composition(ustr);
311
+
312
+ return ret;
313
+ }
314
+
315
+ /*
316
+ * push canonical composed str into result
317
+ */
318
+ static WString*
319
+ compose_internal(WString* ustr, WString* result)
320
+ {
321
+ int len = ustr->len;
322
+ int starter;
323
+ int startercc;
324
+ int i;
325
+
326
+ if (len == 0) return result;
327
+
328
+ starter = ustr->str[0];
329
+ startercc = get_cc(starter);
330
+ if (startercc != 0) startercc = 256;
331
+ for (i = 1; i < len; i++) {
332
+ int ch = ustr->str[i];
333
+ int cc = get_cc(ch);
334
+ int composite;
335
+
336
+ if (startercc == 0 &&
337
+ (composite = compose_pair(starter, ch)) >= 0) {
338
+ starter = composite;
339
+ startercc = get_cc(composite);
340
+ }
341
+ else {
342
+ WStr_addWChar(result, starter);
343
+ starter = ch;
344
+ startercc = cc;
345
+ }
346
+ }
347
+ WStr_addWChar(result, starter);
348
+
349
+ return result;
350
+ }
351
+
352
+ static WString*
353
+ upcase_internal(WString* str, WString* result)
354
+ {
355
+ int i;
356
+ int len = str->len;
357
+
358
+ for (i = 0; i < len; i++) {
359
+ int ucs = str->str[i];
360
+ const char* c = get_uppercase(ucs);
361
+ if (!c) {
362
+ WStr_addWChar(result, ucs);
363
+ }
364
+ else {
365
+ WString wc;
366
+ WStr_allocWithUTF8(&wc, c);
367
+ WStr_pushWString(result, &wc);
368
+ WStr_free(&wc);
369
+ }
370
+ }
371
+ return result;
372
+ }
373
+
374
+ static WString*
375
+ downcase_internal(WString* str, WString* result)
376
+ {
377
+ int i;
378
+ int len = str->len;
379
+
380
+ for (i = 0; i < len; i++) {
381
+ int ucs = str->str[i];
382
+ const char* c = get_lowercase(ucs);
383
+ if (!c) {
384
+ WStr_addWChar(result, ucs);
385
+ }
386
+ else {
387
+ WString wc;
388
+ WStr_allocWithUTF8(&wc, c);
389
+ WStr_pushWString(result, &wc);
390
+ WStr_free(&wc);
391
+ }
392
+ }
393
+ return result;
394
+ }
395
+
396
+ static WString*
397
+ capitalize_internal(WString* str, WString* result)
398
+ {
399
+ int i;
400
+ int len = str->len;
401
+
402
+ if (len > 0) {
403
+ const char* c = get_titlecase(str->str[0]);
404
+ if (!c) {
405
+ WStr_addWChar(result, str->str[0]);
406
+ }
407
+ else {
408
+ WString wc;
409
+ WStr_allocWithUTF8(&wc, c);
410
+ WStr_pushWString(result, &wc);
411
+ WStr_free(&wc);
412
+ }
413
+ }
414
+ for (i = 1; i < len; i++) {
415
+ int ucs = str->str[i];
416
+ const char* c = get_lowercase(ucs);
417
+ if (!c) {
418
+ WStr_addWChar(result, ucs);
419
+ }
420
+ else {
421
+ WString wc;
422
+ WStr_allocWithUTF8(&wc, c);
423
+ WStr_pushWString(result, &wc);
424
+ WStr_free(&wc);
425
+ }
426
+ }
427
+ return result;
428
+ }
429
+
430
+ static VALUE
431
+ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
432
+ {
433
+ WString wstr1;
434
+ WString wstr2;
435
+ WString result1;
436
+ WString result2;
437
+ UString ustr1;
438
+ UString ustr2;
439
+ int ret;
440
+
441
+ Check_Type(str1, T_STRING);
442
+ Check_Type(str2, T_STRING);
443
+ #ifdef HAVE_RUBY_ENCODING_H
444
+ CONVERT_TO_UTF8(str1);
445
+ CONVERT_TO_UTF8(str2);
446
+ #endif
447
+ WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
448
+ WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
449
+ WStr_alloc(&result1);
450
+ WStr_alloc(&result2);
451
+ decompose_internal(&wstr1, &result1);
452
+ decompose_internal(&wstr2, &result2);
453
+ WStr_free(&wstr1);
454
+ WStr_free(&wstr2);
455
+ sort_canonical(&result1);
456
+ sort_canonical(&result2);
457
+ UniStr_alloc(&ustr1);
458
+ UniStr_alloc(&ustr2);
459
+ WStr_convertIntoUString(&result1, &ustr1);
460
+ WStr_convertIntoUString(&result2, &ustr2);
461
+ WStr_free(&result1);
462
+ WStr_free(&result2);
463
+ UniStr_addChar(&ustr1, '\0');
464
+ UniStr_addChar(&ustr2, '\0');
465
+ ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
466
+ UniStr_free(&ustr1);
467
+ UniStr_free(&ustr2);
468
+
469
+ return INT2FIX(ret);
470
+ }
471
+
472
+ static VALUE
473
+ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
474
+ {
475
+ WString wstr1;
476
+ WString wstr2;
477
+ WString result1;
478
+ WString result2;
479
+ UString ustr1;
480
+ UString ustr2;
481
+ int ret;
482
+
483
+ Check_Type(str1, T_STRING);
484
+ Check_Type(str2, T_STRING);
485
+ #ifdef HAVE_RUBY_ENCODING_H
486
+ CONVERT_TO_UTF8(str1);
487
+ CONVERT_TO_UTF8(str2);
488
+ #endif
489
+ WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
490
+ WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
491
+ WStr_alloc(&result1);
492
+ WStr_alloc(&result2);
493
+ decompose_compat_internal(&wstr1, &result1);
494
+ decompose_compat_internal(&wstr2, &result2);
495
+ WStr_free(&wstr1);
496
+ WStr_free(&wstr2);
497
+ sort_canonical(&result1);
498
+ sort_canonical(&result2);
499
+ UniStr_alloc(&ustr1);
500
+ UniStr_alloc(&ustr2);
501
+ WStr_convertIntoUString(&result1, &ustr1);
502
+ WStr_convertIntoUString(&result2, &ustr2);
503
+ WStr_free(&result1);
504
+ WStr_free(&result2);
505
+ UniStr_addChar(&ustr1, '\0');
506
+ UniStr_addChar(&ustr2, '\0');
507
+ ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
508
+ UniStr_free(&ustr1);
509
+ UniStr_free(&ustr2);
510
+
511
+ return INT2FIX(ret);
512
+ }
513
+
514
+ static VALUE
515
+ unicode_decompose(VALUE obj, VALUE str)
516
+ {
517
+ WString ustr;
518
+ WString result;
519
+ UString ret;
520
+ VALUE vret;
521
+
522
+ Check_Type(str, T_STRING);
523
+ #ifdef HAVE_RUBY_ENCODING_H
524
+ CONVERT_TO_UTF8(str);
525
+ #endif
526
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
527
+ WStr_alloc(&result);
528
+ decompose_internal(&ustr, &result);
529
+ WStr_free(&ustr);
530
+ sort_canonical(&result);
531
+ UniStr_alloc(&ret);
532
+ WStr_convertIntoUString(&result, &ret);
533
+ WStr_free(&result);
534
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
535
+ UniStr_free(&ret);
536
+
537
+ return vret;
538
+ }
539
+
540
+ static VALUE
541
+ unicode_decompose_compat(VALUE obj, VALUE str)
542
+ {
543
+ WString ustr;
544
+ WString result;
545
+ UString ret;
546
+ VALUE vret;
547
+
548
+ Check_Type(str, T_STRING);
549
+ #ifdef HAVE_RUBY_ENCODING_H
550
+ CONVERT_TO_UTF8(str);
551
+ #endif
552
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
553
+ WStr_alloc(&result);
554
+ decompose_compat_internal(&ustr, &result);
555
+ WStr_free(&ustr);
556
+ sort_canonical(&result);
557
+ UniStr_alloc(&ret);
558
+ WStr_convertIntoUString(&result, &ret);
559
+ WStr_free(&result);
560
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
561
+ UniStr_free(&ret);
562
+
563
+ return vret;
564
+ }
565
+
566
+ static VALUE
567
+ unicode_compose(VALUE obj, VALUE str)
568
+ {
569
+ WString ustr;
570
+ WString result;
571
+ UString ret;
572
+ VALUE vret;
573
+
574
+ Check_Type(str, T_STRING);
575
+ #ifdef HAVE_RUBY_ENCODING_H
576
+ CONVERT_TO_UTF8(str);
577
+ #endif
578
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
579
+ sort_canonical(&ustr);
580
+ WStr_alloc(&result);
581
+ compose_internal(&ustr, &result);
582
+ WStr_free(&ustr);
583
+ UniStr_alloc(&ret);
584
+ WStr_convertIntoUString(&result, &ret);
585
+ WStr_free(&result);
586
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
587
+ UniStr_free(&ret);
588
+
589
+ return vret;
590
+ }
591
+
592
+ static VALUE
593
+ unicode_normalize_C(VALUE obj, VALUE str)
594
+ {
595
+ WString ustr1;
596
+ WString ustr2;
597
+ WString result;
598
+ UString ret;
599
+ VALUE vret;
600
+
601
+ Check_Type(str, T_STRING);
602
+ #ifdef HAVE_RUBY_ENCODING_H
603
+ CONVERT_TO_UTF8(str);
604
+ #endif
605
+ WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
606
+ WStr_alloc(&ustr2);
607
+ decompose_internal(&ustr1, &ustr2);
608
+ WStr_free(&ustr1);
609
+ sort_canonical(&ustr2);
610
+ WStr_alloc(&result);
611
+ compose_internal(&ustr2, &result);
612
+ WStr_free(&ustr2);
613
+ UniStr_alloc(&ret);
614
+ WStr_convertIntoUString(&result, &ret);
615
+ WStr_free(&result);
616
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
617
+ UniStr_free(&ret);
618
+
619
+ return vret;
620
+ }
621
+
622
+ static VALUE
623
+ unicode_normalize_KC(VALUE obj, VALUE str)
624
+ {
625
+ WString ustr1;
626
+ WString ustr2;
627
+ WString result;
628
+ UString ret;
629
+ VALUE vret;
630
+
631
+ Check_Type(str, T_STRING);
632
+ #ifdef HAVE_RUBY_ENCODING_H
633
+ CONVERT_TO_UTF8(str);
634
+ #endif
635
+ WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
636
+ WStr_alloc(&ustr2);
637
+ decompose_compat_internal(&ustr1, &ustr2);
638
+ WStr_free(&ustr1);
639
+ sort_canonical(&ustr2);
640
+ WStr_alloc(&result);
641
+ compose_internal(&ustr2, &result);
642
+ WStr_free(&ustr2);
643
+ UniStr_alloc(&ret);
644
+ WStr_convertIntoUString(&result, &ret);
645
+ WStr_free(&result);
646
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
647
+ UniStr_free(&ret);
648
+
649
+ return vret;
650
+ }
651
+
652
+ static VALUE
653
+ unicode_upcase(VALUE obj, VALUE str)
654
+ {
655
+ WString ustr;
656
+ WString result;
657
+ UString ret;
658
+ VALUE vret;
659
+
660
+ Check_Type(str, T_STRING);
661
+ #ifdef HAVE_RUBY_ENCODING_H
662
+ CONVERT_TO_UTF8(str);
663
+ #endif
664
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
665
+ WStr_alloc(&result);
666
+ upcase_internal(&ustr, &result);
667
+ //sort_canonical(&result);
668
+ WStr_free(&ustr);
669
+ UniStr_alloc(&ret);
670
+ WStr_convertIntoUString(&result, &ret);
671
+ WStr_free(&result);
672
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
673
+ UniStr_free(&ret);
674
+
675
+ return vret;
676
+ }
677
+
678
+ static VALUE
679
+ unicode_downcase(VALUE obj, VALUE str)
680
+ {
681
+ WString ustr;
682
+ WString result;
683
+ UString ret;
684
+ VALUE vret;
685
+
686
+ Check_Type(str, T_STRING);
687
+ #ifdef HAVE_RUBY_ENCODING_H
688
+ CONVERT_TO_UTF8(str);
689
+ #endif
690
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
691
+ WStr_alloc(&result);
692
+ downcase_internal(&ustr, &result);
693
+ //sort_canonical(&result);
694
+ WStr_free(&ustr);
695
+ UniStr_alloc(&ret);
696
+ WStr_convertIntoUString(&result, &ret);
697
+ WStr_free(&result);
698
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
699
+ UniStr_free(&ret);
700
+
701
+ return vret;
702
+ }
703
+
704
+ #ifdef HAVE_RUBY_ENCODING_H
705
+
706
+
707
+ #endif
708
+
709
+ static VALUE
710
+ unicode_capitalize(VALUE obj, VALUE str)
711
+ {
712
+ WString ustr;
713
+ WString result;
714
+ UString ret;
715
+ VALUE vret;
716
+
717
+ Check_Type(str, T_STRING);
718
+ #ifdef HAVE_RUBY_ENCODING_H
719
+ CONVERT_TO_UTF8(str);
720
+ #endif
721
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
722
+ WStr_alloc(&result);
723
+ capitalize_internal(&ustr, &result);
724
+ //sort_canonical(&result);
725
+ WStr_free(&ustr);
726
+ UniStr_alloc(&ret);
727
+ WStr_convertIntoUString(&result, &ret);
728
+ WStr_free(&result);
729
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
730
+ UniStr_free(&ret);
731
+
732
+ return vret;
733
+ }
734
+
735
+ void
736
+ Init_unicode_native()
737
+ {
738
+ int i;
739
+
740
+ #ifdef HAVE_RUBY_ENCODING_H
741
+ enc_out = rb_utf8_encoding();
742
+ #endif
743
+
744
+ mUnicode = rb_define_module("Unicode");
745
+ unicode_data = rb_hash_new();
746
+ composition_table = rb_hash_new();
747
+
748
+ rb_global_variable(&unicode_data);
749
+ rb_global_variable(&composition_table);
750
+
751
+ for (i = 0; unidata[i].code != -1; i++) {
752
+ int code = unidata[i].code;
753
+ const char* canon = unidata[i].canon;
754
+ int exclusion = unidata[i].exclusion;
755
+
756
+ rb_hash_aset(unicode_data, INT2FIX(code), INT2FIX(i));
757
+ if (canon && exclusion == 0) {
758
+ rb_hash_aset(composition_table, rb_str_new2(canon), INT2FIX(code));
759
+ }
760
+ }
761
+
762
+ rb_define_module_function(mUnicode, "strcmp",
763
+ unicode_strcmp, 2);
764
+ rb_define_module_function(mUnicode, "strcmp_compat",
765
+ unicode_strcmp_compat, 2);
766
+
767
+ rb_define_module_function(mUnicode, "decompose",
768
+ unicode_decompose, 1);
769
+ rb_define_module_function(mUnicode, "decompose_compat",
770
+ unicode_decompose_compat, 1);
771
+ rb_define_module_function(mUnicode, "compose",
772
+ unicode_compose, 1);
773
+
774
+ rb_define_module_function(mUnicode, "normalize_D",
775
+ unicode_decompose, 1);
776
+ rb_define_module_function(mUnicode, "normalize_KD",
777
+ unicode_decompose_compat, 1);
778
+ rb_define_module_function(mUnicode, "normalize_C",
779
+ unicode_normalize_C, 1);
780
+ rb_define_module_function(mUnicode, "normalize_KC",
781
+ unicode_normalize_KC, 1);
782
+
783
+ rb_define_module_function(mUnicode, "upcase",
784
+ unicode_upcase, 1);
785
+ rb_define_module_function(mUnicode, "downcase",
786
+ unicode_downcase, 1);
787
+ rb_define_module_function(mUnicode, "capitalize",
788
+ unicode_capitalize, 1);
789
+ }