unicode 0.3.1-x86-mingw32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +113 -0
- data/Rakefile +16 -0
- data/ext/unicode/extconf.rb +3 -0
- data/ext/unicode/unicode.c +789 -0
- data/ext/unicode/unidata.map +21854 -0
- data/ext/unicode/ustring.c +208 -0
- data/ext/unicode/ustring.h +48 -0
- data/ext/unicode/wstring.c +189 -0
- data/ext/unicode/wstring.h +41 -0
- data/lib/unicode.rb +6 -0
- data/lib/unicode/unicode_native.so +0 -0
- data/test/test.rb +69 -0
- data/tools/README +6 -0
- data/tools/mkunidata.rb +169 -0
- data/unicode.gemspec +13 -0
- metadata +81 -0
data/README
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
Unicode Library for Ruby
|
2
|
+
Version 0.3.0
|
3
|
+
|
4
|
+
Yoshida Masato
|
5
|
+
|
6
|
+
|
7
|
+
- Introduction
|
8
|
+
|
9
|
+
Unicode string manipulation library for Ruby.
|
10
|
+
This library is based on UTR #15 Unicode Normalization Forms(*1).
|
11
|
+
|
12
|
+
*1 <URL:http://www.unicode.org/unicode/reports/tr15/>
|
13
|
+
|
14
|
+
|
15
|
+
- Install
|
16
|
+
|
17
|
+
This works with ruby-1.8 or later. I recommend that you
|
18
|
+
use ruby-1.8.1 or later.
|
19
|
+
|
20
|
+
Build and install in the usual way.
|
21
|
+
For example, when Ruby supports dynamic linking on your OS,
|
22
|
+
|
23
|
+
ruby extconf.rb
|
24
|
+
make
|
25
|
+
make install
|
26
|
+
|
27
|
+
|
28
|
+
- Usage
|
29
|
+
|
30
|
+
If you do not link this module with Ruby statically,
|
31
|
+
|
32
|
+
require "unicode"
|
33
|
+
|
34
|
+
before using.
|
35
|
+
|
36
|
+
|
37
|
+
- Module Functions
|
38
|
+
|
39
|
+
All parameters of functions must be UTF-8 strings.
|
40
|
+
|
41
|
+
Unicode::strcmp(str1, str2)
|
42
|
+
Unicode::strcmp_compat(str1, str2)
|
43
|
+
Compare Unicode strings with a normalization.
|
44
|
+
strcmp uses the Normalization Form D, strcmp_compat uses
|
45
|
+
Normalization Form KD.
|
46
|
+
|
47
|
+
Unicode::decompose(str)
|
48
|
+
Unicode::decompose_compat(str)
|
49
|
+
Decompose Unicode string. Then the trailing characters
|
50
|
+
are sorted in canonical order.
|
51
|
+
decompose uses the canonical decomposition,
|
52
|
+
decompose_compat uses the compatibility decomposition.
|
53
|
+
The decomposition is based on the character decomposition
|
54
|
+
mapping in UnicodeData.txt and the Hangul decomposition
|
55
|
+
algorithm.
|
56
|
+
|
57
|
+
Unicode::compose(str)
|
58
|
+
Compose Unicode string. Before composing, the trailing
|
59
|
+
characters are sorted in canonical order.
|
60
|
+
The parameter must be decomposed.
|
61
|
+
The composition is based on the reverse of the
|
62
|
+
character decomposition mapping in UnicodeData.txt,
|
63
|
+
CompositionExclusions.txt and the Hangul composition
|
64
|
+
algorithm.
|
65
|
+
|
66
|
+
Unicode::normalize_D(str)
|
67
|
+
Unicode::normalize_KD(str)
|
68
|
+
Normalize Unicode string in form D or form KD.
|
69
|
+
These are aliases of decompose/decompose_compat.
|
70
|
+
|
71
|
+
Unicode::normalize_C(str)
|
72
|
+
Unicode::normalize_KC(str)
|
73
|
+
Normalize Unicode string in form C or form KC.
|
74
|
+
normalize_C = decompose + compose
|
75
|
+
normalize_KC = decompose_compat + compose
|
76
|
+
|
77
|
+
Unicode::upcase(str)
|
78
|
+
Unicode::downcase(str)
|
79
|
+
Unicode::capitalize(str)
|
80
|
+
Case conversion functions.
|
81
|
+
The mappings that are used by these functions are not normative
|
82
|
+
in UnicodeData.txt.
|
83
|
+
|
84
|
+
- Bugs
|
85
|
+
|
86
|
+
UTR #15 suggests that the look up for Normalization Form C
|
87
|
+
should not be implemented with a hash of string for better
|
88
|
+
performance.
|
89
|
+
|
90
|
+
Case conversion functions should reflect UTR #21.
|
91
|
+
|
92
|
+
|
93
|
+
- Copying
|
94
|
+
|
95
|
+
This extension module is copyrighted free software by
|
96
|
+
Yoshida Masato.
|
97
|
+
|
98
|
+
You can redistribute it and/or modify it under the same
|
99
|
+
terms as Ruby.
|
100
|
+
|
101
|
+
|
102
|
+
- Author
|
103
|
+
|
104
|
+
Yoshida Masato <yoshidam@yoshidam.net>
|
105
|
+
|
106
|
+
|
107
|
+
- History
|
108
|
+
|
109
|
+
Feb 26, 2010 version 0.3.0 fix a capitalize bug and support SpecialCasing
|
110
|
+
Dec 29, 2009 version 0.2.0 update for Ruby 1.9.1 and Unicode 5.2
|
111
|
+
Sep 10, 2005 version 0.1.2 update unidata.map for Unicode 4.1.0
|
112
|
+
Aug 26, 2004 version 0.1.1 update unidata.map for Unicode 4.0.1
|
113
|
+
Nov 23, 1999 version 0.1
|
data/Rakefile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# Build script for the unicode gem: packaging plus (cross-)compilation of
# the native extension.
require "rake/clean"
require "rake/extensiontask"
require "rubygems/package_task"

# `rake clean` also removes the packaging and build directories.
CLEAN << "pkg" << "tmp"

# The gemspec file evaluates to a Gem::Specification object.
gem_spec = eval(File.read(File.expand_path("../unicode.gemspec", __FILE__)))

# "rubygems/package_task" defines Gem::PackageTask; Rake::GemPackageTask
# only exists when the deprecated "rake/gempackagetask" has been loaded.
Gem::PackageTask.new(gem_spec) {|pkg|}

# Native extension task: sources live in ext/unicode and the compiled
# library is placed in lib/unicode.
Rake::ExtensionTask.new('unicode_native', gem_spec) do |ext|
  ext.cross_compile = true
  ext.cross_platform = ['x86-mingw32', 'x86-mswin32-60']
  ext.ext_dir = "ext/unicode"
  ext.lib_dir = "lib/unicode"
end
|
@@ -0,0 +1,789 @@
|
|
1
|
+
/*
|
2
|
+
* Unicode Library version 0.3
|
3
|
+
* Feb 26, 2010: version 0.3
|
4
|
+
* Dec 29, 2009: version 0.2
|
5
|
+
* Nov 23, 1999 yoshidam
|
6
|
+
*
|
7
|
+
*/
|
8
|
+
|
9
|
+
#include "ruby.h"
|
10
|
+
#ifdef HAVE_RUBY_IO_H
|
11
|
+
# include "ruby/io.h"
|
12
|
+
#else
|
13
|
+
# include "rubyio.h"
|
14
|
+
#endif
|
15
|
+
#include <stdio.h>
|
16
|
+
#include "wstring.h"
|
17
|
+
#include "unidata.map"
|
18
|
+
|
19
|
+
#ifndef RSTRING_PTR
|
20
|
+
# define RSTRING_PTR(s) (RSTRING(s)->ptr)
|
21
|
+
# define RSTRING_LEN(s) (RSTRING(s)->len)
|
22
|
+
#endif
|
23
|
+
|
24
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
25
|
+
static rb_encoding* enc_out;
|
26
|
+
# define ENC_(o) (rb_enc_associate(o, enc_out))
|
27
|
+
#else
|
28
|
+
# define ENC_(o) (o)
|
29
|
+
#endif
|
30
|
+
|
31
|
+
inline static VALUE
|
32
|
+
taintObject(VALUE src, VALUE obj) {
|
33
|
+
if (OBJ_TAINTED(src))
|
34
|
+
OBJ_TAINT(obj);
|
35
|
+
return obj;
|
36
|
+
}
|
37
|
+
#define TO_(src, obj) (taintObject(src, obj))
|
38
|
+
|
39
|
+
#ifdef HAVE_RUBY_ENCODING_H
/*
 * Re-encode `str` to the encoding held in enc_out unless its encoding
 * index is already UTF-8 or US-ASCII. The macro assigns the converted
 * string back to `str`, so the argument must be a modifiable VALUE
 * variable; it is evaluated more than once.
 */
# define CONVERT_TO_UTF8(str) do { \
    const int cur_idx_ = ENCODING_GET(str); \
    volatile VALUE target_enc_; \
    if (!(cur_idx_ == rb_utf8_encindex() || \
          cur_idx_ == rb_usascii_encindex())) { \
      target_enc_ = rb_enc_from_encoding(enc_out); \
      str = rb_str_encode(str, target_enc_, 0, Qnil); \
    } \
  } while (0)
#endif
|
50
|
+
|
51
|
+
static VALUE mUnicode;
|
52
|
+
static VALUE unicode_data;
|
53
|
+
static VALUE composition_table;
|
54
|
+
|
55
|
+
/*
 * Hangul constants used by the arithmetic syllable decomposition and
 * composition in this file. TBASE sits one below the first trailing
 * consonant, so a trailing offset of 0 means "no trailing consonant".
 */
#define SBASE  (0xAC00)           /* first precomposed syllable */
#define LBASE  (0x1100)           /* first leading consonant */
#define LCOUNT (19)               /* number of leading consonants */
#define VBASE  (0x1161)           /* first vowel */
#define VCOUNT (21)               /* number of vowels */
#define TBASE  (0x11A7)           /* trailing consonant base (see above) */
#define TCOUNT (28)               /* trailing slots, including "none" */
#define NCOUNT (VCOUNT * TCOUNT)  /* 588 */
#define SCOUNT (LCOUNT * NCOUNT)  /* 11172 */
|
65
|
+
|
66
|
+
static int
|
67
|
+
get_cc(int ucs)
|
68
|
+
{
|
69
|
+
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
70
|
+
|
71
|
+
if (!NIL_P(ch)) {
|
72
|
+
return unidata[FIX2INT(ch)].combining_class;
|
73
|
+
}
|
74
|
+
return 0;
|
75
|
+
}
|
76
|
+
|
77
|
+
static const char*
|
78
|
+
get_canon(int ucs)
|
79
|
+
{
|
80
|
+
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
81
|
+
|
82
|
+
if (!NIL_P(ch)) {
|
83
|
+
return unidata[FIX2INT(ch)].canon;
|
84
|
+
}
|
85
|
+
return NULL;
|
86
|
+
}
|
87
|
+
|
88
|
+
static const char*
|
89
|
+
get_compat(int ucs)
|
90
|
+
{
|
91
|
+
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
92
|
+
|
93
|
+
if (!NIL_P(ch)) {
|
94
|
+
return unidata[FIX2INT(ch)].compat;
|
95
|
+
}
|
96
|
+
return NULL;
|
97
|
+
}
|
98
|
+
|
99
|
+
static const char*
|
100
|
+
get_uppercase(int ucs)
|
101
|
+
{
|
102
|
+
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
103
|
+
|
104
|
+
if (!NIL_P(ch)) {
|
105
|
+
return unidata[FIX2INT(ch)].uppercase;
|
106
|
+
}
|
107
|
+
return NULL;
|
108
|
+
}
|
109
|
+
|
110
|
+
static const char*
|
111
|
+
get_lowercase(int ucs)
|
112
|
+
{
|
113
|
+
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
114
|
+
|
115
|
+
if (!NIL_P(ch)) {
|
116
|
+
return unidata[FIX2INT(ch)].lowercase;
|
117
|
+
}
|
118
|
+
return NULL;
|
119
|
+
}
|
120
|
+
|
121
|
+
static const char*
|
122
|
+
get_titlecase(int ucs)
|
123
|
+
{
|
124
|
+
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
125
|
+
|
126
|
+
if (!NIL_P(ch)) {
|
127
|
+
return unidata[FIX2INT(ch)].titlecase;
|
128
|
+
}
|
129
|
+
return NULL;
|
130
|
+
}
|
131
|
+
|
132
|
+
static int
|
133
|
+
get_composition(const char* str)
|
134
|
+
{
|
135
|
+
VALUE ch = rb_hash_aref(composition_table, rb_str_new2(str));
|
136
|
+
|
137
|
+
if (!NIL_P(ch)) {
|
138
|
+
return FIX2INT(ch);
|
139
|
+
}
|
140
|
+
return -1;
|
141
|
+
}
|
142
|
+
|
143
|
+
/*
 * Reorder `ustr` in place so that adjacent code points with non-zero
 * combining classes appear in ascending class order, and return `ustr`.
 *
 * A pair is swapped only when both combining classes are non-zero and the
 * earlier one is larger. After a swap the scan steps back one position
 * (never below index 1) so the moved code point is compared with its new
 * predecessor; otherwise the scan moves forward.
 */
static WString*
sort_canonical(WString* ustr)
{
  const int count = ustr->len;
  int pos = 1;

  if (count < 2) {
    return ustr;
  }

  while (pos < count) {
    const int prev = ustr->str[pos - 1];
    const int cur = ustr->str[pos];
    const int prev_cc = get_cc(prev);
    const int cur_cc = get_cc(cur);

    if (cur_cc == 0 || prev_cc == 0 || prev_cc <= cur_cc) {
      /* already in order, or a class-0 code point is involved */
      pos++;
      continue;
    }

    ustr->str[pos - 1] = cur;
    ustr->str[pos] = prev;
    if (pos > 1) {
      pos--;
    }
  }
  return ustr;
}
|
167
|
+
|
168
|
+
static void
|
169
|
+
decompose_hangul(int ucs, int* l, int* v, int* t)
|
170
|
+
{
|
171
|
+
int sindex = ucs - SBASE;
|
172
|
+
if (sindex < 0 || sindex >= SCOUNT) {
|
173
|
+
*l = ucs;
|
174
|
+
*v = *t = 0;
|
175
|
+
return;
|
176
|
+
}
|
177
|
+
*l = LBASE + sindex / NCOUNT;
|
178
|
+
*v = VBASE + (sindex % NCOUNT) / TCOUNT;
|
179
|
+
*t = TBASE + sindex % TCOUNT;
|
180
|
+
if (*t == TBASE) *t = 0;
|
181
|
+
}
|
182
|
+
|
183
|
+
/*
|
184
|
+
* push decomposed str into result
|
185
|
+
*/
|
186
|
+
static WString*
|
187
|
+
decompose_internal(WString* ustr, WString* result)
|
188
|
+
{
|
189
|
+
int i;
|
190
|
+
int len = ustr->len;
|
191
|
+
|
192
|
+
for (i = 0; i < len; i++) {
|
193
|
+
int ucs = ustr->str[i];
|
194
|
+
if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
|
195
|
+
int l, v, t;
|
196
|
+
decompose_hangul(ucs, &l, &v, &t);
|
197
|
+
WStr_addWChar(result, l);
|
198
|
+
if (v) WStr_addWChar(result, v);
|
199
|
+
if (t) WStr_addWChar(result, t);
|
200
|
+
}
|
201
|
+
else {
|
202
|
+
const char* dc = get_canon(ucs);
|
203
|
+
if (!dc) {
|
204
|
+
WStr_addWChar(result, ucs);
|
205
|
+
}
|
206
|
+
else {
|
207
|
+
WString wdc;
|
208
|
+
WStr_allocWithUTF8(&wdc, dc);
|
209
|
+
decompose_internal(&wdc, result);
|
210
|
+
WStr_free(&wdc);
|
211
|
+
}
|
212
|
+
}
|
213
|
+
}
|
214
|
+
return result;
|
215
|
+
}
|
216
|
+
|
217
|
+
/*
|
218
|
+
* push compatibility decomposed str into result
|
219
|
+
*/
|
220
|
+
static WString*
|
221
|
+
decompose_compat_internal(WString* ustr, WString* result)
|
222
|
+
{
|
223
|
+
int i;
|
224
|
+
int len = ustr->len;
|
225
|
+
|
226
|
+
for (i = 0; i < len; i++) {
|
227
|
+
int ucs = ustr->str[i];
|
228
|
+
if (ucs >= SBASE && ucs < SBASE + SCOUNT) {
|
229
|
+
int l, v, t;
|
230
|
+
decompose_hangul(ucs, &l, &v, &t);
|
231
|
+
WStr_addWChar(result, l);
|
232
|
+
if (v) WStr_addWChar(result, v);
|
233
|
+
if (t) WStr_addWChar(result, t);
|
234
|
+
}
|
235
|
+
else {
|
236
|
+
const char* dc = get_compat(ucs);
|
237
|
+
if (!dc) {
|
238
|
+
WStr_addWChar(result, ucs);
|
239
|
+
}
|
240
|
+
else {
|
241
|
+
WString wdc;
|
242
|
+
WStr_allocWithUTF8(&wdc, dc);
|
243
|
+
decompose_compat_internal(&wdc, result);
|
244
|
+
WStr_free(&wdc);
|
245
|
+
}
|
246
|
+
}
|
247
|
+
}
|
248
|
+
return result;
|
249
|
+
}
|
250
|
+
|
251
|
+
|
252
|
+
/*
 * Write the UTF-8 encoding of code point `c` at `p` and advance `p` past
 * the bytes written.
 *
 * p: modifiable byte pointer with room for up to 6 bytes.
 * c: code point; values below 0x80000000 are encoded using the 1- to
 *    6-byte forms, anything else writes nothing.
 *
 * Every use of an argument is parenthesized so that expression arguments
 * (for example `hi | lo`) are encoded correctly. Both arguments are still
 * evaluated several times and must therefore be free of side effects.
 */
#define UCS4toUTF8(p, c) \
  do { \
    if ((c) < 128) { \
      *(p)++ = (c); \
    } \
    else if ((c) < 2048) { \
      *(p)++ = ((c) >> 6) | 192; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x10000) { \
      *(p)++ = ((c) >> 12) | 224; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x200000) { \
      *(p)++ = ((c) >> 18) | 240; \
      *(p)++ = (((c) >> 12) & 63) | 128; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x4000000) { \
      *(p)++ = ((c) >> 24) | 248; \
      *(p)++ = (((c) >> 18) & 63) | 128; \
      *(p)++ = (((c) >> 12) & 63) | 128; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
    else if ((c) < 0x80000000) { \
      *(p)++ = ((c) >> 30) | 252; \
      *(p)++ = (((c) >> 24) & 63) | 128; \
      *(p)++ = (((c) >> 18) & 63) | 128; \
      *(p)++ = (((c) >> 12) & 63) | 128; \
      *(p)++ = (((c) >> 6) & 63) | 128; \
      *(p)++ = ((c) & 63) | 128; \
    } \
  } while (0)
|
288
|
+
|
289
|
+
static int
|
290
|
+
compose_pair(unsigned int c1, unsigned int c2)
|
291
|
+
{
|
292
|
+
int ret;
|
293
|
+
char ustr[13]; /* stored two UTF-8 chars */
|
294
|
+
char *p = ustr;
|
295
|
+
|
296
|
+
/* Hangul L + V */
|
297
|
+
if (c1 >= LBASE && c1 < LBASE + LCOUNT &&
|
298
|
+
c2 >= VBASE && c2 < VBASE + VCOUNT) {
|
299
|
+
return SBASE + ((c1 - LBASE) * VCOUNT + (c2 - VBASE)) * TCOUNT;
|
300
|
+
}
|
301
|
+
/* Hangul LV + T */
|
302
|
+
else if (c1 >= SBASE && c1 < SBASE + SCOUNT &&
|
303
|
+
(c1 - SBASE) % TCOUNT == 0 &&
|
304
|
+
c2 >= TBASE && c2 < TBASE + TCOUNT) {
|
305
|
+
return c1 + (c2 - TBASE);
|
306
|
+
}
|
307
|
+
UCS4toUTF8(p, c1);
|
308
|
+
UCS4toUTF8(p, c2);
|
309
|
+
*p = '\0';
|
310
|
+
ret = get_composition(ustr);
|
311
|
+
|
312
|
+
return ret;
|
313
|
+
}
|
314
|
+
|
315
|
+
/*
|
316
|
+
* push canonical composed str into result
|
317
|
+
*/
|
318
|
+
static WString*
|
319
|
+
compose_internal(WString* ustr, WString* result)
|
320
|
+
{
|
321
|
+
int len = ustr->len;
|
322
|
+
int starter;
|
323
|
+
int startercc;
|
324
|
+
int i;
|
325
|
+
|
326
|
+
if (len == 0) return result;
|
327
|
+
|
328
|
+
starter = ustr->str[0];
|
329
|
+
startercc = get_cc(starter);
|
330
|
+
if (startercc != 0) startercc = 256;
|
331
|
+
for (i = 1; i < len; i++) {
|
332
|
+
int ch = ustr->str[i];
|
333
|
+
int cc = get_cc(ch);
|
334
|
+
int composite;
|
335
|
+
|
336
|
+
if (startercc == 0 &&
|
337
|
+
(composite = compose_pair(starter, ch)) >= 0) {
|
338
|
+
starter = composite;
|
339
|
+
startercc = get_cc(composite);
|
340
|
+
}
|
341
|
+
else {
|
342
|
+
WStr_addWChar(result, starter);
|
343
|
+
starter = ch;
|
344
|
+
startercc = cc;
|
345
|
+
}
|
346
|
+
}
|
347
|
+
WStr_addWChar(result, starter);
|
348
|
+
|
349
|
+
return result;
|
350
|
+
}
|
351
|
+
|
352
|
+
/*
 * Append the upper-case form of `str` to `result` and return `result`.
 * A code point with an uppercase mapping is replaced by that mapping
 * (decoded from UTF-8, possibly several code points); any other code
 * point is copied unchanged.
 */
static WString*
upcase_internal(WString* str, WString* result)
{
  const int count = str->len;
  int pos;

  for (pos = 0; pos < count; pos++) {
    const int cp = str->str[pos];
    const char* mapped = get_uppercase(cp);

    if (mapped) {
      WString repl;

      WStr_allocWithUTF8(&repl, mapped);
      WStr_pushWString(result, &repl);
      WStr_free(&repl);
    }
    else {
      WStr_addWChar(result, cp);
    }
  }
  return result;
}
|
373
|
+
|
374
|
+
/*
 * Append the lower-case form of `str` to `result` and return `result`.
 * A code point with a lowercase mapping is replaced by that mapping
 * (decoded from UTF-8, possibly several code points); any other code
 * point is copied unchanged.
 */
static WString*
downcase_internal(WString* str, WString* result)
{
  const int count = str->len;
  int pos;

  for (pos = 0; pos < count; pos++) {
    const int cp = str->str[pos];
    const char* mapped = get_lowercase(cp);

    if (mapped) {
      WString repl;

      WStr_allocWithUTF8(&repl, mapped);
      WStr_pushWString(result, &repl);
      WStr_free(&repl);
    }
    else {
      WStr_addWChar(result, cp);
    }
  }
  return result;
}
|
395
|
+
|
396
|
+
/*
 * Append the capitalized form of `str` to `result` and return `result`.
 * The first code point is replaced by its titlecase mapping and every
 * later code point by its lowercase mapping; code points without the
 * respective mapping are copied unchanged.
 */
static WString*
capitalize_internal(WString* str, WString* result)
{
  const int count = str->len;
  int pos;

  for (pos = 0; pos < count; pos++) {
    const int cp = str->str[pos];
    /* titlecase for the leading code point, lowercase for the rest */
    const char* mapped = (pos == 0) ? get_titlecase(cp) : get_lowercase(cp);

    if (mapped) {
      WString repl;

      WStr_allocWithUTF8(&repl, mapped);
      WStr_pushWString(result, &repl);
      WStr_free(&repl);
    }
    else {
      WStr_addWChar(result, cp);
    }
  }
  return result;
}
|
429
|
+
|
430
|
+
static VALUE
|
431
|
+
unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
|
432
|
+
{
|
433
|
+
WString wstr1;
|
434
|
+
WString wstr2;
|
435
|
+
WString result1;
|
436
|
+
WString result2;
|
437
|
+
UString ustr1;
|
438
|
+
UString ustr2;
|
439
|
+
int ret;
|
440
|
+
|
441
|
+
Check_Type(str1, T_STRING);
|
442
|
+
Check_Type(str2, T_STRING);
|
443
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
444
|
+
CONVERT_TO_UTF8(str1);
|
445
|
+
CONVERT_TO_UTF8(str2);
|
446
|
+
#endif
|
447
|
+
WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
|
448
|
+
WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
|
449
|
+
WStr_alloc(&result1);
|
450
|
+
WStr_alloc(&result2);
|
451
|
+
decompose_internal(&wstr1, &result1);
|
452
|
+
decompose_internal(&wstr2, &result2);
|
453
|
+
WStr_free(&wstr1);
|
454
|
+
WStr_free(&wstr2);
|
455
|
+
sort_canonical(&result1);
|
456
|
+
sort_canonical(&result2);
|
457
|
+
UniStr_alloc(&ustr1);
|
458
|
+
UniStr_alloc(&ustr2);
|
459
|
+
WStr_convertIntoUString(&result1, &ustr1);
|
460
|
+
WStr_convertIntoUString(&result2, &ustr2);
|
461
|
+
WStr_free(&result1);
|
462
|
+
WStr_free(&result2);
|
463
|
+
UniStr_addChar(&ustr1, '\0');
|
464
|
+
UniStr_addChar(&ustr2, '\0');
|
465
|
+
ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
|
466
|
+
UniStr_free(&ustr1);
|
467
|
+
UniStr_free(&ustr2);
|
468
|
+
|
469
|
+
return INT2FIX(ret);
|
470
|
+
}
|
471
|
+
|
472
|
+
static VALUE
|
473
|
+
unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
|
474
|
+
{
|
475
|
+
WString wstr1;
|
476
|
+
WString wstr2;
|
477
|
+
WString result1;
|
478
|
+
WString result2;
|
479
|
+
UString ustr1;
|
480
|
+
UString ustr2;
|
481
|
+
int ret;
|
482
|
+
|
483
|
+
Check_Type(str1, T_STRING);
|
484
|
+
Check_Type(str2, T_STRING);
|
485
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
486
|
+
CONVERT_TO_UTF8(str1);
|
487
|
+
CONVERT_TO_UTF8(str2);
|
488
|
+
#endif
|
489
|
+
WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
|
490
|
+
WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
|
491
|
+
WStr_alloc(&result1);
|
492
|
+
WStr_alloc(&result2);
|
493
|
+
decompose_compat_internal(&wstr1, &result1);
|
494
|
+
decompose_compat_internal(&wstr2, &result2);
|
495
|
+
WStr_free(&wstr1);
|
496
|
+
WStr_free(&wstr2);
|
497
|
+
sort_canonical(&result1);
|
498
|
+
sort_canonical(&result2);
|
499
|
+
UniStr_alloc(&ustr1);
|
500
|
+
UniStr_alloc(&ustr2);
|
501
|
+
WStr_convertIntoUString(&result1, &ustr1);
|
502
|
+
WStr_convertIntoUString(&result2, &ustr2);
|
503
|
+
WStr_free(&result1);
|
504
|
+
WStr_free(&result2);
|
505
|
+
UniStr_addChar(&ustr1, '\0');
|
506
|
+
UniStr_addChar(&ustr2, '\0');
|
507
|
+
ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
|
508
|
+
UniStr_free(&ustr1);
|
509
|
+
UniStr_free(&ustr2);
|
510
|
+
|
511
|
+
return INT2FIX(ret);
|
512
|
+
}
|
513
|
+
|
514
|
+
static VALUE
|
515
|
+
unicode_decompose(VALUE obj, VALUE str)
|
516
|
+
{
|
517
|
+
WString ustr;
|
518
|
+
WString result;
|
519
|
+
UString ret;
|
520
|
+
VALUE vret;
|
521
|
+
|
522
|
+
Check_Type(str, T_STRING);
|
523
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
524
|
+
CONVERT_TO_UTF8(str);
|
525
|
+
#endif
|
526
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
527
|
+
WStr_alloc(&result);
|
528
|
+
decompose_internal(&ustr, &result);
|
529
|
+
WStr_free(&ustr);
|
530
|
+
sort_canonical(&result);
|
531
|
+
UniStr_alloc(&ret);
|
532
|
+
WStr_convertIntoUString(&result, &ret);
|
533
|
+
WStr_free(&result);
|
534
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
535
|
+
UniStr_free(&ret);
|
536
|
+
|
537
|
+
return vret;
|
538
|
+
}
|
539
|
+
|
540
|
+
static VALUE
|
541
|
+
unicode_decompose_compat(VALUE obj, VALUE str)
|
542
|
+
{
|
543
|
+
WString ustr;
|
544
|
+
WString result;
|
545
|
+
UString ret;
|
546
|
+
VALUE vret;
|
547
|
+
|
548
|
+
Check_Type(str, T_STRING);
|
549
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
550
|
+
CONVERT_TO_UTF8(str);
|
551
|
+
#endif
|
552
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
553
|
+
WStr_alloc(&result);
|
554
|
+
decompose_compat_internal(&ustr, &result);
|
555
|
+
WStr_free(&ustr);
|
556
|
+
sort_canonical(&result);
|
557
|
+
UniStr_alloc(&ret);
|
558
|
+
WStr_convertIntoUString(&result, &ret);
|
559
|
+
WStr_free(&result);
|
560
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
561
|
+
UniStr_free(&ret);
|
562
|
+
|
563
|
+
return vret;
|
564
|
+
}
|
565
|
+
|
566
|
+
static VALUE
|
567
|
+
unicode_compose(VALUE obj, VALUE str)
|
568
|
+
{
|
569
|
+
WString ustr;
|
570
|
+
WString result;
|
571
|
+
UString ret;
|
572
|
+
VALUE vret;
|
573
|
+
|
574
|
+
Check_Type(str, T_STRING);
|
575
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
576
|
+
CONVERT_TO_UTF8(str);
|
577
|
+
#endif
|
578
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
579
|
+
sort_canonical(&ustr);
|
580
|
+
WStr_alloc(&result);
|
581
|
+
compose_internal(&ustr, &result);
|
582
|
+
WStr_free(&ustr);
|
583
|
+
UniStr_alloc(&ret);
|
584
|
+
WStr_convertIntoUString(&result, &ret);
|
585
|
+
WStr_free(&result);
|
586
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
587
|
+
UniStr_free(&ret);
|
588
|
+
|
589
|
+
return vret;
|
590
|
+
}
|
591
|
+
|
592
|
+
static VALUE
|
593
|
+
unicode_normalize_C(VALUE obj, VALUE str)
|
594
|
+
{
|
595
|
+
WString ustr1;
|
596
|
+
WString ustr2;
|
597
|
+
WString result;
|
598
|
+
UString ret;
|
599
|
+
VALUE vret;
|
600
|
+
|
601
|
+
Check_Type(str, T_STRING);
|
602
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
603
|
+
CONVERT_TO_UTF8(str);
|
604
|
+
#endif
|
605
|
+
WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
|
606
|
+
WStr_alloc(&ustr2);
|
607
|
+
decompose_internal(&ustr1, &ustr2);
|
608
|
+
WStr_free(&ustr1);
|
609
|
+
sort_canonical(&ustr2);
|
610
|
+
WStr_alloc(&result);
|
611
|
+
compose_internal(&ustr2, &result);
|
612
|
+
WStr_free(&ustr2);
|
613
|
+
UniStr_alloc(&ret);
|
614
|
+
WStr_convertIntoUString(&result, &ret);
|
615
|
+
WStr_free(&result);
|
616
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
617
|
+
UniStr_free(&ret);
|
618
|
+
|
619
|
+
return vret;
|
620
|
+
}
|
621
|
+
|
622
|
+
static VALUE
|
623
|
+
unicode_normalize_KC(VALUE obj, VALUE str)
|
624
|
+
{
|
625
|
+
WString ustr1;
|
626
|
+
WString ustr2;
|
627
|
+
WString result;
|
628
|
+
UString ret;
|
629
|
+
VALUE vret;
|
630
|
+
|
631
|
+
Check_Type(str, T_STRING);
|
632
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
633
|
+
CONVERT_TO_UTF8(str);
|
634
|
+
#endif
|
635
|
+
WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
|
636
|
+
WStr_alloc(&ustr2);
|
637
|
+
decompose_compat_internal(&ustr1, &ustr2);
|
638
|
+
WStr_free(&ustr1);
|
639
|
+
sort_canonical(&ustr2);
|
640
|
+
WStr_alloc(&result);
|
641
|
+
compose_internal(&ustr2, &result);
|
642
|
+
WStr_free(&ustr2);
|
643
|
+
UniStr_alloc(&ret);
|
644
|
+
WStr_convertIntoUString(&result, &ret);
|
645
|
+
WStr_free(&result);
|
646
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
647
|
+
UniStr_free(&ret);
|
648
|
+
|
649
|
+
return vret;
|
650
|
+
}
|
651
|
+
|
652
|
+
static VALUE
|
653
|
+
unicode_upcase(VALUE obj, VALUE str)
|
654
|
+
{
|
655
|
+
WString ustr;
|
656
|
+
WString result;
|
657
|
+
UString ret;
|
658
|
+
VALUE vret;
|
659
|
+
|
660
|
+
Check_Type(str, T_STRING);
|
661
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
662
|
+
CONVERT_TO_UTF8(str);
|
663
|
+
#endif
|
664
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
665
|
+
WStr_alloc(&result);
|
666
|
+
upcase_internal(&ustr, &result);
|
667
|
+
//sort_canonical(&result);
|
668
|
+
WStr_free(&ustr);
|
669
|
+
UniStr_alloc(&ret);
|
670
|
+
WStr_convertIntoUString(&result, &ret);
|
671
|
+
WStr_free(&result);
|
672
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
673
|
+
UniStr_free(&ret);
|
674
|
+
|
675
|
+
return vret;
|
676
|
+
}
|
677
|
+
|
678
|
+
static VALUE
|
679
|
+
unicode_downcase(VALUE obj, VALUE str)
|
680
|
+
{
|
681
|
+
WString ustr;
|
682
|
+
WString result;
|
683
|
+
UString ret;
|
684
|
+
VALUE vret;
|
685
|
+
|
686
|
+
Check_Type(str, T_STRING);
|
687
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
688
|
+
CONVERT_TO_UTF8(str);
|
689
|
+
#endif
|
690
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
691
|
+
WStr_alloc(&result);
|
692
|
+
downcase_internal(&ustr, &result);
|
693
|
+
//sort_canonical(&result);
|
694
|
+
WStr_free(&ustr);
|
695
|
+
UniStr_alloc(&ret);
|
696
|
+
WStr_convertIntoUString(&result, &ret);
|
697
|
+
WStr_free(&result);
|
698
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
699
|
+
UniStr_free(&ret);
|
700
|
+
|
701
|
+
return vret;
|
702
|
+
}
|
703
|
+
|
704
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
705
|
+
|
706
|
+
|
707
|
+
#endif
|
708
|
+
|
709
|
+
static VALUE
|
710
|
+
unicode_capitalize(VALUE obj, VALUE str)
|
711
|
+
{
|
712
|
+
WString ustr;
|
713
|
+
WString result;
|
714
|
+
UString ret;
|
715
|
+
VALUE vret;
|
716
|
+
|
717
|
+
Check_Type(str, T_STRING);
|
718
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
719
|
+
CONVERT_TO_UTF8(str);
|
720
|
+
#endif
|
721
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
722
|
+
WStr_alloc(&result);
|
723
|
+
capitalize_internal(&ustr, &result);
|
724
|
+
//sort_canonical(&result);
|
725
|
+
WStr_free(&ustr);
|
726
|
+
UniStr_alloc(&ret);
|
727
|
+
WStr_convertIntoUString(&result, &ret);
|
728
|
+
WStr_free(&result);
|
729
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
730
|
+
UniStr_free(&ret);
|
731
|
+
|
732
|
+
return vret;
|
733
|
+
}
|
734
|
+
|
735
|
+
void
|
736
|
+
Init_unicode_native()
|
737
|
+
{
|
738
|
+
int i;
|
739
|
+
|
740
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
741
|
+
enc_out = rb_utf8_encoding();
|
742
|
+
#endif
|
743
|
+
|
744
|
+
mUnicode = rb_define_module("Unicode");
|
745
|
+
unicode_data = rb_hash_new();
|
746
|
+
composition_table = rb_hash_new();
|
747
|
+
|
748
|
+
rb_global_variable(&unicode_data);
|
749
|
+
rb_global_variable(&composition_table);
|
750
|
+
|
751
|
+
for (i = 0; unidata[i].code != -1; i++) {
|
752
|
+
int code = unidata[i].code;
|
753
|
+
const char* canon = unidata[i].canon;
|
754
|
+
int exclusion = unidata[i].exclusion;
|
755
|
+
|
756
|
+
rb_hash_aset(unicode_data, INT2FIX(code), INT2FIX(i));
|
757
|
+
if (canon && exclusion == 0) {
|
758
|
+
rb_hash_aset(composition_table, rb_str_new2(canon), INT2FIX(code));
|
759
|
+
}
|
760
|
+
}
|
761
|
+
|
762
|
+
rb_define_module_function(mUnicode, "strcmp",
|
763
|
+
unicode_strcmp, 2);
|
764
|
+
rb_define_module_function(mUnicode, "strcmp_compat",
|
765
|
+
unicode_strcmp_compat, 2);
|
766
|
+
|
767
|
+
rb_define_module_function(mUnicode, "decompose",
|
768
|
+
unicode_decompose, 1);
|
769
|
+
rb_define_module_function(mUnicode, "decompose_compat",
|
770
|
+
unicode_decompose_compat, 1);
|
771
|
+
rb_define_module_function(mUnicode, "compose",
|
772
|
+
unicode_compose, 1);
|
773
|
+
|
774
|
+
rb_define_module_function(mUnicode, "normalize_D",
|
775
|
+
unicode_decompose, 1);
|
776
|
+
rb_define_module_function(mUnicode, "normalize_KD",
|
777
|
+
unicode_decompose_compat, 1);
|
778
|
+
rb_define_module_function(mUnicode, "normalize_C",
|
779
|
+
unicode_normalize_C, 1);
|
780
|
+
rb_define_module_function(mUnicode, "normalize_KC",
|
781
|
+
unicode_normalize_KC, 1);
|
782
|
+
|
783
|
+
rb_define_module_function(mUnicode, "upcase",
|
784
|
+
unicode_upcase, 1);
|
785
|
+
rb_define_module_function(mUnicode, "downcase",
|
786
|
+
unicode_downcase, 1);
|
787
|
+
rb_define_module_function(mUnicode, "capitalize",
|
788
|
+
unicode_capitalize, 1);
|
789
|
+
}
|