unicode 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +16 -11
- data/test.rb +9 -8
- data/tools/README +2 -2
- data/tools/mkunidata.rb +20 -10
- data/unicode.c +132 -54
- data/unidata.map +12976 -1764
- data/ustring.c +27 -25
- data/ustring.h +12 -12
- data/wstring.c +11 -11
- metadata +4 -4
data/README
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
Unicode Library for Ruby
|
2
|
-
Version 0.
|
2
|
+
Version 0.2.0
|
3
3
|
|
4
4
|
Yoshida Masato
|
5
5
|
|
@@ -14,8 +14,8 @@
|
|
14
14
|
|
15
15
|
- Install
|
16
16
|
|
17
|
-
This can work with ruby-1.
|
18
|
-
use ruby-1.
|
17
|
+
This works with ruby-1.8 or later. I recommend that you
|
18
|
+
use ruby-1.8.1 or later.
|
19
19
|
|
20
20
|
Make and install usually.
|
21
21
|
For example, when Ruby supports dynamic linking on your OS,
|
@@ -36,16 +36,16 @@
|
|
36
36
|
|
37
37
|
- Module Functions
|
38
38
|
|
39
|
-
All parameters of functions must be UTF-8.
|
39
|
+
All parameters of functions must be UTF-8 strings.
|
40
40
|
|
41
41
|
Unicode::strcmp(str1, str2)
|
42
42
|
Unicode::strcmp_compat(str1, str2)
|
43
|
-
|
44
|
-
strcmp uses Normalization Form D, strcmp_compat uses
|
43
|
+
Compare Unicode strings after normalization.
|
44
|
+
strcmp uses the Normalization Form D, strcmp_compat uses
|
45
45
|
Normalization Form KD.
|
46
46
|
|
47
|
-
Unicode::
|
48
|
-
Unicode::
|
47
|
+
Unicode::decompose(str)
|
48
|
+
Unicode::decompose_compat(str)
|
49
49
|
Decompose Unicode string. Then the trailing characters
|
50
50
|
are sorted in canonical order.
|
51
51
|
decompose uses the canonical decomposition,
|
@@ -65,12 +65,12 @@
|
|
65
65
|
|
66
66
|
Unicode::normalize_D(str)
|
67
67
|
Unicode::normalize_KD(str)
|
68
|
-
|
68
|
+
Normalize Unicode string in form D or form KD.
|
69
69
|
These are aliases of decompose/decompose_compat.
|
70
70
|
|
71
71
|
Unicode::normalize_C(str)
|
72
72
|
Unicode::normalize_KC(str)
|
73
|
-
|
73
|
+
Normalize Unicode string in form C or form KC.
|
74
74
|
normalize_C = decompose + compose
|
75
75
|
normalize_KC = decompose_compat + compose
|
76
76
|
|
@@ -78,7 +78,7 @@
|
|
78
78
|
Unicode::downcase(str)
|
79
79
|
Unicode::capitalize(str)
|
80
80
|
Case conversion functions.
|
81
|
-
The mappings
|
81
|
+
The mappings that are used by these functions are not normative
|
82
82
|
in UnicodeData.txt.
|
83
83
|
|
84
84
|
- Bugs
|
@@ -87,6 +87,8 @@
|
|
87
87
|
should not be implemented with a hash of string for better
|
88
88
|
performance.
|
89
89
|
|
90
|
+
Case conversion functions should reflect UTR #21.
|
91
|
+
|
90
92
|
|
91
93
|
- Copying
|
92
94
|
|
@@ -104,4 +106,7 @@
|
|
104
106
|
|
105
107
|
- History
|
106
108
|
|
109
|
+
Dec 29, 2009 version 0.2.0 update for Ruby 1.9.1 and Unicode 5.2
|
110
|
+
Sep 10, 2005 version 0.1.2 update unidata.map for Unicode 4.1.0
|
111
|
+
Aug 26, 2004 version 0.1.1 update unidata.map for Unicode 4.0.1
|
107
112
|
Nov 23, 1999 version 0.1
|
data/test.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#! /usr/local/bin/ruby -KU
|
2
|
+
# -*- coding: utf-8 -*-
|
2
3
|
|
3
4
|
require 'unicode'
|
4
5
|
|
@@ -29,12 +30,12 @@ p Unicode::strcmp("ガ", "ガ")
|
|
29
30
|
p Unicode::strcmp_compat("ガ", "ガ")
|
30
31
|
|
31
32
|
print "Decomposition/composition\n"
|
32
|
-
p Unicode::normalize_D([
|
33
|
-
p Unicode::normalize_D([
|
33
|
+
p Unicode::normalize_D([0x63, 0x301, 0x327].pack("U*")).udump
|
34
|
+
p Unicode::normalize_D([0x63, 0x327, 0x301].pack("U*")).udump
|
34
35
|
p Unicode::normalize_D([0x107, 0x327].pack("U*")).udump
|
35
36
|
p Unicode::normalize_D([0xe7, 0x301].pack("U*")).udump
|
36
|
-
p Unicode::normalize_C([
|
37
|
-
p Unicode::normalize_C([
|
37
|
+
p Unicode::normalize_C([0x63, 0x301, 0x327].pack("U*")).udump
|
38
|
+
p Unicode::normalize_C([0x63, 0x327, 0x301].pack("U*")).udump
|
38
39
|
p Unicode::normalize_C([0x107, 0x327].pack("U*")).udump
|
39
40
|
p Unicode::normalize_C([0xe7, 0x301].pack("U*")).udump
|
40
41
|
|
@@ -50,7 +51,7 @@ p Unicode::normalize_D("요시담").udump
|
|
50
51
|
p Unicode::normalize_C("요시담").udump
|
51
52
|
|
52
53
|
print "Composition Exclusion\n"
|
53
|
-
print " ANGSTROM SIGN [U+
|
54
|
+
print " ANGSTROM SIGN [U+212B]\n"
|
54
55
|
p Unicode::normalize_D([0x212b].pack("U")).udump
|
55
56
|
p Unicode::normalize_C([0x212b].pack("U")).udump
|
56
57
|
print " LATIN CAPITAL LETTER A WITH RING ABOVE [U+00C5]\n"
|
@@ -58,9 +59,9 @@ p Unicode::normalize_D([0x00c5].pack("U")).udump
|
|
58
59
|
p Unicode::normalize_C([0x00c5].pack("U")).udump
|
59
60
|
|
60
61
|
print "Case conversion\n"
|
61
|
-
p Unicode::normalize_C(Unicode::upcase([
|
62
|
-
p Unicode::normalize_C(Unicode::downcase([
|
63
|
-
p Unicode::capitalize([0x1f1,
|
62
|
+
p Unicode::normalize_C(Unicode::upcase([0x63, 0x301, 0x327, 0xff41].pack("U*"))).udump
|
63
|
+
p Unicode::normalize_C(Unicode::downcase([0x43, 0x301, 0x327, 0xff21].pack("U*"))).udump
|
64
|
+
p Unicode::capitalize([0x1f1, 0x41, 0x61, 0xff21].pack("U*")).udump
|
64
65
|
|
65
66
|
|
66
67
|
## Local variables:
|
data/tools/README
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
The unidata.map is created from UnicodeData.txt and
|
2
|
-
|
2
|
+
DerivedNormalizationProps.txt of Unicode 4.1.0
|
3
3
|
|
4
4
|
To update unidata.map,
|
5
5
|
|
6
|
-
ruby mkunidata.rb UnicodeData.txt
|
6
|
+
ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt > unidata.map
|
data/tools/mkunidata.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
#! /usr/local/bin/ruby -KU
|
2
2
|
|
3
|
-
if $KCODE != 'UTF8'
|
4
|
-
raise "$KCODE must be UTF8"
|
5
|
-
end
|
3
|
+
#if $KCODE != 'UTF8'
|
4
|
+
# raise "$KCODE must be UTF8"
|
5
|
+
#end
|
6
6
|
|
7
7
|
HEAD=<<EOS
|
8
8
|
/*
|
9
9
|
* UnicodeData
|
10
|
-
* 1999 by yoshidam
|
10
|
+
* Copyright 1999, 2004 by yoshidam
|
11
11
|
*
|
12
12
|
*/
|
13
13
|
|
@@ -25,7 +25,7 @@ struct unicode_data {
|
|
25
25
|
const int titlecase;
|
26
26
|
};
|
27
27
|
|
28
|
-
const
|
28
|
+
static const struct unicode_data unidata[] = {
|
29
29
|
EOS
|
30
30
|
|
31
31
|
TAIL=<<EOS
|
@@ -41,7 +41,7 @@ def hex2str(hex)
|
|
41
41
|
canon = ""
|
42
42
|
compat = ""
|
43
43
|
chars = hex.split(" ")
|
44
|
-
if chars[0] =~ /^[0-9A-F]{4}$/
|
44
|
+
if chars[0] =~ /^[0-9A-F]{4,6}$/
|
45
45
|
chars.each do |c|
|
46
46
|
canon << [c.hex].pack("U")
|
47
47
|
end
|
@@ -59,7 +59,7 @@ def hex2str(hex)
|
|
59
59
|
end
|
60
60
|
|
61
61
|
def hex_or_nil(str)
|
62
|
-
return "-1" if str.nil?
|
62
|
+
return "-1" if str.nil? || str == ''
|
63
63
|
return format("0x%04x", str.hex)
|
64
64
|
end
|
65
65
|
|
@@ -81,9 +81,19 @@ exclusion = {}
|
|
81
81
|
open(ARGV[1]) do |f|
|
82
82
|
while l = f.gets
|
83
83
|
next if l =~ /^\#/ || l =~ /^$/
|
84
|
+
next if l !~ /Full_Composition_Exclusion/
|
84
85
|
code, = l.split(/\s/)
|
85
|
-
code
|
86
|
-
|
86
|
+
if code =~ /^[0-9A-F]+$/
|
87
|
+
code = code.hex
|
88
|
+
exclusion[code] = true
|
89
|
+
elsif code =~ /^([0-9A-F]+)\.\.([0-9A-F]+)$/
|
90
|
+
# p [$1, $2]
|
91
|
+
scode = $1.hex
|
92
|
+
ecode = $2.hex
|
93
|
+
for code in scode..ecode
|
94
|
+
exclusion[code] = true
|
95
|
+
end
|
96
|
+
end
|
87
97
|
end
|
88
98
|
end
|
89
99
|
|
@@ -94,7 +104,7 @@ open(ARGV[0]) do |f|
|
|
94
104
|
l.chomp!
|
95
105
|
code, charname, gencat, ccclass, bidicat,decomp,
|
96
106
|
dec, digit, num, mirror, uni1_0, comment, upcase,
|
97
|
-
lowcase, titlecase = l.split(";");
|
107
|
+
lowcase, titlecase = l.split(";", 15);
|
98
108
|
code = code.hex
|
99
109
|
ccclass = ccclass.to_i
|
100
110
|
canon, compat = hex2str(decomp)
|
data/unicode.c
CHANGED
@@ -1,15 +1,52 @@
|
|
1
1
|
/*
|
2
|
-
* Unicode Library version 0.
|
2
|
+
* Unicode Library version 0.2
|
3
|
+
* Dec 29, 2009: version 0.2
|
3
4
|
* Nov 23, 1999 yoshidam
|
4
5
|
*
|
5
6
|
*/
|
6
7
|
|
7
8
|
#include "ruby.h"
|
8
|
-
#
|
9
|
+
#ifdef HAVE_RUBY_IO_H
|
10
|
+
# include "ruby/io.h"
|
11
|
+
#else
|
12
|
+
# include "rubyio.h"
|
13
|
+
#endif
|
9
14
|
#include <stdio.h>
|
10
15
|
#include "wstring.h"
|
11
16
|
#include "unidata.map"
|
12
17
|
|
18
|
+
#ifndef RSTRING_PTR
|
19
|
+
# define RSTRING_PTR(s) (RSTRING(s)->ptr)
|
20
|
+
# define RSTRING_LEN(s) (RSTRING(s)->len)
|
21
|
+
#endif
|
22
|
+
|
23
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
24
|
+
static rb_encoding* enc_out;
|
25
|
+
# define ENC_(o) (rb_enc_associate(o, enc_out))
|
26
|
+
#else
|
27
|
+
# define ENC_(o) (o)
|
28
|
+
#endif
|
29
|
+
|
30
|
+
inline static VALUE
|
31
|
+
taintObject(VALUE src, VALUE obj) {
|
32
|
+
if (OBJ_TAINTED(src))
|
33
|
+
OBJ_TAINT(obj);
|
34
|
+
return obj;
|
35
|
+
}
|
36
|
+
#define TO_(src, obj) (taintObject(src, obj))
|
37
|
+
|
38
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
39
|
+
# define CONVERT_TO_UTF8(str) do { \
|
40
|
+
int encindex = ENCODING_GET(str); \
|
41
|
+
volatile VALUE encobj; \
|
42
|
+
if (encindex != rb_utf8_encindex() && \
|
43
|
+
encindex != rb_usascii_encindex()) { \
|
44
|
+
encobj = rb_enc_from_encoding(enc_out); \
|
45
|
+
str = rb_str_encode(str, encobj, 0, Qnil); \
|
46
|
+
} \
|
47
|
+
} while (0)
|
48
|
+
#endif
|
49
|
+
|
13
50
|
static VALUE mUnicode;
|
14
51
|
static VALUE unicode_data;
|
15
52
|
static VALUE composition_table;
|
@@ -58,7 +95,7 @@ get_compat(int ucs)
|
|
58
95
|
return NULL;
|
59
96
|
}
|
60
97
|
|
61
|
-
static
|
98
|
+
static int
|
62
99
|
get_uppercase(int ucs)
|
63
100
|
{
|
64
101
|
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
@@ -252,7 +289,7 @@ decompose_compat_internal(WString* ustr, WString* result)
|
|
252
289
|
} while (0)
|
253
290
|
|
254
291
|
static int
|
255
|
-
compose_pair(int c1, int c2)
|
292
|
+
compose_pair(unsigned int c1, unsigned int c2)
|
256
293
|
{
|
257
294
|
int ret;
|
258
295
|
char ustr[13]; /* stored two UTF-8 chars */
|
@@ -370,8 +407,12 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
|
|
370
407
|
|
371
408
|
Check_Type(str1, T_STRING);
|
372
409
|
Check_Type(str2, T_STRING);
|
373
|
-
|
374
|
-
|
410
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
411
|
+
CONVERT_TO_UTF8(str1);
|
412
|
+
CONVERT_TO_UTF8(str2);
|
413
|
+
#endif
|
414
|
+
WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
|
415
|
+
WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
|
375
416
|
WStr_alloc(&result1);
|
376
417
|
WStr_alloc(&result2);
|
377
418
|
decompose_internal(&wstr1, &result1);
|
@@ -380,17 +421,17 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
|
|
380
421
|
WStr_free(&wstr2);
|
381
422
|
sort_canonical(&result1);
|
382
423
|
sort_canonical(&result2);
|
383
|
-
|
384
|
-
|
424
|
+
UniStr_alloc(&ustr1);
|
425
|
+
UniStr_alloc(&ustr2);
|
385
426
|
WStr_convertIntoUString(&result1, &ustr1);
|
386
427
|
WStr_convertIntoUString(&result2, &ustr2);
|
387
428
|
WStr_free(&result1);
|
388
429
|
WStr_free(&result2);
|
389
|
-
|
390
|
-
|
391
|
-
ret = strcmp(ustr1.str, ustr2.str);
|
392
|
-
|
393
|
-
|
430
|
+
UniStr_addChar(&ustr1, '\0');
|
431
|
+
UniStr_addChar(&ustr2, '\0');
|
432
|
+
ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
|
433
|
+
UniStr_free(&ustr1);
|
434
|
+
UniStr_free(&ustr2);
|
394
435
|
|
395
436
|
return INT2FIX(ret);
|
396
437
|
}
|
@@ -408,8 +449,12 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
|
|
408
449
|
|
409
450
|
Check_Type(str1, T_STRING);
|
410
451
|
Check_Type(str2, T_STRING);
|
411
|
-
|
412
|
-
|
452
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
453
|
+
CONVERT_TO_UTF8(str1);
|
454
|
+
CONVERT_TO_UTF8(str2);
|
455
|
+
#endif
|
456
|
+
WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
|
457
|
+
WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
|
413
458
|
WStr_alloc(&result1);
|
414
459
|
WStr_alloc(&result2);
|
415
460
|
decompose_compat_internal(&wstr1, &result1);
|
@@ -418,17 +463,17 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
|
|
418
463
|
WStr_free(&wstr2);
|
419
464
|
sort_canonical(&result1);
|
420
465
|
sort_canonical(&result2);
|
421
|
-
|
422
|
-
|
466
|
+
UniStr_alloc(&ustr1);
|
467
|
+
UniStr_alloc(&ustr2);
|
423
468
|
WStr_convertIntoUString(&result1, &ustr1);
|
424
469
|
WStr_convertIntoUString(&result2, &ustr2);
|
425
470
|
WStr_free(&result1);
|
426
471
|
WStr_free(&result2);
|
427
|
-
|
428
|
-
|
429
|
-
ret = strcmp(ustr1.str, ustr2.str);
|
430
|
-
|
431
|
-
|
472
|
+
UniStr_addChar(&ustr1, '\0');
|
473
|
+
UniStr_addChar(&ustr2, '\0');
|
474
|
+
ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
|
475
|
+
UniStr_free(&ustr1);
|
476
|
+
UniStr_free(&ustr2);
|
432
477
|
|
433
478
|
return INT2FIX(ret);
|
434
479
|
}
|
@@ -442,16 +487,19 @@ unicode_decompose(VALUE obj, VALUE str)
|
|
442
487
|
VALUE vret;
|
443
488
|
|
444
489
|
Check_Type(str, T_STRING);
|
445
|
-
|
490
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
491
|
+
CONVERT_TO_UTF8(str);
|
492
|
+
#endif
|
493
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
446
494
|
WStr_alloc(&result);
|
447
495
|
decompose_internal(&ustr, &result);
|
448
496
|
WStr_free(&ustr);
|
449
497
|
sort_canonical(&result);
|
450
|
-
|
498
|
+
UniStr_alloc(&ret);
|
451
499
|
WStr_convertIntoUString(&result, &ret);
|
452
500
|
WStr_free(&result);
|
453
|
-
vret = rb_str_new(ret.str, ret.len);
|
454
|
-
|
501
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
502
|
+
UniStr_free(&ret);
|
455
503
|
|
456
504
|
return vret;
|
457
505
|
}
|
@@ -465,16 +513,19 @@ unicode_decompose_compat(VALUE obj, VALUE str)
|
|
465
513
|
VALUE vret;
|
466
514
|
|
467
515
|
Check_Type(str, T_STRING);
|
468
|
-
|
516
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
517
|
+
CONVERT_TO_UTF8(str);
|
518
|
+
#endif
|
519
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
469
520
|
WStr_alloc(&result);
|
470
521
|
decompose_compat_internal(&ustr, &result);
|
471
522
|
WStr_free(&ustr);
|
472
523
|
sort_canonical(&result);
|
473
|
-
|
524
|
+
UniStr_alloc(&ret);
|
474
525
|
WStr_convertIntoUString(&result, &ret);
|
475
526
|
WStr_free(&result);
|
476
|
-
vret = rb_str_new(ret.str, ret.len);
|
477
|
-
|
527
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
528
|
+
UniStr_free(&ret);
|
478
529
|
|
479
530
|
return vret;
|
480
531
|
}
|
@@ -488,16 +539,19 @@ unicode_compose(VALUE obj, VALUE str)
|
|
488
539
|
VALUE vret;
|
489
540
|
|
490
541
|
Check_Type(str, T_STRING);
|
491
|
-
|
542
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
543
|
+
CONVERT_TO_UTF8(str);
|
544
|
+
#endif
|
545
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
492
546
|
sort_canonical(&ustr);
|
493
547
|
WStr_alloc(&result);
|
494
548
|
compose_internal(&ustr, &result);
|
495
549
|
WStr_free(&ustr);
|
496
|
-
|
550
|
+
UniStr_alloc(&ret);
|
497
551
|
WStr_convertIntoUString(&result, &ret);
|
498
552
|
WStr_free(&result);
|
499
|
-
vret = rb_str_new(ret.str, ret.len);
|
500
|
-
|
553
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
554
|
+
UniStr_free(&ret);
|
501
555
|
|
502
556
|
return vret;
|
503
557
|
}
|
@@ -512,7 +566,10 @@ unicode_normalize_C(VALUE obj, VALUE str)
|
|
512
566
|
VALUE vret;
|
513
567
|
|
514
568
|
Check_Type(str, T_STRING);
|
515
|
-
|
569
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
570
|
+
CONVERT_TO_UTF8(str);
|
571
|
+
#endif
|
572
|
+
WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
|
516
573
|
WStr_alloc(&ustr2);
|
517
574
|
decompose_internal(&ustr1, &ustr2);
|
518
575
|
WStr_free(&ustr1);
|
@@ -520,11 +577,11 @@ unicode_normalize_C(VALUE obj, VALUE str)
|
|
520
577
|
WStr_alloc(&result);
|
521
578
|
compose_internal(&ustr2, &result);
|
522
579
|
WStr_free(&ustr2);
|
523
|
-
|
580
|
+
UniStr_alloc(&ret);
|
524
581
|
WStr_convertIntoUString(&result, &ret);
|
525
582
|
WStr_free(&result);
|
526
|
-
vret = rb_str_new(ret.str, ret.len);
|
527
|
-
|
583
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
584
|
+
UniStr_free(&ret);
|
528
585
|
|
529
586
|
return vret;
|
530
587
|
}
|
@@ -539,7 +596,10 @@ unicode_normalize_KC(VALUE obj, VALUE str)
|
|
539
596
|
VALUE vret;
|
540
597
|
|
541
598
|
Check_Type(str, T_STRING);
|
542
|
-
|
599
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
600
|
+
CONVERT_TO_UTF8(str);
|
601
|
+
#endif
|
602
|
+
WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
|
543
603
|
WStr_alloc(&ustr2);
|
544
604
|
decompose_compat_internal(&ustr1, &ustr2);
|
545
605
|
WStr_free(&ustr1);
|
@@ -547,11 +607,11 @@ unicode_normalize_KC(VALUE obj, VALUE str)
|
|
547
607
|
WStr_alloc(&result);
|
548
608
|
compose_internal(&ustr2, &result);
|
549
609
|
WStr_free(&ustr2);
|
550
|
-
|
610
|
+
UniStr_alloc(&ret);
|
551
611
|
WStr_convertIntoUString(&result, &ret);
|
552
612
|
WStr_free(&result);
|
553
|
-
vret = rb_str_new(ret.str, ret.len);
|
554
|
-
|
613
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
614
|
+
UniStr_free(&ret);
|
555
615
|
|
556
616
|
return vret;
|
557
617
|
}
|
@@ -564,13 +624,16 @@ unicode_upcase(VALUE obj, VALUE str)
|
|
564
624
|
VALUE vret;
|
565
625
|
|
566
626
|
Check_Type(str, T_STRING);
|
567
|
-
|
627
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
628
|
+
CONVERT_TO_UTF8(str);
|
629
|
+
#endif
|
630
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
568
631
|
upcase_internal(&ustr);
|
569
|
-
|
632
|
+
UniStr_alloc(&ret);
|
570
633
|
WStr_convertIntoUString(&ustr, &ret);
|
571
634
|
WStr_free(&ustr);
|
572
|
-
vret = rb_str_new(ret.str, ret.len);
|
573
|
-
|
635
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
636
|
+
UniStr_free(&ret);
|
574
637
|
|
575
638
|
return vret;
|
576
639
|
}
|
@@ -583,17 +646,25 @@ unicode_downcase(VALUE obj, VALUE str)
|
|
583
646
|
VALUE vret;
|
584
647
|
|
585
648
|
Check_Type(str, T_STRING);
|
586
|
-
|
649
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
650
|
+
CONVERT_TO_UTF8(str);
|
651
|
+
#endif
|
652
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
587
653
|
downcase_internal(&ustr);
|
588
|
-
|
654
|
+
UniStr_alloc(&ret);
|
589
655
|
WStr_convertIntoUString(&ustr, &ret);
|
590
656
|
WStr_free(&ustr);
|
591
|
-
vret = rb_str_new(ret.str, ret.len);
|
592
|
-
|
657
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
658
|
+
UniStr_free(&ret);
|
593
659
|
|
594
660
|
return vret;
|
595
661
|
}
|
596
662
|
|
663
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
664
|
+
|
665
|
+
|
666
|
+
#endif
|
667
|
+
|
597
668
|
static VALUE
|
598
669
|
unicode_capitalize(VALUE obj, VALUE str)
|
599
670
|
{
|
@@ -602,13 +673,16 @@ unicode_capitalize(VALUE obj, VALUE str)
|
|
602
673
|
VALUE vret;
|
603
674
|
|
604
675
|
Check_Type(str, T_STRING);
|
605
|
-
|
676
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
677
|
+
CONVERT_TO_UTF8(str);
|
678
|
+
#endif
|
679
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
606
680
|
capitalize_internal(&ustr);
|
607
|
-
|
681
|
+
UniStr_alloc(&ret);
|
608
682
|
WStr_convertIntoUString(&ustr, &ret);
|
609
683
|
WStr_free(&ustr);
|
610
|
-
vret = rb_str_new(ret.str, ret.len);
|
611
|
-
|
684
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
685
|
+
UniStr_free(&ret);
|
612
686
|
|
613
687
|
return vret;
|
614
688
|
}
|
@@ -618,6 +692,10 @@ Init_unicode()
|
|
618
692
|
{
|
619
693
|
int i;
|
620
694
|
|
695
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
696
|
+
enc_out = rb_utf8_encoding();
|
697
|
+
#endif
|
698
|
+
|
621
699
|
mUnicode = rb_define_module("Unicode");
|
622
700
|
unicode_data = rb_hash_new();
|
623
701
|
composition_table = rb_hash_new();
|