unicode 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +16 -11
- data/test.rb +9 -8
- data/tools/README +2 -2
- data/tools/mkunidata.rb +20 -10
- data/unicode.c +132 -54
- data/unidata.map +12976 -1764
- data/ustring.c +27 -25
- data/ustring.h +12 -12
- data/wstring.c +11 -11
- metadata +4 -4
data/README
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
Unicode Library for Ruby
|
2
|
-
Version 0.
|
2
|
+
Version 0.2.0
|
3
3
|
|
4
4
|
Yoshida Masato
|
5
5
|
|
@@ -14,8 +14,8 @@
|
|
14
14
|
|
15
15
|
- Install
|
16
16
|
|
17
|
-
This can work with ruby-1.
|
18
|
-
use ruby-1.
|
17
|
+
This works with ruby-1.8 or later. I recommend that you
|
18
|
+
use ruby-1.8.1 or later.
|
19
19
|
|
20
20
|
Build and install in the usual way.
|
21
21
|
For example, when Ruby supports dynamic linking on your OS,
|
@@ -36,16 +36,16 @@
|
|
36
36
|
|
37
37
|
- Module Functions
|
38
38
|
|
39
|
-
All parameters of functions must be UTF-8.
|
39
|
+
All parameters of functions must be UTF-8 strings.
|
40
40
|
|
41
41
|
Unicode::strcmp(str1, str2)
|
42
42
|
Unicode::strcmp_compat(str1, str2)
|
43
|
-
|
44
|
-
strcmp uses Normalization Form D, strcmp_compat uses
|
43
|
+
Compare Unicode strings with a normalization.
|
44
|
+
strcmp uses the Normalization Form D, strcmp_compat uses
|
45
45
|
Normalization Form KD.
|
46
46
|
|
47
|
-
Unicode::
|
48
|
-
Unicode::
|
47
|
+
Unicode::decompose(str)
|
48
|
+
Unicode::decompose_compat(str)
|
49
49
|
Decompose Unicode string. Then the trailing characters
|
50
50
|
are sorted in canonical order.
|
51
51
|
decompose uses the canonical decomposition,
|
@@ -65,12 +65,12 @@
|
|
65
65
|
|
66
66
|
Unicode::normalize_D(str)
|
67
67
|
Unicode::normalize_KD(str)
|
68
|
-
|
68
|
+
Normalize Unicode string in form D or form KD.
|
69
69
|
These are aliases of decompose/decompose_compat.
|
70
70
|
|
71
71
|
Unicode::normalize_C(str)
|
72
72
|
Unicode::normalize_KC(str)
|
73
|
-
|
73
|
+
Normalize Unicode string in form C or form KC.
|
74
74
|
normalize_C = decompose + compose
|
75
75
|
normalize_KC = decompose_compat + compose
|
76
76
|
|
@@ -78,7 +78,7 @@
|
|
78
78
|
Unicode::downcase(str)
|
79
79
|
Unicode::capitalize(str)
|
80
80
|
Case conversion functions.
|
81
|
-
The mappings
|
81
|
+
The mappings that are used by these functions are not normative
|
82
82
|
in UnicodeData.txt.
|
83
83
|
|
84
84
|
- Bugs
|
@@ -87,6 +87,8 @@
|
|
87
87
|
should not be implemented with a hash of string for better
|
88
88
|
performance.
|
89
89
|
|
90
|
+
Case conversion functions should reflect UTR #21.
|
91
|
+
|
90
92
|
|
91
93
|
- Copying
|
92
94
|
|
@@ -104,4 +106,7 @@
|
|
104
106
|
|
105
107
|
- History
|
106
108
|
|
109
|
+
Dec 29, 2009 version 0.2.0 update for Ruby 1.9.1 and Unicode 5.2
|
110
|
+
Sep 10, 2005 version 0.1.2 update unidata.map for Unicode 4.1.0
|
111
|
+
Aug 26, 2004 version 0.1.1 update unidata.map for Unicode 4.0.1
|
107
112
|
Nov 23, 1999 version 0.1
|
data/test.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#! /usr/local/bin/ruby -KU
|
2
|
+
# -*- coding: utf-8 -*-
|
2
3
|
|
3
4
|
require 'unicode'
|
4
5
|
|
@@ -29,12 +30,12 @@ p Unicode::strcmp("ガ", "ガ")
|
|
29
30
|
p Unicode::strcmp_compat("ガ", "ガ")
|
30
31
|
|
31
32
|
print "Decomposition/composition\n"
|
32
|
-
p Unicode::normalize_D([
|
33
|
-
p Unicode::normalize_D([
|
33
|
+
p Unicode::normalize_D([0x63, 0x301, 0x327].pack("U*")).udump
|
34
|
+
p Unicode::normalize_D([0x63, 0x327, 0x301].pack("U*")).udump
|
34
35
|
p Unicode::normalize_D([0x107, 0x327].pack("U*")).udump
|
35
36
|
p Unicode::normalize_D([0xe7, 0x301].pack("U*")).udump
|
36
|
-
p Unicode::normalize_C([
|
37
|
-
p Unicode::normalize_C([
|
37
|
+
p Unicode::normalize_C([0x63, 0x301, 0x327].pack("U*")).udump
|
38
|
+
p Unicode::normalize_C([0x63, 0x327, 0x301].pack("U*")).udump
|
38
39
|
p Unicode::normalize_C([0x107, 0x327].pack("U*")).udump
|
39
40
|
p Unicode::normalize_C([0xe7, 0x301].pack("U*")).udump
|
40
41
|
|
@@ -50,7 +51,7 @@ p Unicode::normalize_D("요시담").udump
|
|
50
51
|
p Unicode::normalize_C("요시담").udump
|
51
52
|
|
52
53
|
print "Composition Exclusion\n"
|
53
|
-
print " ANGSTROM SIGN [U+
|
54
|
+
print " ANGSTROM SIGN [U+212B]\n"
|
54
55
|
p Unicode::normalize_D([0x212b].pack("U")).udump
|
55
56
|
p Unicode::normalize_C([0x212b].pack("U")).udump
|
56
57
|
print " LATIN CAPITAL LETTER A WITH RING ABOVE [U+00C5]\n"
|
@@ -58,9 +59,9 @@ p Unicode::normalize_D([0x00c5].pack("U")).udump
|
|
58
59
|
p Unicode::normalize_C([0x00c5].pack("U")).udump
|
59
60
|
|
60
61
|
print "Case conversion\n"
|
61
|
-
p Unicode::normalize_C(Unicode::upcase([
|
62
|
-
p Unicode::normalize_C(Unicode::downcase([
|
63
|
-
p Unicode::capitalize([0x1f1,
|
62
|
+
p Unicode::normalize_C(Unicode::upcase([0x63, 0x301, 0x327, 0xff41].pack("U*"))).udump
|
63
|
+
p Unicode::normalize_C(Unicode::downcase([0x43, 0x301, 0x327, 0xff21].pack("U*"))).udump
|
64
|
+
p Unicode::capitalize([0x1f1, 0x41, 0x61, 0xff21].pack("U*")).udump
|
64
65
|
|
65
66
|
|
66
67
|
## Local variables:
|
data/tools/README
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
The unidata.map is created from UnicodeData.txt and
|
2
|
-
|
2
|
+
DerivedNormalizationProps.txt of Unicode 4.1.0
|
3
3
|
|
4
4
|
To update unidata.map,
|
5
5
|
|
6
|
-
ruby mkunidata.rb UnicodeData.txt
|
6
|
+
ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt > unidata.map
|
data/tools/mkunidata.rb
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
#! /usr/local/bin/ruby -KU
|
2
2
|
|
3
|
-
if $KCODE != 'UTF8'
|
4
|
-
raise "$KCODE must be UTF8"
|
5
|
-
end
|
3
|
+
#if $KCODE != 'UTF8'
|
4
|
+
# raise "$KCODE must be UTF8"
|
5
|
+
#end
|
6
6
|
|
7
7
|
HEAD=<<EOS
|
8
8
|
/*
|
9
9
|
* UnicodeData
|
10
|
-
* 1999 by yoshidam
|
10
|
+
* Copyright 1999, 2004 by yoshidam
|
11
11
|
*
|
12
12
|
*/
|
13
13
|
|
@@ -25,7 +25,7 @@ struct unicode_data {
|
|
25
25
|
const int titlecase;
|
26
26
|
};
|
27
27
|
|
28
|
-
const
|
28
|
+
static const struct unicode_data unidata[] = {
|
29
29
|
EOS
|
30
30
|
|
31
31
|
TAIL=<<EOS
|
@@ -41,7 +41,7 @@ def hex2str(hex)
|
|
41
41
|
canon = ""
|
42
42
|
compat = ""
|
43
43
|
chars = hex.split(" ")
|
44
|
-
if chars[0] =~ /^[0-9A-F]{4}$/
|
44
|
+
if chars[0] =~ /^[0-9A-F]{4,6}$/
|
45
45
|
chars.each do |c|
|
46
46
|
canon << [c.hex].pack("U")
|
47
47
|
end
|
@@ -59,7 +59,7 @@ def hex2str(hex)
|
|
59
59
|
end
|
60
60
|
|
61
61
|
def hex_or_nil(str)
|
62
|
-
return "-1" if str.nil?
|
62
|
+
return "-1" if str.nil? || str == ''
|
63
63
|
return format("0x%04x", str.hex)
|
64
64
|
end
|
65
65
|
|
@@ -81,9 +81,19 @@ exclusion = {}
|
|
81
81
|
open(ARGV[1]) do |f|
|
82
82
|
while l = f.gets
|
83
83
|
next if l =~ /^\#/ || l =~ /^$/
|
84
|
+
next if l !~ /Full_Composition_Exclusion/
|
84
85
|
code, = l.split(/\s/)
|
85
|
-
code
|
86
|
-
|
86
|
+
if code =~ /^[0-9A-F]+$/
|
87
|
+
code = code.hex
|
88
|
+
exclusion[code] = true
|
89
|
+
elsif code =~ /^([0-9A-F]+)\.\.([0-9A-F]+)$/
|
90
|
+
# p [$1, $2]
|
91
|
+
scode = $1.hex
|
92
|
+
ecode = $2.hex
|
93
|
+
for code in scode..ecode
|
94
|
+
exclusion[code] = true
|
95
|
+
end
|
96
|
+
end
|
87
97
|
end
|
88
98
|
end
|
89
99
|
|
@@ -94,7 +104,7 @@ open(ARGV[0]) do |f|
|
|
94
104
|
l.chomp!
|
95
105
|
code, charname, gencat, ccclass, bidicat,decomp,
|
96
106
|
dec, digit, num, mirror, uni1_0, comment, upcase,
|
97
|
-
lowcase, titlecase = l.split(";");
|
107
|
+
lowcase, titlecase = l.split(";", 15);
|
98
108
|
code = code.hex
|
99
109
|
ccclass = ccclass.to_i
|
100
110
|
canon, compat = hex2str(decomp)
|
data/unicode.c
CHANGED
@@ -1,15 +1,52 @@
|
|
1
1
|
/*
|
2
|
-
* Unicode Library version 0.
|
2
|
+
* Unicode Library version 0.2
|
3
|
+
* Dec 29, 2009: version 0.2
|
3
4
|
* Nov 23, 1999 yoshidam
|
4
5
|
*
|
5
6
|
*/
|
6
7
|
|
7
8
|
#include "ruby.h"
|
8
|
-
#
|
9
|
+
#ifdef HAVE_RUBY_IO_H
|
10
|
+
# include "ruby/io.h"
|
11
|
+
#else
|
12
|
+
# include "rubyio.h"
|
13
|
+
#endif
|
9
14
|
#include <stdio.h>
|
10
15
|
#include "wstring.h"
|
11
16
|
#include "unidata.map"
|
12
17
|
|
18
|
+
#ifndef RSTRING_PTR
|
19
|
+
# define RSTRING_PTR(s) (RSTRING(s)->ptr)
|
20
|
+
# define RSTRING_LEN(s) (RSTRING(s)->len)
|
21
|
+
#endif
|
22
|
+
|
23
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
24
|
+
static rb_encoding* enc_out;
|
25
|
+
# define ENC_(o) (rb_enc_associate(o, enc_out))
|
26
|
+
#else
|
27
|
+
# define ENC_(o) (o)
|
28
|
+
#endif
|
29
|
+
|
30
|
+
inline static VALUE
|
31
|
+
taintObject(VALUE src, VALUE obj) {
|
32
|
+
if (OBJ_TAINTED(src))
|
33
|
+
OBJ_TAINT(obj);
|
34
|
+
return obj;
|
35
|
+
}
|
36
|
+
#define TO_(src, obj) (taintObject(src, obj))
|
37
|
+
|
38
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
39
|
+
# define CONVERT_TO_UTF8(str) do { \
|
40
|
+
int encindex = ENCODING_GET(str); \
|
41
|
+
volatile VALUE encobj; \
|
42
|
+
if (encindex != rb_utf8_encindex() && \
|
43
|
+
encindex != rb_usascii_encindex()) { \
|
44
|
+
encobj = rb_enc_from_encoding(enc_out); \
|
45
|
+
str = rb_str_encode(str, encobj, 0, Qnil); \
|
46
|
+
} \
|
47
|
+
} while (0)
|
48
|
+
#endif
|
49
|
+
|
13
50
|
static VALUE mUnicode;
|
14
51
|
static VALUE unicode_data;
|
15
52
|
static VALUE composition_table;
|
@@ -58,7 +95,7 @@ get_compat(int ucs)
|
|
58
95
|
return NULL;
|
59
96
|
}
|
60
97
|
|
61
|
-
static
|
98
|
+
static int
|
62
99
|
get_uppercase(int ucs)
|
63
100
|
{
|
64
101
|
VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
|
@@ -252,7 +289,7 @@ decompose_compat_internal(WString* ustr, WString* result)
|
|
252
289
|
} while (0)
|
253
290
|
|
254
291
|
static int
|
255
|
-
compose_pair(int c1, int c2)
|
292
|
+
compose_pair(unsigned int c1, unsigned int c2)
|
256
293
|
{
|
257
294
|
int ret;
|
258
295
|
char ustr[13]; /* stored two UTF-8 chars */
|
@@ -370,8 +407,12 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
|
|
370
407
|
|
371
408
|
Check_Type(str1, T_STRING);
|
372
409
|
Check_Type(str2, T_STRING);
|
373
|
-
|
374
|
-
|
410
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
411
|
+
CONVERT_TO_UTF8(str1);
|
412
|
+
CONVERT_TO_UTF8(str2);
|
413
|
+
#endif
|
414
|
+
WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
|
415
|
+
WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
|
375
416
|
WStr_alloc(&result1);
|
376
417
|
WStr_alloc(&result2);
|
377
418
|
decompose_internal(&wstr1, &result1);
|
@@ -380,17 +421,17 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
|
|
380
421
|
WStr_free(&wstr2);
|
381
422
|
sort_canonical(&result1);
|
382
423
|
sort_canonical(&result2);
|
383
|
-
|
384
|
-
|
424
|
+
UniStr_alloc(&ustr1);
|
425
|
+
UniStr_alloc(&ustr2);
|
385
426
|
WStr_convertIntoUString(&result1, &ustr1);
|
386
427
|
WStr_convertIntoUString(&result2, &ustr2);
|
387
428
|
WStr_free(&result1);
|
388
429
|
WStr_free(&result2);
|
389
|
-
|
390
|
-
|
391
|
-
ret = strcmp(ustr1.str, ustr2.str);
|
392
|
-
|
393
|
-
|
430
|
+
UniStr_addChar(&ustr1, '\0');
|
431
|
+
UniStr_addChar(&ustr2, '\0');
|
432
|
+
ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
|
433
|
+
UniStr_free(&ustr1);
|
434
|
+
UniStr_free(&ustr2);
|
394
435
|
|
395
436
|
return INT2FIX(ret);
|
396
437
|
}
|
@@ -408,8 +449,12 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
|
|
408
449
|
|
409
450
|
Check_Type(str1, T_STRING);
|
410
451
|
Check_Type(str2, T_STRING);
|
411
|
-
|
412
|
-
|
452
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
453
|
+
CONVERT_TO_UTF8(str1);
|
454
|
+
CONVERT_TO_UTF8(str2);
|
455
|
+
#endif
|
456
|
+
WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
|
457
|
+
WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
|
413
458
|
WStr_alloc(&result1);
|
414
459
|
WStr_alloc(&result2);
|
415
460
|
decompose_compat_internal(&wstr1, &result1);
|
@@ -418,17 +463,17 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
|
|
418
463
|
WStr_free(&wstr2);
|
419
464
|
sort_canonical(&result1);
|
420
465
|
sort_canonical(&result2);
|
421
|
-
|
422
|
-
|
466
|
+
UniStr_alloc(&ustr1);
|
467
|
+
UniStr_alloc(&ustr2);
|
423
468
|
WStr_convertIntoUString(&result1, &ustr1);
|
424
469
|
WStr_convertIntoUString(&result2, &ustr2);
|
425
470
|
WStr_free(&result1);
|
426
471
|
WStr_free(&result2);
|
427
|
-
|
428
|
-
|
429
|
-
ret = strcmp(ustr1.str, ustr2.str);
|
430
|
-
|
431
|
-
|
472
|
+
UniStr_addChar(&ustr1, '\0');
|
473
|
+
UniStr_addChar(&ustr2, '\0');
|
474
|
+
ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
|
475
|
+
UniStr_free(&ustr1);
|
476
|
+
UniStr_free(&ustr2);
|
432
477
|
|
433
478
|
return INT2FIX(ret);
|
434
479
|
}
|
@@ -442,16 +487,19 @@ unicode_decompose(VALUE obj, VALUE str)
|
|
442
487
|
VALUE vret;
|
443
488
|
|
444
489
|
Check_Type(str, T_STRING);
|
445
|
-
|
490
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
491
|
+
CONVERT_TO_UTF8(str);
|
492
|
+
#endif
|
493
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
446
494
|
WStr_alloc(&result);
|
447
495
|
decompose_internal(&ustr, &result);
|
448
496
|
WStr_free(&ustr);
|
449
497
|
sort_canonical(&result);
|
450
|
-
|
498
|
+
UniStr_alloc(&ret);
|
451
499
|
WStr_convertIntoUString(&result, &ret);
|
452
500
|
WStr_free(&result);
|
453
|
-
vret = rb_str_new(ret.str, ret.len);
|
454
|
-
|
501
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
502
|
+
UniStr_free(&ret);
|
455
503
|
|
456
504
|
return vret;
|
457
505
|
}
|
@@ -465,16 +513,19 @@ unicode_decompose_compat(VALUE obj, VALUE str)
|
|
465
513
|
VALUE vret;
|
466
514
|
|
467
515
|
Check_Type(str, T_STRING);
|
468
|
-
|
516
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
517
|
+
CONVERT_TO_UTF8(str);
|
518
|
+
#endif
|
519
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
469
520
|
WStr_alloc(&result);
|
470
521
|
decompose_compat_internal(&ustr, &result);
|
471
522
|
WStr_free(&ustr);
|
472
523
|
sort_canonical(&result);
|
473
|
-
|
524
|
+
UniStr_alloc(&ret);
|
474
525
|
WStr_convertIntoUString(&result, &ret);
|
475
526
|
WStr_free(&result);
|
476
|
-
vret = rb_str_new(ret.str, ret.len);
|
477
|
-
|
527
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
528
|
+
UniStr_free(&ret);
|
478
529
|
|
479
530
|
return vret;
|
480
531
|
}
|
@@ -488,16 +539,19 @@ unicode_compose(VALUE obj, VALUE str)
|
|
488
539
|
VALUE vret;
|
489
540
|
|
490
541
|
Check_Type(str, T_STRING);
|
491
|
-
|
542
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
543
|
+
CONVERT_TO_UTF8(str);
|
544
|
+
#endif
|
545
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
492
546
|
sort_canonical(&ustr);
|
493
547
|
WStr_alloc(&result);
|
494
548
|
compose_internal(&ustr, &result);
|
495
549
|
WStr_free(&ustr);
|
496
|
-
|
550
|
+
UniStr_alloc(&ret);
|
497
551
|
WStr_convertIntoUString(&result, &ret);
|
498
552
|
WStr_free(&result);
|
499
|
-
vret = rb_str_new(ret.str, ret.len);
|
500
|
-
|
553
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
554
|
+
UniStr_free(&ret);
|
501
555
|
|
502
556
|
return vret;
|
503
557
|
}
|
@@ -512,7 +566,10 @@ unicode_normalize_C(VALUE obj, VALUE str)
|
|
512
566
|
VALUE vret;
|
513
567
|
|
514
568
|
Check_Type(str, T_STRING);
|
515
|
-
|
569
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
570
|
+
CONVERT_TO_UTF8(str);
|
571
|
+
#endif
|
572
|
+
WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
|
516
573
|
WStr_alloc(&ustr2);
|
517
574
|
decompose_internal(&ustr1, &ustr2);
|
518
575
|
WStr_free(&ustr1);
|
@@ -520,11 +577,11 @@ unicode_normalize_C(VALUE obj, VALUE str)
|
|
520
577
|
WStr_alloc(&result);
|
521
578
|
compose_internal(&ustr2, &result);
|
522
579
|
WStr_free(&ustr2);
|
523
|
-
|
580
|
+
UniStr_alloc(&ret);
|
524
581
|
WStr_convertIntoUString(&result, &ret);
|
525
582
|
WStr_free(&result);
|
526
|
-
vret = rb_str_new(ret.str, ret.len);
|
527
|
-
|
583
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
584
|
+
UniStr_free(&ret);
|
528
585
|
|
529
586
|
return vret;
|
530
587
|
}
|
@@ -539,7 +596,10 @@ unicode_normalize_KC(VALUE obj, VALUE str)
|
|
539
596
|
VALUE vret;
|
540
597
|
|
541
598
|
Check_Type(str, T_STRING);
|
542
|
-
|
599
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
600
|
+
CONVERT_TO_UTF8(str);
|
601
|
+
#endif
|
602
|
+
WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
|
543
603
|
WStr_alloc(&ustr2);
|
544
604
|
decompose_compat_internal(&ustr1, &ustr2);
|
545
605
|
WStr_free(&ustr1);
|
@@ -547,11 +607,11 @@ unicode_normalize_KC(VALUE obj, VALUE str)
|
|
547
607
|
WStr_alloc(&result);
|
548
608
|
compose_internal(&ustr2, &result);
|
549
609
|
WStr_free(&ustr2);
|
550
|
-
|
610
|
+
UniStr_alloc(&ret);
|
551
611
|
WStr_convertIntoUString(&result, &ret);
|
552
612
|
WStr_free(&result);
|
553
|
-
vret = rb_str_new(ret.str, ret.len);
|
554
|
-
|
613
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
614
|
+
UniStr_free(&ret);
|
555
615
|
|
556
616
|
return vret;
|
557
617
|
}
|
@@ -564,13 +624,16 @@ unicode_upcase(VALUE obj, VALUE str)
|
|
564
624
|
VALUE vret;
|
565
625
|
|
566
626
|
Check_Type(str, T_STRING);
|
567
|
-
|
627
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
628
|
+
CONVERT_TO_UTF8(str);
|
629
|
+
#endif
|
630
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
568
631
|
upcase_internal(&ustr);
|
569
|
-
|
632
|
+
UniStr_alloc(&ret);
|
570
633
|
WStr_convertIntoUString(&ustr, &ret);
|
571
634
|
WStr_free(&ustr);
|
572
|
-
vret = rb_str_new(ret.str, ret.len);
|
573
|
-
|
635
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
636
|
+
UniStr_free(&ret);
|
574
637
|
|
575
638
|
return vret;
|
576
639
|
}
|
@@ -583,17 +646,25 @@ unicode_downcase(VALUE obj, VALUE str)
|
|
583
646
|
VALUE vret;
|
584
647
|
|
585
648
|
Check_Type(str, T_STRING);
|
586
|
-
|
649
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
650
|
+
CONVERT_TO_UTF8(str);
|
651
|
+
#endif
|
652
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
587
653
|
downcase_internal(&ustr);
|
588
|
-
|
654
|
+
UniStr_alloc(&ret);
|
589
655
|
WStr_convertIntoUString(&ustr, &ret);
|
590
656
|
WStr_free(&ustr);
|
591
|
-
vret = rb_str_new(ret.str, ret.len);
|
592
|
-
|
657
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
658
|
+
UniStr_free(&ret);
|
593
659
|
|
594
660
|
return vret;
|
595
661
|
}
|
596
662
|
|
663
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
664
|
+
|
665
|
+
|
666
|
+
#endif
|
667
|
+
|
597
668
|
static VALUE
|
598
669
|
unicode_capitalize(VALUE obj, VALUE str)
|
599
670
|
{
|
@@ -602,13 +673,16 @@ unicode_capitalize(VALUE obj, VALUE str)
|
|
602
673
|
VALUE vret;
|
603
674
|
|
604
675
|
Check_Type(str, T_STRING);
|
605
|
-
|
676
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
677
|
+
CONVERT_TO_UTF8(str);
|
678
|
+
#endif
|
679
|
+
WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
|
606
680
|
capitalize_internal(&ustr);
|
607
|
-
|
681
|
+
UniStr_alloc(&ret);
|
608
682
|
WStr_convertIntoUString(&ustr, &ret);
|
609
683
|
WStr_free(&ustr);
|
610
|
-
vret = rb_str_new(ret.str, ret.len);
|
611
|
-
|
684
|
+
vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
|
685
|
+
UniStr_free(&ret);
|
612
686
|
|
613
687
|
return vret;
|
614
688
|
}
|
@@ -618,6 +692,10 @@ Init_unicode()
|
|
618
692
|
{
|
619
693
|
int i;
|
620
694
|
|
695
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
696
|
+
enc_out = rb_utf8_encoding();
|
697
|
+
#endif
|
698
|
+
|
621
699
|
mUnicode = rb_define_module("Unicode");
|
622
700
|
unicode_data = rb_hash_new();
|
623
701
|
composition_table = rb_hash_new();
|