unicode 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (10) hide show
  1. data/README +16 -11
  2. data/test.rb +9 -8
  3. data/tools/README +2 -2
  4. data/tools/mkunidata.rb +20 -10
  5. data/unicode.c +132 -54
  6. data/unidata.map +12976 -1764
  7. data/ustring.c +27 -25
  8. data/ustring.h +12 -12
  9. data/wstring.c +11 -11
  10. metadata +4 -4
data/README CHANGED
@@ -1,5 +1,5 @@
1
1
  Unicode Library for Ruby
2
- Version 0.1
2
+ Version 0.2.0
3
3
 
4
4
  Yoshida Masato
5
5
 
@@ -14,8 +14,8 @@
14
14
 
15
15
  - Install
16
16
 
17
- This can work with ruby-1.4 or later. I recommend you to
18
- use ruby-1.4.2 or later.
17
+ This can work with ruby-1.8 or later. I recommend you to
18
+ use ruby-1.8.1 or later.
19
19
 
20
20
  Make and install usually.
21
21
  For example, when Ruby supports dynamic linking on your OS,
@@ -36,16 +36,16 @@
36
36
 
37
37
  - Module Functions
38
38
 
39
- All parameters of functions must be UTF-8.
39
+ All parameters of functions must be UTF-8 strings.
40
40
 
41
41
  Unicode::strcmp(str1, str2)
42
42
  Unicode::strcmp_compat(str1, str2)
43
- Compares Unicode strings with normalization.
44
- strcmp uses Normalization Form D, strcmp_compat uses
43
+ Compare Unicode strings with a normalization.
44
+ strcmp uses the Normalization Form D, strcmp_compat uses
45
45
  Normalization Form KD.
46
46
 
47
- Unicode::decopose(str)
48
- Unicode::decopose_compat(str)
47
+ Unicode::decompose(str)
48
+ Unicode::decompose_compat(str)
49
49
  Decompose Unicode string. Then the trailing characters
50
50
  are sorted in canonical order.
51
51
  decompose uses the canonical decomposition,
@@ -65,12 +65,12 @@
65
65
 
66
66
  Unicode::normalize_D(str)
67
67
  Unicode::normalize_KD(str)
68
- Normalizes Unicode string in form D or form KD.
68
+ Normalize Unicode string in form D or form KD.
69
69
  These are aliases of decompose/decompose_compat.
70
70
 
71
71
  Unicode::normalize_C(str)
72
72
  Unicode::normalize_KC(str)
73
- Normalizes Unicode string in form C or form KC.
73
+ Normalize Unicode string in form C or form KC.
74
74
  normalize_C = decompose + compose
75
75
  normalize_KC = decompose_compat + compose
76
76
 
@@ -78,7 +78,7 @@
78
78
  Unicode::downcase(str)
79
79
  Unicode::capitalize(str)
80
80
  Case conversion functions.
81
- The mappings which these functions use are not normative
81
+ The mappings that are used by these functions are not normative
82
82
  in UnicodeData.txt.
83
83
 
84
84
  - Bugs
@@ -87,6 +87,8 @@
87
87
  should not be implemented with a hash of string for better
88
88
  performance.
89
89
 
90
+ Case conversion functions should reflect UTR #21.
91
+
90
92
 
91
93
  - Copying
92
94
 
@@ -104,4 +106,7 @@
104
106
 
105
107
  - History
106
108
 
109
+ Dec 29, 2009 version 0.2.0 update for Ruby 1.9.1 and Unicode 5.2
110
+ Sep 10, 2005 version 0.1.2 update unidata.map for Unicode 4.1.0
111
+ Aug 26, 2004 version 0.1.1 update unidata.map for Unicode 4.0.1
107
112
  Nov 23, 1999 version 0.1
data/test.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  #! /usr/local/bin/ruby -KU
2
+ # -*- coding: utf-8 -*-
2
3
 
3
4
  require 'unicode'
4
5
 
@@ -29,12 +30,12 @@ p Unicode::strcmp("ガ", "ガ")
29
30
  p Unicode::strcmp_compat("ガ", "ガ")
30
31
 
31
32
  print "Decomposition/composition\n"
32
- p Unicode::normalize_D([?c, 0x301, 0x327].pack("U*")).udump
33
- p Unicode::normalize_D([?c, 0x327, 0x301].pack("U*")).udump
33
+ p Unicode::normalize_D([0x63, 0x301, 0x327].pack("U*")).udump
34
+ p Unicode::normalize_D([0x63, 0x327, 0x301].pack("U*")).udump
34
35
  p Unicode::normalize_D([0x107, 0x327].pack("U*")).udump
35
36
  p Unicode::normalize_D([0xe7, 0x301].pack("U*")).udump
36
- p Unicode::normalize_C([?c, 0x301, 0x327].pack("U*")).udump
37
- p Unicode::normalize_C([?c, 0x327, 0x301].pack("U*")).udump
37
+ p Unicode::normalize_C([0x63, 0x301, 0x327].pack("U*")).udump
38
+ p Unicode::normalize_C([0x63, 0x327, 0x301].pack("U*")).udump
38
39
  p Unicode::normalize_C([0x107, 0x327].pack("U*")).udump
39
40
  p Unicode::normalize_C([0xe7, 0x301].pack("U*")).udump
40
41
 
@@ -50,7 +51,7 @@ p Unicode::normalize_D("요시담").udump
50
51
  p Unicode::normalize_C("요시담").udump
51
52
 
52
53
  print "Composition Exclusion\n"
53
- print " ANGSTROM SIGN [U+221B]\n"
54
+ print " ANGSTROM SIGN [U+212B]\n"
54
55
  p Unicode::normalize_D([0x212b].pack("U")).udump
55
56
  p Unicode::normalize_C([0x212b].pack("U")).udump
56
57
  print " LATIN CAPITAL LETTER A WITH RING ABOVE [U+00C5]\n"
@@ -58,9 +59,9 @@ p Unicode::normalize_D([0x00c5].pack("U")).udump
58
59
  p Unicode::normalize_C([0x00c5].pack("U")).udump
59
60
 
60
61
  print "Case conversion\n"
61
- p Unicode::normalize_C(Unicode::upcase([?c, 0x301, 0x327, 0xff41].pack("U*"))).udump
62
- p Unicode::normalize_C(Unicode::downcase([?C, 0x301, 0x327, 0xff21].pack("U*"))).udump
63
- p Unicode::capitalize([0x1f1, ?A, ?a, 0xff21].pack("U*")).udump
62
+ p Unicode::normalize_C(Unicode::upcase([0x63, 0x301, 0x327, 0xff41].pack("U*"))).udump
63
+ p Unicode::normalize_C(Unicode::downcase([0x43, 0x301, 0x327, 0xff21].pack("U*"))).udump
64
+ p Unicode::capitalize([0x1f1, 0x41, 0x61, 0xff21].pack("U*")).udump
64
65
 
65
66
 
66
67
  ## Local variables:
data/tools/README CHANGED
@@ -1,6 +1,6 @@
1
1
  The unidata.map is created from UnicodeData.txt and
2
- CompositionExclusions.txt of Unicode 3.0.0.
2
+ DerivedNormalizationProps.txt of Unicode 4.1.0
3
3
 
4
4
  To update unidata.map,
5
5
 
6
- ruby mkunidata.rb UnicodeData.txt CompositionExclusions.txt > unidata.map
6
+ ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt > unidata.map
data/tools/mkunidata.rb CHANGED
@@ -1,13 +1,13 @@
1
1
  #! /usr/local/bin/ruby -KU
2
2
 
3
- if $KCODE != 'UTF8'
4
- raise "$KCODE must be UTF8"
5
- end
3
+ #if $KCODE != 'UTF8'
4
+ # raise "$KCODE must be UTF8"
5
+ #end
6
6
 
7
7
  HEAD=<<EOS
8
8
  /*
9
9
  * UnicodeData
10
- * 1999 by yoshidam
10
+ * Copyright 1999, 2004 by yoshidam
11
11
  *
12
12
  */
13
13
 
@@ -25,7 +25,7 @@ struct unicode_data {
25
25
  const int titlecase;
26
26
  };
27
27
 
28
- const static struct unicode_data unidata[] = {
28
+ static const struct unicode_data unidata[] = {
29
29
  EOS
30
30
 
31
31
  TAIL=<<EOS
@@ -41,7 +41,7 @@ def hex2str(hex)
41
41
  canon = ""
42
42
  compat = ""
43
43
  chars = hex.split(" ")
44
- if chars[0] =~ /^[0-9A-F]{4}$/
44
+ if chars[0] =~ /^[0-9A-F]{4,6}$/
45
45
  chars.each do |c|
46
46
  canon << [c.hex].pack("U")
47
47
  end
@@ -59,7 +59,7 @@ def hex2str(hex)
59
59
  end
60
60
 
61
61
  def hex_or_nil(str)
62
- return "-1" if str.nil?
62
+ return "-1" if str.nil? || str == ''
63
63
  return format("0x%04x", str.hex)
64
64
  end
65
65
 
@@ -81,9 +81,19 @@ exclusion = {}
81
81
  open(ARGV[1]) do |f|
82
82
  while l = f.gets
83
83
  next if l =~ /^\#/ || l =~ /^$/
84
+ next if l !~ /Full_Composition_Exclusion/
84
85
  code, = l.split(/\s/)
85
- code = code.hex
86
- exclusion[code] = true
86
+ if code =~ /^[0-9A-F]+$/
87
+ code = code.hex
88
+ exclusion[code] = true
89
+ elsif code =~ /^([0-9A-F]+)\.\.([0-9A-F]+)$/
90
+ # p [$1, $2]
91
+ scode = $1.hex
92
+ ecode = $2.hex
93
+ for code in scode..ecode
94
+ exclusion[code] = true
95
+ end
96
+ end
87
97
  end
88
98
  end
89
99
 
@@ -94,7 +104,7 @@ open(ARGV[0]) do |f|
94
104
  l.chomp!
95
105
  code, charname, gencat, ccclass, bidicat,decomp,
96
106
  dec, digit, num, mirror, uni1_0, comment, upcase,
97
- lowcase, titlecase = l.split(";");
107
+ lowcase, titlecase = l.split(";", 15);
98
108
  code = code.hex
99
109
  ccclass = ccclass.to_i
100
110
  canon, compat = hex2str(decomp)
data/unicode.c CHANGED
@@ -1,15 +1,52 @@
1
1
  /*
2
- * Unicode Library version 0.1
2
+ * Unicode Library version 0.2
3
+ * Dec 29, 2009: version 0.2
3
4
  * Nov 23, 1999 yoshidam
4
5
  *
5
6
  */
6
7
 
7
8
  #include "ruby.h"
8
- #include "rubyio.h"
9
+ #ifdef HAVE_RUBY_IO_H
10
+ # include "ruby/io.h"
11
+ #else
12
+ # include "rubyio.h"
13
+ #endif
9
14
  #include <stdio.h>
10
15
  #include "wstring.h"
11
16
  #include "unidata.map"
12
17
 
18
+ #ifndef RSTRING_PTR
19
+ # define RSTRING_PTR(s) (RSTRING(s)->ptr)
20
+ # define RSTRING_LEN(s) (RSTRING(s)->len)
21
+ #endif
22
+
23
+ #ifdef HAVE_RUBY_ENCODING_H
24
+ static rb_encoding* enc_out;
25
+ # define ENC_(o) (rb_enc_associate(o, enc_out))
26
+ #else
27
+ # define ENC_(o) (o)
28
+ #endif
29
+
30
+ inline static VALUE
31
+ taintObject(VALUE src, VALUE obj) {
32
+ if (OBJ_TAINTED(src))
33
+ OBJ_TAINT(obj);
34
+ return obj;
35
+ }
36
+ #define TO_(src, obj) (taintObject(src, obj))
37
+
38
+ #ifdef HAVE_RUBY_ENCODING_H
39
+ # define CONVERT_TO_UTF8(str) do { \
40
+ int encindex = ENCODING_GET(str); \
41
+ volatile VALUE encobj; \
42
+ if (encindex != rb_utf8_encindex() && \
43
+ encindex != rb_usascii_encindex()) { \
44
+ encobj = rb_enc_from_encoding(enc_out); \
45
+ str = rb_str_encode(str, encobj, 0, Qnil); \
46
+ } \
47
+ } while (0)
48
+ #endif
49
+
13
50
  static VALUE mUnicode;
14
51
  static VALUE unicode_data;
15
52
  static VALUE composition_table;
@@ -58,7 +95,7 @@ get_compat(int ucs)
58
95
  return NULL;
59
96
  }
60
97
 
61
- static const int
98
+ static int
62
99
  get_uppercase(int ucs)
63
100
  {
64
101
  VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
@@ -252,7 +289,7 @@ decompose_compat_internal(WString* ustr, WString* result)
252
289
  } while (0)
253
290
 
254
291
  static int
255
- compose_pair(int c1, int c2)
292
+ compose_pair(unsigned int c1, unsigned int c2)
256
293
  {
257
294
  int ret;
258
295
  char ustr[13]; /* stored two UTF-8 chars */
@@ -370,8 +407,12 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
370
407
 
371
408
  Check_Type(str1, T_STRING);
372
409
  Check_Type(str2, T_STRING);
373
- WStr_allocWithUTF8(&wstr1, RSTRING(str1)->ptr);
374
- WStr_allocWithUTF8(&wstr2, RSTRING(str2)->ptr);
410
+ #ifdef HAVE_RUBY_ENCODING_H
411
+ CONVERT_TO_UTF8(str1);
412
+ CONVERT_TO_UTF8(str2);
413
+ #endif
414
+ WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
415
+ WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
375
416
  WStr_alloc(&result1);
376
417
  WStr_alloc(&result2);
377
418
  decompose_internal(&wstr1, &result1);
@@ -380,17 +421,17 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
380
421
  WStr_free(&wstr2);
381
422
  sort_canonical(&result1);
382
423
  sort_canonical(&result2);
383
- UStr_alloc(&ustr1);
384
- UStr_alloc(&ustr2);
424
+ UniStr_alloc(&ustr1);
425
+ UniStr_alloc(&ustr2);
385
426
  WStr_convertIntoUString(&result1, &ustr1);
386
427
  WStr_convertIntoUString(&result2, &ustr2);
387
428
  WStr_free(&result1);
388
429
  WStr_free(&result2);
389
- UStr_addChar(&ustr1, '\0');
390
- UStr_addChar(&ustr2, '\0');
391
- ret = strcmp(ustr1.str, ustr2.str);
392
- UStr_free(&ustr1);
393
- UStr_free(&ustr2);
430
+ UniStr_addChar(&ustr1, '\0');
431
+ UniStr_addChar(&ustr2, '\0');
432
+ ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
433
+ UniStr_free(&ustr1);
434
+ UniStr_free(&ustr2);
394
435
 
395
436
  return INT2FIX(ret);
396
437
  }
@@ -408,8 +449,12 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
408
449
 
409
450
  Check_Type(str1, T_STRING);
410
451
  Check_Type(str2, T_STRING);
411
- WStr_allocWithUTF8(&wstr1, RSTRING(str1)->ptr);
412
- WStr_allocWithUTF8(&wstr2, RSTRING(str2)->ptr);
452
+ #ifdef HAVE_RUBY_ENCODING_H
453
+ CONVERT_TO_UTF8(str1);
454
+ CONVERT_TO_UTF8(str2);
455
+ #endif
456
+ WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
457
+ WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
413
458
  WStr_alloc(&result1);
414
459
  WStr_alloc(&result2);
415
460
  decompose_compat_internal(&wstr1, &result1);
@@ -418,17 +463,17 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
418
463
  WStr_free(&wstr2);
419
464
  sort_canonical(&result1);
420
465
  sort_canonical(&result2);
421
- UStr_alloc(&ustr1);
422
- UStr_alloc(&ustr2);
466
+ UniStr_alloc(&ustr1);
467
+ UniStr_alloc(&ustr2);
423
468
  WStr_convertIntoUString(&result1, &ustr1);
424
469
  WStr_convertIntoUString(&result2, &ustr2);
425
470
  WStr_free(&result1);
426
471
  WStr_free(&result2);
427
- UStr_addChar(&ustr1, '\0');
428
- UStr_addChar(&ustr2, '\0');
429
- ret = strcmp(ustr1.str, ustr2.str);
430
- UStr_free(&ustr1);
431
- UStr_free(&ustr2);
472
+ UniStr_addChar(&ustr1, '\0');
473
+ UniStr_addChar(&ustr2, '\0');
474
+ ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
475
+ UniStr_free(&ustr1);
476
+ UniStr_free(&ustr2);
432
477
 
433
478
  return INT2FIX(ret);
434
479
  }
@@ -442,16 +487,19 @@ unicode_decompose(VALUE obj, VALUE str)
442
487
  VALUE vret;
443
488
 
444
489
  Check_Type(str, T_STRING);
445
- WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
490
+ #ifdef HAVE_RUBY_ENCODING_H
491
+ CONVERT_TO_UTF8(str);
492
+ #endif
493
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
446
494
  WStr_alloc(&result);
447
495
  decompose_internal(&ustr, &result);
448
496
  WStr_free(&ustr);
449
497
  sort_canonical(&result);
450
- UStr_alloc(&ret);
498
+ UniStr_alloc(&ret);
451
499
  WStr_convertIntoUString(&result, &ret);
452
500
  WStr_free(&result);
453
- vret = rb_str_new(ret.str, ret.len);
454
- UStr_free(&ret);
501
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
502
+ UniStr_free(&ret);
455
503
 
456
504
  return vret;
457
505
  }
@@ -465,16 +513,19 @@ unicode_decompose_compat(VALUE obj, VALUE str)
465
513
  VALUE vret;
466
514
 
467
515
  Check_Type(str, T_STRING);
468
- WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
516
+ #ifdef HAVE_RUBY_ENCODING_H
517
+ CONVERT_TO_UTF8(str);
518
+ #endif
519
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
469
520
  WStr_alloc(&result);
470
521
  decompose_compat_internal(&ustr, &result);
471
522
  WStr_free(&ustr);
472
523
  sort_canonical(&result);
473
- UStr_alloc(&ret);
524
+ UniStr_alloc(&ret);
474
525
  WStr_convertIntoUString(&result, &ret);
475
526
  WStr_free(&result);
476
- vret = rb_str_new(ret.str, ret.len);
477
- UStr_free(&ret);
527
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
528
+ UniStr_free(&ret);
478
529
 
479
530
  return vret;
480
531
  }
@@ -488,16 +539,19 @@ unicode_compose(VALUE obj, VALUE str)
488
539
  VALUE vret;
489
540
 
490
541
  Check_Type(str, T_STRING);
491
- WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
542
+ #ifdef HAVE_RUBY_ENCODING_H
543
+ CONVERT_TO_UTF8(str);
544
+ #endif
545
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
492
546
  sort_canonical(&ustr);
493
547
  WStr_alloc(&result);
494
548
  compose_internal(&ustr, &result);
495
549
  WStr_free(&ustr);
496
- UStr_alloc(&ret);
550
+ UniStr_alloc(&ret);
497
551
  WStr_convertIntoUString(&result, &ret);
498
552
  WStr_free(&result);
499
- vret = rb_str_new(ret.str, ret.len);
500
- UStr_free(&ret);
553
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
554
+ UniStr_free(&ret);
501
555
 
502
556
  return vret;
503
557
  }
@@ -512,7 +566,10 @@ unicode_normalize_C(VALUE obj, VALUE str)
512
566
  VALUE vret;
513
567
 
514
568
  Check_Type(str, T_STRING);
515
- WStr_allocWithUTF8(&ustr1, RSTRING(str)->ptr);
569
+ #ifdef HAVE_RUBY_ENCODING_H
570
+ CONVERT_TO_UTF8(str);
571
+ #endif
572
+ WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
516
573
  WStr_alloc(&ustr2);
517
574
  decompose_internal(&ustr1, &ustr2);
518
575
  WStr_free(&ustr1);
@@ -520,11 +577,11 @@ unicode_normalize_C(VALUE obj, VALUE str)
520
577
  WStr_alloc(&result);
521
578
  compose_internal(&ustr2, &result);
522
579
  WStr_free(&ustr2);
523
- UStr_alloc(&ret);
580
+ UniStr_alloc(&ret);
524
581
  WStr_convertIntoUString(&result, &ret);
525
582
  WStr_free(&result);
526
- vret = rb_str_new(ret.str, ret.len);
527
- UStr_free(&ret);
583
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
584
+ UniStr_free(&ret);
528
585
 
529
586
  return vret;
530
587
  }
@@ -539,7 +596,10 @@ unicode_normalize_KC(VALUE obj, VALUE str)
539
596
  VALUE vret;
540
597
 
541
598
  Check_Type(str, T_STRING);
542
- WStr_allocWithUTF8(&ustr1, RSTRING(str)->ptr);
599
+ #ifdef HAVE_RUBY_ENCODING_H
600
+ CONVERT_TO_UTF8(str);
601
+ #endif
602
+ WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
543
603
  WStr_alloc(&ustr2);
544
604
  decompose_compat_internal(&ustr1, &ustr2);
545
605
  WStr_free(&ustr1);
@@ -547,11 +607,11 @@ unicode_normalize_KC(VALUE obj, VALUE str)
547
607
  WStr_alloc(&result);
548
608
  compose_internal(&ustr2, &result);
549
609
  WStr_free(&ustr2);
550
- UStr_alloc(&ret);
610
+ UniStr_alloc(&ret);
551
611
  WStr_convertIntoUString(&result, &ret);
552
612
  WStr_free(&result);
553
- vret = rb_str_new(ret.str, ret.len);
554
- UStr_free(&ret);
613
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
614
+ UniStr_free(&ret);
555
615
 
556
616
  return vret;
557
617
  }
@@ -564,13 +624,16 @@ unicode_upcase(VALUE obj, VALUE str)
564
624
  VALUE vret;
565
625
 
566
626
  Check_Type(str, T_STRING);
567
- WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
627
+ #ifdef HAVE_RUBY_ENCODING_H
628
+ CONVERT_TO_UTF8(str);
629
+ #endif
630
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
568
631
  upcase_internal(&ustr);
569
- UStr_alloc(&ret);
632
+ UniStr_alloc(&ret);
570
633
  WStr_convertIntoUString(&ustr, &ret);
571
634
  WStr_free(&ustr);
572
- vret = rb_str_new(ret.str, ret.len);
573
- UStr_free(&ret);
635
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
636
+ UniStr_free(&ret);
574
637
 
575
638
  return vret;
576
639
  }
@@ -583,17 +646,25 @@ unicode_downcase(VALUE obj, VALUE str)
583
646
  VALUE vret;
584
647
 
585
648
  Check_Type(str, T_STRING);
586
- WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
649
+ #ifdef HAVE_RUBY_ENCODING_H
650
+ CONVERT_TO_UTF8(str);
651
+ #endif
652
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
587
653
  downcase_internal(&ustr);
588
- UStr_alloc(&ret);
654
+ UniStr_alloc(&ret);
589
655
  WStr_convertIntoUString(&ustr, &ret);
590
656
  WStr_free(&ustr);
591
- vret = rb_str_new(ret.str, ret.len);
592
- UStr_free(&ret);
657
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
658
+ UniStr_free(&ret);
593
659
 
594
660
  return vret;
595
661
  }
596
662
 
663
+ #ifdef HAVE_RUBY_ENCODING_H
664
+
665
+
666
+ #endif
667
+
597
668
  static VALUE
598
669
  unicode_capitalize(VALUE obj, VALUE str)
599
670
  {
@@ -602,13 +673,16 @@ unicode_capitalize(VALUE obj, VALUE str)
602
673
  VALUE vret;
603
674
 
604
675
  Check_Type(str, T_STRING);
605
- WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
676
+ #ifdef HAVE_RUBY_ENCODING_H
677
+ CONVERT_TO_UTF8(str);
678
+ #endif
679
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
606
680
  capitalize_internal(&ustr);
607
- UStr_alloc(&ret);
681
+ UniStr_alloc(&ret);
608
682
  WStr_convertIntoUString(&ustr, &ret);
609
683
  WStr_free(&ustr);
610
- vret = rb_str_new(ret.str, ret.len);
611
- UStr_free(&ret);
684
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
685
+ UniStr_free(&ret);
612
686
 
613
687
  return vret;
614
688
  }
@@ -618,6 +692,10 @@ Init_unicode()
618
692
  {
619
693
  int i;
620
694
 
695
+ #ifdef HAVE_RUBY_ENCODING_H
696
+ enc_out = rb_utf8_encoding();
697
+ #endif
698
+
621
699
  mUnicode = rb_define_module("Unicode");
622
700
  unicode_data = rb_hash_new();
623
701
  composition_table = rb_hash_new();