unicode 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (10)
  1. data/README +16 -11
  2. data/test.rb +9 -8
  3. data/tools/README +2 -2
  4. data/tools/mkunidata.rb +20 -10
  5. data/unicode.c +132 -54
  6. data/unidata.map +12976 -1764
  7. data/ustring.c +27 -25
  8. data/ustring.h +12 -12
  9. data/wstring.c +11 -11
  10. metadata +4 -4
data/README CHANGED
@@ -1,5 +1,5 @@
1
1
  Unicode Library for Ruby
2
- Version 0.1
2
+ Version 0.2.0
3
3
 
4
4
  Yoshida Masato
5
5
 
@@ -14,8 +14,8 @@
14
14
 
15
15
  - Install
16
16
 
17
- This can work with ruby-1.4 or later. I recommend you to
18
- use ruby-1.4.2 or later.
17
+ This can work with ruby-1.8 or later. I recommend you to
18
+ use ruby-1.8.1 or later.
19
19
 
20
20
  Make and install usually.
21
21
  For example, when Ruby supports dynamic linking on your OS,
@@ -36,16 +36,16 @@
36
36
 
37
37
  - Module Functions
38
38
 
39
- All parameters of functions must be UTF-8.
39
+ All parameters of functions must be UTF-8 strings.
40
40
 
41
41
  Unicode::strcmp(str1, str2)
42
42
  Unicode::strcmp_compat(str1, str2)
43
- Compares Unicode strings with normalization.
44
- strcmp uses Normalization Form D, strcmp_compat uses
43
+ Compare Unicode strings with a normalization.
44
+ strcmp uses the Normalization Form D, strcmp_compat uses
45
45
  Normalization Form KD.
46
46
 
47
- Unicode::decopose(str)
48
- Unicode::decopose_compat(str)
47
+ Unicode::decompose(str)
48
+ Unicode::decompose_compat(str)
49
49
  Decompose Unicode string. Then the trailing characters
50
50
  are sorted in canonical order.
51
51
  decompose uses the canonical decomposition,
@@ -65,12 +65,12 @@
65
65
 
66
66
  Unicode::normalize_D(str)
67
67
  Unicode::normalize_KD(str)
68
- Normalizes Unicode string in form D or form KD.
68
+ Normalize Unicode string in form D or form KD.
69
69
  These are aliases of decompose/decompose_compat.
70
70
 
71
71
  Unicode::normalize_C(str)
72
72
  Unicode::normalize_KC(str)
73
- Normalizes Unicode string in form C or form KC.
73
+ Normalize Unicode string in form C or form KC.
74
74
  normalize_C = decompose + compose
75
75
  normalize_KC = decompose_compat + compose
76
76
 
@@ -78,7 +78,7 @@
78
78
  Unicode::downcase(str)
79
79
  Unicode::capitalize(str)
80
80
  Case conversion functions.
81
- The mappings which these functions use are not normative
81
+ The mappings that are used by these functions are not normative
82
82
  in UnicodeData.txt.
83
83
 
84
84
  - Bugs
@@ -87,6 +87,8 @@
87
87
  should not be implemented with a hash of string for better
88
88
  performance.
89
89
 
90
+ Case conversion functions should reflect UTR #21.
91
+
90
92
 
91
93
  - Copying
92
94
 
@@ -104,4 +106,7 @@
104
106
 
105
107
  - History
106
108
 
109
+ Dec 29, 2009 version 0.2.0 update for Ruby 1.9.1 and Unicode 5.2
110
+ Sep 10, 2005 version 0.1.2 update unidata.map for Unicode 4.1.0
111
+ Aug 26, 2004 version 0.1.1 update unidata.map for Unicode 4.0.1
107
112
  Nov 23, 1999 version 0.1
data/test.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  #! /usr/local/bin/ruby -KU
2
+ # -*- coding: utf-8 -*-
2
3
 
3
4
  require 'unicode'
4
5
 
@@ -29,12 +30,12 @@ p Unicode::strcmp("ガ", "ガ")
29
30
  p Unicode::strcmp_compat("ガ", "ガ")
30
31
 
31
32
  print "Decomposition/composition\n"
32
- p Unicode::normalize_D([?c, 0x301, 0x327].pack("U*")).udump
33
- p Unicode::normalize_D([?c, 0x327, 0x301].pack("U*")).udump
33
+ p Unicode::normalize_D([0x63, 0x301, 0x327].pack("U*")).udump
34
+ p Unicode::normalize_D([0x63, 0x327, 0x301].pack("U*")).udump
34
35
  p Unicode::normalize_D([0x107, 0x327].pack("U*")).udump
35
36
  p Unicode::normalize_D([0xe7, 0x301].pack("U*")).udump
36
- p Unicode::normalize_C([?c, 0x301, 0x327].pack("U*")).udump
37
- p Unicode::normalize_C([?c, 0x327, 0x301].pack("U*")).udump
37
+ p Unicode::normalize_C([0x63, 0x301, 0x327].pack("U*")).udump
38
+ p Unicode::normalize_C([0x63, 0x327, 0x301].pack("U*")).udump
38
39
  p Unicode::normalize_C([0x107, 0x327].pack("U*")).udump
39
40
  p Unicode::normalize_C([0xe7, 0x301].pack("U*")).udump
40
41
 
@@ -50,7 +51,7 @@ p Unicode::normalize_D("요시담").udump
50
51
  p Unicode::normalize_C("요시담").udump
51
52
 
52
53
  print "Composition Exclusion\n"
53
- print " ANGSTROM SIGN [U+221B]\n"
54
+ print " ANGSTROM SIGN [U+212B]\n"
54
55
  p Unicode::normalize_D([0x212b].pack("U")).udump
55
56
  p Unicode::normalize_C([0x212b].pack("U")).udump
56
57
  print " LATIN CAPITAL LETTER A WITH RING ABOVE [U+00C5]\n"
@@ -58,9 +59,9 @@ p Unicode::normalize_D([0x00c5].pack("U")).udump
58
59
  p Unicode::normalize_C([0x00c5].pack("U")).udump
59
60
 
60
61
  print "Case conversion\n"
61
- p Unicode::normalize_C(Unicode::upcase([?c, 0x301, 0x327, 0xff41].pack("U*"))).udump
62
- p Unicode::normalize_C(Unicode::downcase([?C, 0x301, 0x327, 0xff21].pack("U*"))).udump
63
- p Unicode::capitalize([0x1f1, ?A, ?a, 0xff21].pack("U*")).udump
62
+ p Unicode::normalize_C(Unicode::upcase([0x63, 0x301, 0x327, 0xff41].pack("U*"))).udump
63
+ p Unicode::normalize_C(Unicode::downcase([0x43, 0x301, 0x327, 0xff21].pack("U*"))).udump
64
+ p Unicode::capitalize([0x1f1, 0x41, 0x61, 0xff21].pack("U*")).udump
64
65
 
65
66
 
66
67
  ## Local variables:
data/tools/README CHANGED
@@ -1,6 +1,6 @@
1
1
  The unidata.map is created from UnicodeData.txt and
2
- CompositionExclusions.txt of Unicode 3.0.0.
2
+ DerivedNormalizationProps.txt of Unicode 4.1.0
3
3
 
4
4
  To update unidata.map,
5
5
 
6
- ruby mkunidata.rb UnicodeData.txt CompositionExclusions.txt > unidata.map
6
+ ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt > unidata.map
data/tools/mkunidata.rb CHANGED
@@ -1,13 +1,13 @@
1
1
  #! /usr/local/bin/ruby -KU
2
2
 
3
- if $KCODE != 'UTF8'
4
- raise "$KCODE must be UTF8"
5
- end
3
+ #if $KCODE != 'UTF8'
4
+ # raise "$KCODE must be UTF8"
5
+ #end
6
6
 
7
7
  HEAD=<<EOS
8
8
  /*
9
9
  * UnicodeData
10
- * 1999 by yoshidam
10
+ * Copyright 1999, 2004 by yoshidam
11
11
  *
12
12
  */
13
13
 
@@ -25,7 +25,7 @@ struct unicode_data {
25
25
  const int titlecase;
26
26
  };
27
27
 
28
- const static struct unicode_data unidata[] = {
28
+ static const struct unicode_data unidata[] = {
29
29
  EOS
30
30
 
31
31
  TAIL=<<EOS
@@ -41,7 +41,7 @@ def hex2str(hex)
41
41
  canon = ""
42
42
  compat = ""
43
43
  chars = hex.split(" ")
44
- if chars[0] =~ /^[0-9A-F]{4}$/
44
+ if chars[0] =~ /^[0-9A-F]{4,6}$/
45
45
  chars.each do |c|
46
46
  canon << [c.hex].pack("U")
47
47
  end
@@ -59,7 +59,7 @@ def hex2str(hex)
59
59
  end
60
60
 
61
61
  def hex_or_nil(str)
62
- return "-1" if str.nil?
62
+ return "-1" if str.nil? || str == ''
63
63
  return format("0x%04x", str.hex)
64
64
  end
65
65
 
@@ -81,9 +81,19 @@ exclusion = {}
81
81
  open(ARGV[1]) do |f|
82
82
  while l = f.gets
83
83
  next if l =~ /^\#/ || l =~ /^$/
84
+ next if l !~ /Full_Composition_Exclusion/
84
85
  code, = l.split(/\s/)
85
- code = code.hex
86
- exclusion[code] = true
86
+ if code =~ /^[0-9A-F]+$/
87
+ code = code.hex
88
+ exclusion[code] = true
89
+ elsif code =~ /^([0-9A-F]+)\.\.([0-9A-F]+)$/
90
+ # p [$1, $2]
91
+ scode = $1.hex
92
+ ecode = $2.hex
93
+ for code in scode..ecode
94
+ exclusion[code] = true
95
+ end
96
+ end
87
97
  end
88
98
  end
89
99
 
@@ -94,7 +104,7 @@ open(ARGV[0]) do |f|
94
104
  l.chomp!
95
105
  code, charname, gencat, ccclass, bidicat,decomp,
96
106
  dec, digit, num, mirror, uni1_0, comment, upcase,
97
- lowcase, titlecase = l.split(";");
107
+ lowcase, titlecase = l.split(";", 15);
98
108
  code = code.hex
99
109
  ccclass = ccclass.to_i
100
110
  canon, compat = hex2str(decomp)
data/unicode.c CHANGED
@@ -1,15 +1,52 @@
1
1
  /*
2
- * Unicode Library version 0.1
2
+ * Unicode Library version 0.2
3
+ * Dec 29, 2009: version 0.2
3
4
  * Nov 23, 1999 yoshidam
4
5
  *
5
6
  */
6
7
 
7
8
  #include "ruby.h"
8
- #include "rubyio.h"
9
+ #ifdef HAVE_RUBY_IO_H
10
+ # include "ruby/io.h"
11
+ #else
12
+ # include "rubyio.h"
13
+ #endif
9
14
  #include <stdio.h>
10
15
  #include "wstring.h"
11
16
  #include "unidata.map"
12
17
 
18
+ #ifndef RSTRING_PTR
19
+ # define RSTRING_PTR(s) (RSTRING(s)->ptr)
20
+ # define RSTRING_LEN(s) (RSTRING(s)->len)
21
+ #endif
22
+
23
+ #ifdef HAVE_RUBY_ENCODING_H
24
+ static rb_encoding* enc_out;
25
+ # define ENC_(o) (rb_enc_associate(o, enc_out))
26
+ #else
27
+ # define ENC_(o) (o)
28
+ #endif
29
+
30
+ inline static VALUE
31
+ taintObject(VALUE src, VALUE obj) {
32
+ if (OBJ_TAINTED(src))
33
+ OBJ_TAINT(obj);
34
+ return obj;
35
+ }
36
+ #define TO_(src, obj) (taintObject(src, obj))
37
+
38
+ #ifdef HAVE_RUBY_ENCODING_H
39
+ # define CONVERT_TO_UTF8(str) do { \
40
+ int encindex = ENCODING_GET(str); \
41
+ volatile VALUE encobj; \
42
+ if (encindex != rb_utf8_encindex() && \
43
+ encindex != rb_usascii_encindex()) { \
44
+ encobj = rb_enc_from_encoding(enc_out); \
45
+ str = rb_str_encode(str, encobj, 0, Qnil); \
46
+ } \
47
+ } while (0)
48
+ #endif
49
+
13
50
  static VALUE mUnicode;
14
51
  static VALUE unicode_data;
15
52
  static VALUE composition_table;
@@ -58,7 +95,7 @@ get_compat(int ucs)
58
95
  return NULL;
59
96
  }
60
97
 
61
- static const int
98
+ static int
62
99
  get_uppercase(int ucs)
63
100
  {
64
101
  VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
@@ -252,7 +289,7 @@ decompose_compat_internal(WString* ustr, WString* result)
252
289
  } while (0)
253
290
 
254
291
  static int
255
- compose_pair(int c1, int c2)
292
+ compose_pair(unsigned int c1, unsigned int c2)
256
293
  {
257
294
  int ret;
258
295
  char ustr[13]; /* stored two UTF-8 chars */
@@ -370,8 +407,12 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
370
407
 
371
408
  Check_Type(str1, T_STRING);
372
409
  Check_Type(str2, T_STRING);
373
- WStr_allocWithUTF8(&wstr1, RSTRING(str1)->ptr);
374
- WStr_allocWithUTF8(&wstr2, RSTRING(str2)->ptr);
410
+ #ifdef HAVE_RUBY_ENCODING_H
411
+ CONVERT_TO_UTF8(str1);
412
+ CONVERT_TO_UTF8(str2);
413
+ #endif
414
+ WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
415
+ WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
375
416
  WStr_alloc(&result1);
376
417
  WStr_alloc(&result2);
377
418
  decompose_internal(&wstr1, &result1);
@@ -380,17 +421,17 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
380
421
  WStr_free(&wstr2);
381
422
  sort_canonical(&result1);
382
423
  sort_canonical(&result2);
383
- UStr_alloc(&ustr1);
384
- UStr_alloc(&ustr2);
424
+ UniStr_alloc(&ustr1);
425
+ UniStr_alloc(&ustr2);
385
426
  WStr_convertIntoUString(&result1, &ustr1);
386
427
  WStr_convertIntoUString(&result2, &ustr2);
387
428
  WStr_free(&result1);
388
429
  WStr_free(&result2);
389
- UStr_addChar(&ustr1, '\0');
390
- UStr_addChar(&ustr2, '\0');
391
- ret = strcmp(ustr1.str, ustr2.str);
392
- UStr_free(&ustr1);
393
- UStr_free(&ustr2);
430
+ UniStr_addChar(&ustr1, '\0');
431
+ UniStr_addChar(&ustr2, '\0');
432
+ ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
433
+ UniStr_free(&ustr1);
434
+ UniStr_free(&ustr2);
394
435
 
395
436
  return INT2FIX(ret);
396
437
  }
@@ -408,8 +449,12 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
408
449
 
409
450
  Check_Type(str1, T_STRING);
410
451
  Check_Type(str2, T_STRING);
411
- WStr_allocWithUTF8(&wstr1, RSTRING(str1)->ptr);
412
- WStr_allocWithUTF8(&wstr2, RSTRING(str2)->ptr);
452
+ #ifdef HAVE_RUBY_ENCODING_H
453
+ CONVERT_TO_UTF8(str1);
454
+ CONVERT_TO_UTF8(str2);
455
+ #endif
456
+ WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
457
+ WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
413
458
  WStr_alloc(&result1);
414
459
  WStr_alloc(&result2);
415
460
  decompose_compat_internal(&wstr1, &result1);
@@ -418,17 +463,17 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
418
463
  WStr_free(&wstr2);
419
464
  sort_canonical(&result1);
420
465
  sort_canonical(&result2);
421
- UStr_alloc(&ustr1);
422
- UStr_alloc(&ustr2);
466
+ UniStr_alloc(&ustr1);
467
+ UniStr_alloc(&ustr2);
423
468
  WStr_convertIntoUString(&result1, &ustr1);
424
469
  WStr_convertIntoUString(&result2, &ustr2);
425
470
  WStr_free(&result1);
426
471
  WStr_free(&result2);
427
- UStr_addChar(&ustr1, '\0');
428
- UStr_addChar(&ustr2, '\0');
429
- ret = strcmp(ustr1.str, ustr2.str);
430
- UStr_free(&ustr1);
431
- UStr_free(&ustr2);
472
+ UniStr_addChar(&ustr1, '\0');
473
+ UniStr_addChar(&ustr2, '\0');
474
+ ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
475
+ UniStr_free(&ustr1);
476
+ UniStr_free(&ustr2);
432
477
 
433
478
  return INT2FIX(ret);
434
479
  }
@@ -442,16 +487,19 @@ unicode_decompose(VALUE obj, VALUE str)
442
487
  VALUE vret;
443
488
 
444
489
  Check_Type(str, T_STRING);
445
- WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
490
+ #ifdef HAVE_RUBY_ENCODING_H
491
+ CONVERT_TO_UTF8(str);
492
+ #endif
493
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
446
494
  WStr_alloc(&result);
447
495
  decompose_internal(&ustr, &result);
448
496
  WStr_free(&ustr);
449
497
  sort_canonical(&result);
450
- UStr_alloc(&ret);
498
+ UniStr_alloc(&ret);
451
499
  WStr_convertIntoUString(&result, &ret);
452
500
  WStr_free(&result);
453
- vret = rb_str_new(ret.str, ret.len);
454
- UStr_free(&ret);
501
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
502
+ UniStr_free(&ret);
455
503
 
456
504
  return vret;
457
505
  }
@@ -465,16 +513,19 @@ unicode_decompose_compat(VALUE obj, VALUE str)
465
513
  VALUE vret;
466
514
 
467
515
  Check_Type(str, T_STRING);
468
- WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
516
+ #ifdef HAVE_RUBY_ENCODING_H
517
+ CONVERT_TO_UTF8(str);
518
+ #endif
519
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
469
520
  WStr_alloc(&result);
470
521
  decompose_compat_internal(&ustr, &result);
471
522
  WStr_free(&ustr);
472
523
  sort_canonical(&result);
473
- UStr_alloc(&ret);
524
+ UniStr_alloc(&ret);
474
525
  WStr_convertIntoUString(&result, &ret);
475
526
  WStr_free(&result);
476
- vret = rb_str_new(ret.str, ret.len);
477
- UStr_free(&ret);
527
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
528
+ UniStr_free(&ret);
478
529
 
479
530
  return vret;
480
531
  }
@@ -488,16 +539,19 @@ unicode_compose(VALUE obj, VALUE str)
488
539
  VALUE vret;
489
540
 
490
541
  Check_Type(str, T_STRING);
491
- WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
542
+ #ifdef HAVE_RUBY_ENCODING_H
543
+ CONVERT_TO_UTF8(str);
544
+ #endif
545
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
492
546
  sort_canonical(&ustr);
493
547
  WStr_alloc(&result);
494
548
  compose_internal(&ustr, &result);
495
549
  WStr_free(&ustr);
496
- UStr_alloc(&ret);
550
+ UniStr_alloc(&ret);
497
551
  WStr_convertIntoUString(&result, &ret);
498
552
  WStr_free(&result);
499
- vret = rb_str_new(ret.str, ret.len);
500
- UStr_free(&ret);
553
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
554
+ UniStr_free(&ret);
501
555
 
502
556
  return vret;
503
557
  }
@@ -512,7 +566,10 @@ unicode_normalize_C(VALUE obj, VALUE str)
512
566
  VALUE vret;
513
567
 
514
568
  Check_Type(str, T_STRING);
515
- WStr_allocWithUTF8(&ustr1, RSTRING(str)->ptr);
569
+ #ifdef HAVE_RUBY_ENCODING_H
570
+ CONVERT_TO_UTF8(str);
571
+ #endif
572
+ WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
516
573
  WStr_alloc(&ustr2);
517
574
  decompose_internal(&ustr1, &ustr2);
518
575
  WStr_free(&ustr1);
@@ -520,11 +577,11 @@ unicode_normalize_C(VALUE obj, VALUE str)
520
577
  WStr_alloc(&result);
521
578
  compose_internal(&ustr2, &result);
522
579
  WStr_free(&ustr2);
523
- UStr_alloc(&ret);
580
+ UniStr_alloc(&ret);
524
581
  WStr_convertIntoUString(&result, &ret);
525
582
  WStr_free(&result);
526
- vret = rb_str_new(ret.str, ret.len);
527
- UStr_free(&ret);
583
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
584
+ UniStr_free(&ret);
528
585
 
529
586
  return vret;
530
587
  }
@@ -539,7 +596,10 @@ unicode_normalize_KC(VALUE obj, VALUE str)
539
596
  VALUE vret;
540
597
 
541
598
  Check_Type(str, T_STRING);
542
- WStr_allocWithUTF8(&ustr1, RSTRING(str)->ptr);
599
+ #ifdef HAVE_RUBY_ENCODING_H
600
+ CONVERT_TO_UTF8(str);
601
+ #endif
602
+ WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
543
603
  WStr_alloc(&ustr2);
544
604
  decompose_compat_internal(&ustr1, &ustr2);
545
605
  WStr_free(&ustr1);
@@ -547,11 +607,11 @@ unicode_normalize_KC(VALUE obj, VALUE str)
547
607
  WStr_alloc(&result);
548
608
  compose_internal(&ustr2, &result);
549
609
  WStr_free(&ustr2);
550
- UStr_alloc(&ret);
610
+ UniStr_alloc(&ret);
551
611
  WStr_convertIntoUString(&result, &ret);
552
612
  WStr_free(&result);
553
- vret = rb_str_new(ret.str, ret.len);
554
- UStr_free(&ret);
613
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
614
+ UniStr_free(&ret);
555
615
 
556
616
  return vret;
557
617
  }
@@ -564,13 +624,16 @@ unicode_upcase(VALUE obj, VALUE str)
564
624
  VALUE vret;
565
625
 
566
626
  Check_Type(str, T_STRING);
567
- WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
627
+ #ifdef HAVE_RUBY_ENCODING_H
628
+ CONVERT_TO_UTF8(str);
629
+ #endif
630
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
568
631
  upcase_internal(&ustr);
569
- UStr_alloc(&ret);
632
+ UniStr_alloc(&ret);
570
633
  WStr_convertIntoUString(&ustr, &ret);
571
634
  WStr_free(&ustr);
572
- vret = rb_str_new(ret.str, ret.len);
573
- UStr_free(&ret);
635
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
636
+ UniStr_free(&ret);
574
637
 
575
638
  return vret;
576
639
  }
@@ -583,17 +646,25 @@ unicode_downcase(VALUE obj, VALUE str)
583
646
  VALUE vret;
584
647
 
585
648
  Check_Type(str, T_STRING);
586
- WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
649
+ #ifdef HAVE_RUBY_ENCODING_H
650
+ CONVERT_TO_UTF8(str);
651
+ #endif
652
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
587
653
  downcase_internal(&ustr);
588
- UStr_alloc(&ret);
654
+ UniStr_alloc(&ret);
589
655
  WStr_convertIntoUString(&ustr, &ret);
590
656
  WStr_free(&ustr);
591
- vret = rb_str_new(ret.str, ret.len);
592
- UStr_free(&ret);
657
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
658
+ UniStr_free(&ret);
593
659
 
594
660
  return vret;
595
661
  }
596
662
 
663
+ #ifdef HAVE_RUBY_ENCODING_H
664
+
665
+
666
+ #endif
667
+
597
668
  static VALUE
598
669
  unicode_capitalize(VALUE obj, VALUE str)
599
670
  {
@@ -602,13 +673,16 @@ unicode_capitalize(VALUE obj, VALUE str)
602
673
  VALUE vret;
603
674
 
604
675
  Check_Type(str, T_STRING);
605
- WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
676
+ #ifdef HAVE_RUBY_ENCODING_H
677
+ CONVERT_TO_UTF8(str);
678
+ #endif
679
+ WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
606
680
  capitalize_internal(&ustr);
607
- UStr_alloc(&ret);
681
+ UniStr_alloc(&ret);
608
682
  WStr_convertIntoUString(&ustr, &ret);
609
683
  WStr_free(&ustr);
610
- vret = rb_str_new(ret.str, ret.len);
611
- UStr_free(&ret);
684
+ vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
685
+ UniStr_free(&ret);
612
686
 
613
687
  return vret;
614
688
  }
@@ -618,6 +692,10 @@ Init_unicode()
618
692
  {
619
693
  int i;
620
694
 
695
+ #ifdef HAVE_RUBY_ENCODING_H
696
+ enc_out = rb_utf8_encoding();
697
+ #endif
698
+
621
699
  mUnicode = rb_define_module("Unicode");
622
700
  unicode_data = rb_hash_new();
623
701
  composition_table = rb_hash_new();