unicode 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +16 -11
- data/test.rb +9 -8
- data/tools/README +2 -2
- data/tools/mkunidata.rb +20 -10
- data/unicode.c +132 -54
- data/unidata.map +12976 -1764
- data/ustring.c +27 -25
- data/ustring.h +12 -12
- data/wstring.c +11 -11
- metadata +4 -4
    
        data/README
    CHANGED
    
    | @@ -1,5 +1,5 @@ | |
| 1 1 | 
             
            		   Unicode Library for Ruby
         | 
| 2 | 
            -
            			Version 0. | 
| 2 | 
            +
            			Version 0.2.0
         | 
| 3 3 |  | 
| 4 4 | 
             
            		       Yoshida Masato
         | 
| 5 5 |  | 
| @@ -14,8 +14,8 @@ | |
| 14 14 |  | 
| 15 15 | 
             
            - Install
         | 
| 16 16 |  | 
| 17 | 
            -
              This can work with ruby-1. | 
| 18 | 
            -
              use ruby-1. | 
| 17 | 
            +
              This works with ruby-1.8 or later. I recommend that you
         | 
| 18 | 
            +
              use ruby-1.8.1 or later.
         | 
| 19 19 |  | 
| 20 20 | 
             
              Make and install usually.
         | 
| 21 21 | 
             
              For example, when Ruby supports dynamic linking on your OS,
         | 
| @@ -36,16 +36,16 @@ | |
| 36 36 |  | 
| 37 37 | 
             
            - Module Functions
         | 
| 38 38 |  | 
| 39 | 
            -
              All parameters of functions must be UTF-8.
         | 
| 39 | 
            +
              All parameters of functions must be UTF-8 strings.
         | 
| 40 40 |  | 
| 41 41 | 
             
              Unicode::strcmp(str1, str2)
         | 
| 42 42 | 
             
              Unicode::strcmp_compat(str1, str2)
         | 
| 43 | 
            -
                 | 
| 44 | 
            -
                strcmp uses Normalization Form D, strcmp_compat uses
         | 
| 43 | 
            +
                Compare Unicode strings after normalization.
         | 
| 44 | 
            +
                strcmp uses the Normalization Form D, strcmp_compat uses
         | 
| 45 45 | 
             
                Normalization Form KD.
         | 
| 46 46 |  | 
| 47 | 
            -
              Unicode:: | 
| 48 | 
            -
              Unicode:: | 
| 47 | 
            +
              Unicode::decompose(str)
         | 
| 48 | 
            +
              Unicode::decompose_compat(str)
         | 
| 49 49 | 
             
                Decompose Unicode string. Then the trailing characters
         | 
| 50 50 | 
             
                are sorted in canonical order.
         | 
| 51 51 | 
             
                decompose uses the canonical decomposition,
         | 
| @@ -65,12 +65,12 @@ | |
| 65 65 |  | 
| 66 66 | 
             
              Unicode::normalize_D(str)
         | 
| 67 67 | 
             
              Unicode::normalize_KD(str)
         | 
| 68 | 
            -
                 | 
| 68 | 
            +
                Normalize Unicode string in form D or form KD.
         | 
| 69 69 | 
             
                These are aliases of decompose/decompose_compat.
         | 
| 70 70 |  | 
| 71 71 | 
             
              Unicode::normalize_C(str)
         | 
| 72 72 | 
             
              Unicode::normalize_KC(str)
         | 
| 73 | 
            -
                 | 
| 73 | 
            +
                Normalize Unicode string in form C or form KC.
         | 
| 74 74 | 
             
                  normalize_C  = decompose + compose
         | 
| 75 75 | 
             
                  normalize_KC = decompose_compat + compose
         | 
| 76 76 |  | 
| @@ -78,7 +78,7 @@ | |
| 78 78 | 
             
              Unicode::downcase(str)
         | 
| 79 79 | 
             
              Unicode::capitalize(str)
         | 
| 80 80 | 
             
                Case conversion functions.
         | 
| 81 | 
            -
                The mappings  | 
| 81 | 
            +
                The mappings that are used by these functions are not normative
         | 
| 82 82 | 
             
                in UnicodeData.txt.
         | 
| 83 83 |  | 
| 84 84 | 
             
            - Bugs
         | 
| @@ -87,6 +87,8 @@ | |
| 87 87 | 
             
              should not be implemented with a hash of string for better
         | 
| 88 88 | 
             
              performance.
         | 
| 89 89 |  | 
| 90 | 
            +
              Case conversion functions should reflect UTR #21.
         | 
| 91 | 
            +
             | 
| 90 92 |  | 
| 91 93 | 
             
            - Copying
         | 
| 92 94 |  | 
| @@ -104,4 +106,7 @@ | |
| 104 106 |  | 
| 105 107 | 
             
            - History
         | 
| 106 108 |  | 
| 109 | 
            +
              Dec 29, 2009 version 0.2.0 update for Ruby 1.9.1 and Unicode 5.2
         | 
| 110 | 
            +
              Sep 10, 2005 version 0.1.2 update unidata.map for Unicode 4.1.0
         | 
| 111 | 
            +
              Aug 26, 2004 version 0.1.1 update unidata.map for Unicode 4.0.1
         | 
| 107 112 | 
             
              Nov 23, 1999 version 0.1
         | 
    
        data/test.rb
    CHANGED
    
    | @@ -1,4 +1,5 @@ | |
| 1 1 | 
             
            #! /usr/local/bin/ruby -KU
         | 
| 2 | 
            +
            # -*- coding: utf-8 -*-
         | 
| 2 3 |  | 
| 3 4 | 
             
            require 'unicode'
         | 
| 4 5 |  | 
| @@ -29,12 +30,12 @@ p Unicode::strcmp("ガ", "ガ") | |
| 29 30 | 
             
            p Unicode::strcmp_compat("ガ", "ガ")
         | 
| 30 31 |  | 
| 31 32 | 
             
            print "Decomposition/composition\n"
         | 
| 32 | 
            -
            p Unicode::normalize_D([ | 
| 33 | 
            -
            p Unicode::normalize_D([ | 
| 33 | 
            +
            p Unicode::normalize_D([0x63, 0x301, 0x327].pack("U*")).udump
         | 
| 34 | 
            +
            p Unicode::normalize_D([0x63, 0x327, 0x301].pack("U*")).udump
         | 
| 34 35 | 
             
            p Unicode::normalize_D([0x107, 0x327].pack("U*")).udump
         | 
| 35 36 | 
             
            p Unicode::normalize_D([0xe7, 0x301].pack("U*")).udump
         | 
| 36 | 
            -
            p Unicode::normalize_C([ | 
| 37 | 
            -
            p Unicode::normalize_C([ | 
| 37 | 
            +
            p Unicode::normalize_C([0x63, 0x301, 0x327].pack("U*")).udump
         | 
| 38 | 
            +
            p Unicode::normalize_C([0x63, 0x327, 0x301].pack("U*")).udump
         | 
| 38 39 | 
             
            p Unicode::normalize_C([0x107, 0x327].pack("U*")).udump
         | 
| 39 40 | 
             
            p Unicode::normalize_C([0xe7, 0x301].pack("U*")).udump
         | 
| 40 41 |  | 
| @@ -50,7 +51,7 @@ p Unicode::normalize_D("요시담").udump | |
| 50 51 | 
             
            p Unicode::normalize_C("요시담").udump
         | 
| 51 52 |  | 
| 52 53 | 
             
            print "Composition Exclusion\n"
         | 
| 53 | 
            -
            print "   ANGSTROM SIGN [U+ | 
| 54 | 
            +
            print "   ANGSTROM SIGN [U+212B]\n"
         | 
| 54 55 | 
             
            p Unicode::normalize_D([0x212b].pack("U")).udump
         | 
| 55 56 | 
             
            p Unicode::normalize_C([0x212b].pack("U")).udump
         | 
| 56 57 | 
             
            print "   LATIN CAPITAL LETTER A WITH RING ABOVE [U+00C5]\n"
         | 
| @@ -58,9 +59,9 @@ p Unicode::normalize_D([0x00c5].pack("U")).udump | |
| 58 59 | 
             
            p Unicode::normalize_C([0x00c5].pack("U")).udump
         | 
| 59 60 |  | 
| 60 61 | 
             
            print "Case conversion\n"
         | 
| 61 | 
            -
            p Unicode::normalize_C(Unicode::upcase([ | 
| 62 | 
            -
            p Unicode::normalize_C(Unicode::downcase([ | 
| 63 | 
            -
            p Unicode::capitalize([0x1f1,  | 
| 62 | 
            +
            p Unicode::normalize_C(Unicode::upcase([0x63, 0x301, 0x327, 0xff41].pack("U*"))).udump
         | 
| 63 | 
            +
            p Unicode::normalize_C(Unicode::downcase([0x43, 0x301, 0x327, 0xff21].pack("U*"))).udump
         | 
| 64 | 
            +
            p Unicode::capitalize([0x1f1, 0x41, 0x61, 0xff21].pack("U*")).udump
         | 
| 64 65 |  | 
| 65 66 |  | 
| 66 67 | 
             
            ## Local variables:
         | 
    
        data/tools/README
    CHANGED
    
    | @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            The unidata.map is created from UnicodeData.txt and
         | 
| 2 | 
            -
             | 
| 2 | 
            +
            DerivedNormalizationProps.txt of Unicode 4.1.0
         | 
| 3 3 |  | 
| 4 4 | 
             
            To update unidata.map,
         | 
| 5 5 |  | 
| 6 | 
            -
              ruby mkunidata.rb UnicodeData.txt  | 
| 6 | 
            +
              ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt > unidata.map
         | 
    
        data/tools/mkunidata.rb
    CHANGED
    
    | @@ -1,13 +1,13 @@ | |
| 1 1 | 
             
            #! /usr/local/bin/ruby -KU
         | 
| 2 2 |  | 
| 3 | 
            -
            if $KCODE != 'UTF8'
         | 
| 4 | 
            -
              raise "$KCODE must be UTF8"
         | 
| 5 | 
            -
            end
         | 
| 3 | 
            +
            #if $KCODE != 'UTF8'
         | 
| 4 | 
            +
            #  raise "$KCODE must be UTF8"
         | 
| 5 | 
            +
            #end
         | 
| 6 6 |  | 
| 7 7 | 
             
            HEAD=<<EOS
         | 
| 8 8 | 
             
            /*
         | 
| 9 9 | 
             
             * UnicodeData
         | 
| 10 | 
            -
             * 1999 by yoshidam
         | 
| 10 | 
            +
             * Copyright 1999, 2004 by yoshidam
         | 
| 11 11 | 
             
             *
         | 
| 12 12 | 
             
             */
         | 
| 13 13 |  | 
| @@ -25,7 +25,7 @@ struct unicode_data { | |
| 25 25 | 
             
              const int titlecase;
         | 
| 26 26 | 
             
            };
         | 
| 27 27 |  | 
| 28 | 
            -
            const  | 
| 28 | 
            +
            static const struct unicode_data unidata[] = {
         | 
| 29 29 | 
             
            EOS
         | 
| 30 30 |  | 
| 31 31 | 
             
            TAIL=<<EOS
         | 
| @@ -41,7 +41,7 @@ def hex2str(hex) | |
| 41 41 | 
             
              canon = ""
         | 
| 42 42 | 
             
              compat = ""
         | 
| 43 43 | 
             
              chars = hex.split(" ")
         | 
| 44 | 
            -
              if chars[0] =~ /^[0-9A-F]{4}$/
         | 
| 44 | 
            +
              if chars[0] =~ /^[0-9A-F]{4,6}$/
         | 
| 45 45 | 
             
                chars.each do |c|
         | 
| 46 46 | 
             
                  canon << [c.hex].pack("U")
         | 
| 47 47 | 
             
                end
         | 
| @@ -59,7 +59,7 @@ def hex2str(hex) | |
| 59 59 | 
             
            end
         | 
| 60 60 |  | 
| 61 61 | 
             
            def hex_or_nil(str)
         | 
| 62 | 
            -
              return "-1" if str.nil?
         | 
| 62 | 
            +
              return "-1" if str.nil? || str == ''
         | 
| 63 63 | 
             
              return format("0x%04x", str.hex)
         | 
| 64 64 | 
             
            end
         | 
| 65 65 |  | 
| @@ -81,9 +81,19 @@ exclusion = {} | |
| 81 81 | 
             
            open(ARGV[1]) do |f|
         | 
| 82 82 | 
             
              while l = f.gets
         | 
| 83 83 | 
             
                next if l =~ /^\#/ || l =~ /^$/
         | 
| 84 | 
            +
                next if l !~ /Full_Composition_Exclusion/
         | 
| 84 85 | 
             
                code, = l.split(/\s/)
         | 
| 85 | 
            -
                code  | 
| 86 | 
            -
             | 
| 86 | 
            +
                if code =~ /^[0-9A-F]+$/
         | 
| 87 | 
            +
                  code = code.hex
         | 
| 88 | 
            +
                  exclusion[code] = true
         | 
| 89 | 
            +
                elsif code =~ /^([0-9A-F]+)\.\.([0-9A-F]+)$/
         | 
| 90 | 
            +
            #      p [$1, $2]
         | 
| 91 | 
            +
                  scode = $1.hex
         | 
| 92 | 
            +
                  ecode = $2.hex
         | 
| 93 | 
            +
                  for code in scode..ecode
         | 
| 94 | 
            +
                    exclusion[code] = true
         | 
| 95 | 
            +
                  end
         | 
| 96 | 
            +
                end
         | 
| 87 97 | 
             
              end
         | 
| 88 98 | 
             
            end
         | 
| 89 99 |  | 
| @@ -94,7 +104,7 @@ open(ARGV[0]) do |f| | |
| 94 104 | 
             
                l.chomp!
         | 
| 95 105 | 
             
                code, charname, gencat, ccclass, bidicat,decomp,
         | 
| 96 106 | 
             
                  dec, digit, num, mirror, uni1_0, comment, upcase,
         | 
| 97 | 
            -
                  lowcase, titlecase = l.split(";");
         | 
| 107 | 
            +
                  lowcase, titlecase = l.split(";", 15);
         | 
| 98 108 | 
             
                code = code.hex
         | 
| 99 109 | 
             
                ccclass = ccclass.to_i
         | 
| 100 110 | 
             
                canon, compat = hex2str(decomp)
         | 
    
        data/unicode.c
    CHANGED
    
    | @@ -1,15 +1,52 @@ | |
| 1 1 | 
             
            /*
         | 
| 2 | 
            -
             * Unicode Library version 0. | 
| 2 | 
            +
             * Unicode Library version 0.2
         | 
| 3 | 
            +
             * Dec 29, 2009: version 0.2
         | 
| 3 4 | 
             
             * Nov 23, 1999 yoshidam
         | 
| 4 5 | 
             
             *
         | 
| 5 6 | 
             
             */
         | 
| 6 7 |  | 
| 7 8 | 
             
            #include "ruby.h"
         | 
| 8 | 
            -
            # | 
| 9 | 
            +
            #ifdef HAVE_RUBY_IO_H
         | 
| 10 | 
            +
            #  include "ruby/io.h"
         | 
| 11 | 
            +
            #else
         | 
| 12 | 
            +
            #  include "rubyio.h"
         | 
| 13 | 
            +
            #endif
         | 
| 9 14 | 
             
            #include <stdio.h>
         | 
| 10 15 | 
             
            #include "wstring.h"
         | 
| 11 16 | 
             
            #include "unidata.map"
         | 
| 12 17 |  | 
| 18 | 
            +
            #ifndef RSTRING_PTR
         | 
| 19 | 
            +
            #  define RSTRING_PTR(s) (RSTRING(s)->ptr)
         | 
| 20 | 
            +
            #  define RSTRING_LEN(s) (RSTRING(s)->len)
         | 
| 21 | 
            +
            #endif
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 24 | 
            +
            static rb_encoding* enc_out;
         | 
| 25 | 
            +
            #  define ENC_(o) (rb_enc_associate(o, enc_out))
         | 
| 26 | 
            +
            #else
         | 
| 27 | 
            +
            #  define ENC_(o) (o)
         | 
| 28 | 
            +
            #endif
         | 
| 29 | 
            +
             | 
| 30 | 
            +
            inline static VALUE
         | 
| 31 | 
            +
            taintObject(VALUE src, VALUE obj) {
         | 
| 32 | 
            +
              if (OBJ_TAINTED(src))
         | 
| 33 | 
            +
                OBJ_TAINT(obj);
         | 
| 34 | 
            +
              return obj;
         | 
| 35 | 
            +
            }
         | 
| 36 | 
            +
            #define TO_(src, obj) (taintObject(src, obj))
         | 
| 37 | 
            +
             | 
| 38 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 39 | 
            +
            #  define CONVERT_TO_UTF8(str) do { \
         | 
| 40 | 
            +
                int encindex = ENCODING_GET(str); \
         | 
| 41 | 
            +
                volatile VALUE encobj; \
         | 
| 42 | 
            +
                if (encindex != rb_utf8_encindex() && \
         | 
| 43 | 
            +
                    encindex != rb_usascii_encindex()) { \
         | 
| 44 | 
            +
                  encobj = rb_enc_from_encoding(enc_out); \
         | 
| 45 | 
            +
                  str = rb_str_encode(str, encobj, 0, Qnil); \
         | 
| 46 | 
            +
                } \
         | 
| 47 | 
            +
              } while (0)
         | 
| 48 | 
            +
            #endif
         | 
| 49 | 
            +
             | 
| 13 50 | 
             
            static VALUE mUnicode;
         | 
| 14 51 | 
             
            static VALUE unicode_data;
         | 
| 15 52 | 
             
            static VALUE composition_table;
         | 
| @@ -58,7 +95,7 @@ get_compat(int ucs) | |
| 58 95 | 
             
              return NULL;
         | 
| 59 96 | 
             
            }
         | 
| 60 97 |  | 
| 61 | 
            -
            static  | 
| 98 | 
            +
            static int
         | 
| 62 99 | 
             
            get_uppercase(int ucs)
         | 
| 63 100 | 
             
            {
         | 
| 64 101 | 
             
              VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
         | 
| @@ -252,7 +289,7 @@ decompose_compat_internal(WString* ustr, WString* result) | |
| 252 289 | 
             
              } while (0)
         | 
| 253 290 |  | 
| 254 291 | 
             
            static int
         | 
| 255 | 
            -
            compose_pair(int c1, int c2)
         | 
| 292 | 
            +
            compose_pair(unsigned int c1, unsigned int c2)
         | 
| 256 293 | 
             
            {
         | 
| 257 294 | 
             
              int ret;
         | 
| 258 295 | 
             
              char ustr[13]; /* stored two UTF-8 chars */
         | 
| @@ -370,8 +407,12 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2) | |
| 370 407 |  | 
| 371 408 | 
             
              Check_Type(str1, T_STRING);
         | 
| 372 409 | 
             
              Check_Type(str2, T_STRING);
         | 
| 373 | 
            -
             | 
| 374 | 
            -
               | 
| 410 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 411 | 
            +
              CONVERT_TO_UTF8(str1);
         | 
| 412 | 
            +
              CONVERT_TO_UTF8(str2);
         | 
| 413 | 
            +
            #endif
         | 
| 414 | 
            +
              WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
         | 
| 415 | 
            +
              WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
         | 
| 375 416 | 
             
              WStr_alloc(&result1);
         | 
| 376 417 | 
             
              WStr_alloc(&result2);
         | 
| 377 418 | 
             
              decompose_internal(&wstr1, &result1);
         | 
| @@ -380,17 +421,17 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2) | |
| 380 421 | 
             
              WStr_free(&wstr2);
         | 
| 381 422 | 
             
              sort_canonical(&result1);
         | 
| 382 423 | 
             
              sort_canonical(&result2);
         | 
| 383 | 
            -
               | 
| 384 | 
            -
               | 
| 424 | 
            +
              UniStr_alloc(&ustr1);
         | 
| 425 | 
            +
              UniStr_alloc(&ustr2);
         | 
| 385 426 | 
             
              WStr_convertIntoUString(&result1, &ustr1);
         | 
| 386 427 | 
             
              WStr_convertIntoUString(&result2, &ustr2);
         | 
| 387 428 | 
             
              WStr_free(&result1);
         | 
| 388 429 | 
             
              WStr_free(&result2);
         | 
| 389 | 
            -
               | 
| 390 | 
            -
               | 
| 391 | 
            -
              ret = strcmp(ustr1.str, ustr2.str);
         | 
| 392 | 
            -
               | 
| 393 | 
            -
               | 
| 430 | 
            +
              UniStr_addChar(&ustr1, '\0');
         | 
| 431 | 
            +
              UniStr_addChar(&ustr2, '\0');
         | 
| 432 | 
            +
              ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
         | 
| 433 | 
            +
              UniStr_free(&ustr1);
         | 
| 434 | 
            +
              UniStr_free(&ustr2);
         | 
| 394 435 |  | 
| 395 436 | 
             
              return INT2FIX(ret);
         | 
| 396 437 | 
             
            }
         | 
| @@ -408,8 +449,12 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2) | |
| 408 449 |  | 
| 409 450 | 
             
              Check_Type(str1, T_STRING);
         | 
| 410 451 | 
             
              Check_Type(str2, T_STRING);
         | 
| 411 | 
            -
             | 
| 412 | 
            -
               | 
| 452 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 453 | 
            +
              CONVERT_TO_UTF8(str1);
         | 
| 454 | 
            +
              CONVERT_TO_UTF8(str2);
         | 
| 455 | 
            +
            #endif
         | 
| 456 | 
            +
              WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
         | 
| 457 | 
            +
              WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
         | 
| 413 458 | 
             
              WStr_alloc(&result1);
         | 
| 414 459 | 
             
              WStr_alloc(&result2);
         | 
| 415 460 | 
             
              decompose_compat_internal(&wstr1, &result1);
         | 
| @@ -418,17 +463,17 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2) | |
| 418 463 | 
             
              WStr_free(&wstr2);
         | 
| 419 464 | 
             
              sort_canonical(&result1);
         | 
| 420 465 | 
             
              sort_canonical(&result2);
         | 
| 421 | 
            -
               | 
| 422 | 
            -
               | 
| 466 | 
            +
              UniStr_alloc(&ustr1);
         | 
| 467 | 
            +
              UniStr_alloc(&ustr2);
         | 
| 423 468 | 
             
              WStr_convertIntoUString(&result1, &ustr1);
         | 
| 424 469 | 
             
              WStr_convertIntoUString(&result2, &ustr2);
         | 
| 425 470 | 
             
              WStr_free(&result1);
         | 
| 426 471 | 
             
              WStr_free(&result2);
         | 
| 427 | 
            -
               | 
| 428 | 
            -
               | 
| 429 | 
            -
              ret = strcmp(ustr1.str, ustr2.str);
         | 
| 430 | 
            -
               | 
| 431 | 
            -
               | 
| 472 | 
            +
              UniStr_addChar(&ustr1, '\0');
         | 
| 473 | 
            +
              UniStr_addChar(&ustr2, '\0');
         | 
| 474 | 
            +
              ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
         | 
| 475 | 
            +
              UniStr_free(&ustr1);
         | 
| 476 | 
            +
              UniStr_free(&ustr2);
         | 
| 432 477 |  | 
| 433 478 | 
             
              return INT2FIX(ret);
         | 
| 434 479 | 
             
            }
         | 
| @@ -442,16 +487,19 @@ unicode_decompose(VALUE obj, VALUE str) | |
| 442 487 | 
             
              VALUE vret;
         | 
| 443 488 |  | 
| 444 489 | 
             
              Check_Type(str, T_STRING);
         | 
| 445 | 
            -
             | 
| 490 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 491 | 
            +
              CONVERT_TO_UTF8(str);
         | 
| 492 | 
            +
            #endif
         | 
| 493 | 
            +
              WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
         | 
| 446 494 | 
             
              WStr_alloc(&result);
         | 
| 447 495 | 
             
              decompose_internal(&ustr, &result);
         | 
| 448 496 | 
             
              WStr_free(&ustr);
         | 
| 449 497 | 
             
              sort_canonical(&result);
         | 
| 450 | 
            -
               | 
| 498 | 
            +
              UniStr_alloc(&ret);
         | 
| 451 499 | 
             
              WStr_convertIntoUString(&result, &ret);
         | 
| 452 500 | 
             
              WStr_free(&result);
         | 
| 453 | 
            -
              vret = rb_str_new(ret.str, ret.len);
         | 
| 454 | 
            -
               | 
| 501 | 
            +
              vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
         | 
| 502 | 
            +
              UniStr_free(&ret);
         | 
| 455 503 |  | 
| 456 504 | 
             
              return vret;
         | 
| 457 505 | 
             
            }
         | 
| @@ -465,16 +513,19 @@ unicode_decompose_compat(VALUE obj, VALUE str) | |
| 465 513 | 
             
              VALUE vret;
         | 
| 466 514 |  | 
| 467 515 | 
             
              Check_Type(str, T_STRING);
         | 
| 468 | 
            -
             | 
| 516 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 517 | 
            +
              CONVERT_TO_UTF8(str);
         | 
| 518 | 
            +
            #endif
         | 
| 519 | 
            +
              WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
         | 
| 469 520 | 
             
              WStr_alloc(&result);
         | 
| 470 521 | 
             
              decompose_compat_internal(&ustr, &result);
         | 
| 471 522 | 
             
              WStr_free(&ustr);
         | 
| 472 523 | 
             
              sort_canonical(&result);
         | 
| 473 | 
            -
               | 
| 524 | 
            +
              UniStr_alloc(&ret);
         | 
| 474 525 | 
             
              WStr_convertIntoUString(&result, &ret);
         | 
| 475 526 | 
             
              WStr_free(&result);
         | 
| 476 | 
            -
              vret = rb_str_new(ret.str, ret.len);
         | 
| 477 | 
            -
               | 
| 527 | 
            +
              vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
         | 
| 528 | 
            +
              UniStr_free(&ret);
         | 
| 478 529 |  | 
| 479 530 | 
             
              return vret;
         | 
| 480 531 | 
             
            }
         | 
| @@ -488,16 +539,19 @@ unicode_compose(VALUE obj, VALUE str) | |
| 488 539 | 
             
              VALUE vret;
         | 
| 489 540 |  | 
| 490 541 | 
             
              Check_Type(str, T_STRING);
         | 
| 491 | 
            -
             | 
| 542 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 543 | 
            +
              CONVERT_TO_UTF8(str);
         | 
| 544 | 
            +
            #endif
         | 
| 545 | 
            +
              WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
         | 
| 492 546 | 
             
              sort_canonical(&ustr);
         | 
| 493 547 | 
             
              WStr_alloc(&result);
         | 
| 494 548 | 
             
              compose_internal(&ustr, &result);
         | 
| 495 549 | 
             
              WStr_free(&ustr);
         | 
| 496 | 
            -
               | 
| 550 | 
            +
              UniStr_alloc(&ret);
         | 
| 497 551 | 
             
              WStr_convertIntoUString(&result, &ret);
         | 
| 498 552 | 
             
              WStr_free(&result);
         | 
| 499 | 
            -
              vret = rb_str_new(ret.str, ret.len);
         | 
| 500 | 
            -
               | 
| 553 | 
            +
              vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
         | 
| 554 | 
            +
              UniStr_free(&ret);
         | 
| 501 555 |  | 
| 502 556 | 
             
              return vret;
         | 
| 503 557 | 
             
            }
         | 
| @@ -512,7 +566,10 @@ unicode_normalize_C(VALUE obj, VALUE str) | |
| 512 566 | 
             
              VALUE vret;
         | 
| 513 567 |  | 
| 514 568 | 
             
              Check_Type(str, T_STRING);
         | 
| 515 | 
            -
             | 
| 569 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 570 | 
            +
              CONVERT_TO_UTF8(str);
         | 
| 571 | 
            +
            #endif
         | 
| 572 | 
            +
              WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
         | 
| 516 573 | 
             
              WStr_alloc(&ustr2);
         | 
| 517 574 | 
             
              decompose_internal(&ustr1, &ustr2);
         | 
| 518 575 | 
             
              WStr_free(&ustr1);
         | 
| @@ -520,11 +577,11 @@ unicode_normalize_C(VALUE obj, VALUE str) | |
| 520 577 | 
             
              WStr_alloc(&result);
         | 
| 521 578 | 
             
              compose_internal(&ustr2, &result);
         | 
| 522 579 | 
             
              WStr_free(&ustr2);
         | 
| 523 | 
            -
               | 
| 580 | 
            +
              UniStr_alloc(&ret);
         | 
| 524 581 | 
             
              WStr_convertIntoUString(&result, &ret);
         | 
| 525 582 | 
             
              WStr_free(&result);
         | 
| 526 | 
            -
              vret = rb_str_new(ret.str, ret.len);
         | 
| 527 | 
            -
               | 
| 583 | 
            +
              vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
         | 
| 584 | 
            +
              UniStr_free(&ret);
         | 
| 528 585 |  | 
| 529 586 | 
             
              return vret;
         | 
| 530 587 | 
             
            }
         | 
| @@ -539,7 +596,10 @@ unicode_normalize_KC(VALUE obj, VALUE str) | |
| 539 596 | 
             
              VALUE vret;
         | 
| 540 597 |  | 
| 541 598 | 
             
              Check_Type(str, T_STRING);
         | 
| 542 | 
            -
             | 
| 599 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 600 | 
            +
              CONVERT_TO_UTF8(str);
         | 
| 601 | 
            +
            #endif
         | 
| 602 | 
            +
              WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
         | 
| 543 603 | 
             
              WStr_alloc(&ustr2);
         | 
| 544 604 | 
             
              decompose_compat_internal(&ustr1, &ustr2);
         | 
| 545 605 | 
             
              WStr_free(&ustr1);
         | 
| @@ -547,11 +607,11 @@ unicode_normalize_KC(VALUE obj, VALUE str) | |
| 547 607 | 
             
              WStr_alloc(&result);
         | 
| 548 608 | 
             
              compose_internal(&ustr2, &result);
         | 
| 549 609 | 
             
              WStr_free(&ustr2);
         | 
| 550 | 
            -
               | 
| 610 | 
            +
              UniStr_alloc(&ret);
         | 
| 551 611 | 
             
              WStr_convertIntoUString(&result, &ret);
         | 
| 552 612 | 
             
              WStr_free(&result);
         | 
| 553 | 
            -
              vret = rb_str_new(ret.str, ret.len);
         | 
| 554 | 
            -
               | 
| 613 | 
            +
              vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
         | 
| 614 | 
            +
              UniStr_free(&ret);
         | 
| 555 615 |  | 
| 556 616 | 
             
              return vret;
         | 
| 557 617 | 
             
            }
         | 
| @@ -564,13 +624,16 @@ unicode_upcase(VALUE obj, VALUE str) | |
| 564 624 | 
             
              VALUE vret;
         | 
| 565 625 |  | 
| 566 626 | 
             
              Check_Type(str, T_STRING);
         | 
| 567 | 
            -
             | 
| 627 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 628 | 
            +
              CONVERT_TO_UTF8(str);
         | 
| 629 | 
            +
            #endif
         | 
| 630 | 
            +
              WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
         | 
| 568 631 | 
             
              upcase_internal(&ustr);
         | 
| 569 | 
            -
               | 
| 632 | 
            +
              UniStr_alloc(&ret);
         | 
| 570 633 | 
             
              WStr_convertIntoUString(&ustr, &ret);
         | 
| 571 634 | 
             
              WStr_free(&ustr);
         | 
| 572 | 
            -
              vret = rb_str_new(ret.str, ret.len);
         | 
| 573 | 
            -
               | 
| 635 | 
            +
              vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
         | 
| 636 | 
            +
              UniStr_free(&ret);
         | 
| 574 637 |  | 
| 575 638 | 
             
              return vret;
         | 
| 576 639 | 
             
            }
         | 
| @@ -583,17 +646,25 @@ unicode_downcase(VALUE obj, VALUE str) | |
| 583 646 | 
             
              VALUE vret;
         | 
| 584 647 |  | 
| 585 648 | 
             
              Check_Type(str, T_STRING);
         | 
| 586 | 
            -
             | 
| 649 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 650 | 
            +
              CONVERT_TO_UTF8(str);
         | 
| 651 | 
            +
            #endif
         | 
| 652 | 
            +
              WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
         | 
| 587 653 | 
             
              downcase_internal(&ustr);
         | 
| 588 | 
            -
               | 
| 654 | 
            +
              UniStr_alloc(&ret);
         | 
| 589 655 | 
             
              WStr_convertIntoUString(&ustr, &ret);
         | 
| 590 656 | 
             
              WStr_free(&ustr);
         | 
| 591 | 
            -
              vret = rb_str_new(ret.str, ret.len);
         | 
| 592 | 
            -
               | 
| 657 | 
            +
              vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
         | 
| 658 | 
            +
              UniStr_free(&ret);
         | 
| 593 659 |  | 
| 594 660 | 
             
              return vret;
         | 
| 595 661 | 
             
            }
         | 
| 596 662 |  | 
| 663 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 664 | 
            +
             | 
| 665 | 
            +
             | 
| 666 | 
            +
            #endif
         | 
| 667 | 
            +
             | 
| 597 668 | 
             
            static VALUE
         | 
| 598 669 | 
             
            unicode_capitalize(VALUE obj, VALUE str)
         | 
| 599 670 | 
             
            {
         | 
| @@ -602,13 +673,16 @@ unicode_capitalize(VALUE obj, VALUE str) | |
| 602 673 | 
             
              VALUE vret;
         | 
| 603 674 |  | 
| 604 675 | 
             
              Check_Type(str, T_STRING);
         | 
| 605 | 
            -
             | 
| 676 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 677 | 
            +
              CONVERT_TO_UTF8(str);
         | 
| 678 | 
            +
            #endif
         | 
| 679 | 
            +
              WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
         | 
| 606 680 | 
             
              capitalize_internal(&ustr);
         | 
| 607 | 
            -
               | 
| 681 | 
            +
              UniStr_alloc(&ret);
         | 
| 608 682 | 
             
              WStr_convertIntoUString(&ustr, &ret);
         | 
| 609 683 | 
             
              WStr_free(&ustr);
         | 
| 610 | 
            -
              vret = rb_str_new(ret.str, ret.len);
         | 
| 611 | 
            -
               | 
| 684 | 
            +
              vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
         | 
| 685 | 
            +
              UniStr_free(&ret);
         | 
| 612 686 |  | 
| 613 687 | 
             
              return vret;
         | 
| 614 688 | 
             
            }
         | 
| @@ -618,6 +692,10 @@ Init_unicode() | |
| 618 692 | 
             
            {
         | 
| 619 693 | 
             
              int i;
         | 
| 620 694 |  | 
| 695 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 696 | 
            +
              enc_out = rb_utf8_encoding();
         | 
| 697 | 
            +
            #endif
         | 
| 698 | 
            +
             | 
| 621 699 | 
             
              mUnicode = rb_define_module("Unicode");
         | 
| 622 700 | 
             
              unicode_data = rb_hash_new();
         | 
| 623 701 | 
             
              composition_table = rb_hash_new();
         |