unicode 0.4.2-x86-mswin32-60 → 0.4.3-x86-mswin32-60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +29 -7
- data/ext/unicode/unicode.c +379 -16
- data/ext/unicode/unidata.map +24536 -24435
- data/ext/unicode/wstring.c +69 -1
- data/ext/unicode/wstring.h +2 -0
- data/lib/unicode/1.8/unicode_native.so +0 -0
- data/lib/unicode/1.9/unicode_native.so +0 -0
- data/tools/README +3 -2
- data/tools/mkunidata.rb +136 -12
- data/unicode.gemspec +2 -2
- metadata +5 -5
    
        data/README
    CHANGED
    
    | @@ -1,5 +1,5 @@ | |
| 1 1 | 
             
            		   Unicode Library for Ruby
         | 
| 2 | 
            -
            			Version 0.4. | 
| 2 | 
            +
            			Version 0.4.3
         | 
| 3 3 |  | 
| 4 4 | 
             
            		       Yoshida Masato
         | 
| 5 5 |  | 
| @@ -7,14 +7,14 @@ | |
| 7 7 | 
             
            - Introduction
         | 
| 8 8 |  | 
| 9 9 | 
             
              Unicode string manipulation library for Ruby.
         | 
| 10 | 
            -
              This library is based on  | 
| 10 | 
            +
              This library is based on UAX #15 Unicode Normalization Forms(*1).
         | 
| 11 11 |  | 
| 12 12 | 
             
                *1 <URL:http://www.unicode.org/unicode/reports/tr15/>
         | 
| 13 13 |  | 
| 14 14 |  | 
| 15 15 | 
             
            - Install
         | 
| 16 16 |  | 
| 17 | 
            -
              This can work with ruby-1.8 or later. I recommend you to
         | 
| 17 | 
            +
              This can work with ruby-1.8.7 or later. I recommend you to
         | 
| 18 18 | 
             
              use ruby-1.9.3 or later.
         | 
| 19 19 |  | 
| 20 20 | 
             
              Make and install usually.
         | 
| @@ -79,7 +79,7 @@ | |
| 79 79 | 
             
                These are aliases of decompose/decompose_compat.
         | 
| 80 80 |  | 
| 81 81 | 
             
              Unicode::normalize_D_safe(str)  (Unicode::nfd_safe(str))
         | 
| 82 | 
            -
                This is an  | 
| 82 | 
            +
                This is an alias of decompose_safe.
         | 
| 83 83 |  | 
| 84 84 | 
             
              Unicode::normalize_C(str) (Unicode::nfc(str))
         | 
| 85 85 | 
             
              Unicode::normalize_KC(str) (Unicode::nfkc(str))
         | 
| @@ -98,14 +98,35 @@ | |
| 98 98 | 
             
                The mappings that are used by these functions are not normative
         | 
| 99 99 | 
             
                in UnicodeData.txt.
         | 
| 100 100 |  | 
| 101 | 
            +
              Unicode::categories(str)
         | 
| 102 | 
            +
              Unicode::abbr_categories(str)
         | 
| 103 | 
            +
                Get an array of general category names of the string.
         | 
| 104 | 
            +
                abbr_categories returns abbreviated names.
         | 
| 105 | 
            +
                These can be called with a block.
         | 
| 106 | 
            +
             | 
| 107 | 
            +
                  Unicode.categories(str) do |category| p category end
         | 
| 108 | 
            +
             | 
| 109 | 
            +
              Unicode::text_elements(str)
         | 
| 110 | 
            +
                Get an array of text elements.
         | 
| 111 | 
            +
                A text element is a unit that is displayed as a single character.
         | 
| 112 | 
            +
                This can be called with a block.
         | 
| 113 | 
            +
             | 
| 114 | 
            +
              Unicode::width(str[, cjk])
         | 
| 115 | 
            +
                Estimate the display width on the fixed pitch text terminal.
         | 
| 116 | 
            +
                It is based on Markus Kuhn's mk_wcwidth.
         | 
| 117 | 
            +
                If the optional argument 'cjk' is true, East Asian
         | 
| 118 | 
            +
                Ambiguous characters are treated as wide characters.
         | 
| 119 | 
            +
             | 
| 120 | 
            +
                  Unicode.width("\u03b1") #=> 1
         | 
| 121 | 
            +
                  Unicode.width("\u03b1", true) #=> 2
         | 
| 122 | 
            +
             | 
| 123 | 
            +
             | 
| 101 124 | 
             
            - Bugs
         | 
| 102 125 |  | 
| 103 | 
            -
               | 
| 126 | 
            +
              UAX #15 suggests that the look up for Normalization Form C
         | 
| 104 127 | 
             
              should not be implemented with a hash of string for better
         | 
| 105 128 | 
             
              performance.
         | 
| 106 129 |  | 
| 107 | 
            -
              Case conversion functions should reflecte UTR #21.
         | 
| 108 | 
            -
             | 
| 109 130 |  | 
| 110 131 | 
             
            - Copying
         | 
| 111 132 |  | 
| @@ -123,6 +144,7 @@ | |
| 123 144 |  | 
| 124 145 | 
             
            - History
         | 
| 125 146 |  | 
| 147 | 
            +
              Aug  8, 2012 version 0.4.3 add categories, text_elements and width
         | 
| 126 148 | 
             
              Feb 29, 2012 version 0.4.2 add decompose_safe
         | 
| 127 149 | 
             
              Feb  3, 2012 version 0.4.1 update unidata.map for Unicode 6.1
         | 
| 128 150 | 
             
              Oct 14, 2010 version 0.4.0 fix the composition algorithm, and support Unicode 6.0
         | 
    
        data/ext/unicode/unicode.c
    CHANGED
    
    | @@ -1,5 +1,6 @@ | |
| 1 1 | 
             
            /*
         | 
| 2 | 
            -
             * Unicode Library version 0.4
         | 
| 2 | 
            +
             * Unicode Library version 0.4.3
         | 
| 3 | 
            +
             * Aug  8, 2012: version 0.4.3
         | 
| 3 4 | 
             
             * Oct 14, 2010: version 0.4
         | 
| 4 5 | 
             
             * Feb 26, 2010: version 0.3
         | 
| 5 6 | 
             
             * Dec 29, 2009: version 0.2
         | 
| @@ -7,7 +8,7 @@ | |
| 7 8 | 
             
             *
         | 
| 8 9 | 
             
             */
         | 
| 9 10 |  | 
| 10 | 
            -
            #define UNICODE_VERSION "0.4. | 
| 11 | 
            +
            #define UNICODE_VERSION "0.4.3"
         | 
| 11 12 |  | 
| 12 13 | 
             
            #include "ruby.h"
         | 
| 13 14 | 
             
            #ifdef HAVE_RUBY_IO_H
         | 
| @@ -54,6 +55,8 @@ taintObject(VALUE src, VALUE obj) { | |
| 54 55 | 
             
            static VALUE mUnicode;
         | 
| 55 56 | 
             
            static VALUE unicode_data;
         | 
| 56 57 | 
             
            static VALUE composition_table;
         | 
| 58 | 
            +
            static VALUE catname_long[c_Cn+1];
         | 
| 59 | 
            +
            static VALUE catname_abbr[c_Cn+1];
         | 
| 57 60 |  | 
| 58 61 | 
             
            /* Hangul */
         | 
| 59 62 | 
             
            #define SBASE   (0xac00)
         | 
| @@ -66,6 +69,86 @@ static VALUE composition_table; | |
| 66 69 | 
             
            #define NCOUNT  (VCOUNT * TCOUNT) /* 588 */
         | 
| 67 70 | 
             
            #define SCOUNT  (LCOUNT * NCOUNT) /* 11172 */
         | 
| 68 71 |  | 
| 72 | 
            +
            VALUE
         | 
| 73 | 
            +
            get_unidata(int ucs) {
         | 
| 74 | 
            +
              VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
         | 
| 75 | 
            +
              if (!NIL_P(ch))
         | 
| 76 | 
            +
                return ch;
         | 
| 77 | 
            +
            #ifdef CJK_IDEOGRAPH_EXTENSION_A_FIRST
         | 
| 78 | 
            +
              else if (ucs >= CJK_IDEOGRAPH_EXTENSION_A_FIRST &&
         | 
| 79 | 
            +
                       ucs <= CJK_IDEOGRAPH_EXTENSION_A_LAST)
         | 
| 80 | 
            +
                return rb_hash_aref(unicode_data,
         | 
| 81 | 
            +
                                    INT2FIX(CJK_IDEOGRAPH_EXTENSION_A_FIRST));
         | 
| 82 | 
            +
            #endif
         | 
| 83 | 
            +
            #ifdef CJK_IDEOGRAPH_FIRST
         | 
| 84 | 
            +
              else if (ucs >= CJK_IDEOGRAPH_FIRST &&
         | 
| 85 | 
            +
                       ucs <= CJK_IDEOGRAPH_LAST)
         | 
| 86 | 
            +
                return rb_hash_aref(unicode_data,
         | 
| 87 | 
            +
                                    INT2FIX(CJK_IDEOGRAPH_FIRST));
         | 
| 88 | 
            +
            #endif
         | 
| 89 | 
            +
            #ifdef HANGUL_SYLLABLE_FIRST
         | 
| 90 | 
            +
              else if (ucs >= HANGUL_SYLLABLE_FIRST &&
         | 
| 91 | 
            +
                       ucs <= HANGUL_SYLLABLE_LAST)
         | 
| 92 | 
            +
                return rb_hash_aref(unicode_data,
         | 
| 93 | 
            +
                                    INT2FIX(HANGUL_SYLLABLE_FIRST));
         | 
| 94 | 
            +
            #endif
         | 
| 95 | 
            +
            #ifdef NON_PRIVATE_USE_HIGH_SURROGATE_FIRST
         | 
| 96 | 
            +
              else if (ucs >= NON_PRIVATE_USE_HIGH_SURROGATE_FIRST &&
         | 
| 97 | 
            +
                       ucs <= NON_PRIVATE_USE_HIGH_SURROGATE_LAST)
         | 
| 98 | 
            +
                return rb_hash_aref(unicode_data,
         | 
| 99 | 
            +
                                    INT2FIX(NON_PRIVATE_USE_HIGH_SURROGATE_FIRST));
         | 
| 100 | 
            +
            #endif
         | 
| 101 | 
            +
            #ifdef PRIVATE_USE_HIGH_SURROGATE_FIRST
         | 
| 102 | 
            +
              else if (ucs >= PRIVATE_USE_HIGH_SURROGATE_FIRST &&
         | 
| 103 | 
            +
                       ucs <= PRIVATE_USE_HIGH_SURROGATE_LAST)
         | 
| 104 | 
            +
                return rb_hash_aref(unicode_data,
         | 
| 105 | 
            +
                                    INT2FIX(PRIVATE_USE_HIGH_SURROGATE_FIRST));
         | 
| 106 | 
            +
            #endif
         | 
| 107 | 
            +
            #ifdef LOW_SURROGATE_FIRST
         | 
| 108 | 
            +
              else if (ucs >= LOW_SURROGATE_FIRST &&
         | 
| 109 | 
            +
                       ucs <= LOW_SURROGATE_LAST)
         | 
| 110 | 
            +
                return rb_hash_aref(unicode_data,
         | 
| 111 | 
            +
                                    INT2FIX(LOW_SURROGATE_FIRST));
         | 
| 112 | 
            +
            #endif
         | 
| 113 | 
            +
            #ifdef PRIVATE_USE_FIRST
         | 
| 114 | 
            +
              else if (ucs >= PRIVATE_USE_FIRST &&
         | 
| 115 | 
            +
                       ucs <= PRIVATE_USE_LAST)
         | 
| 116 | 
            +
                return rb_hash_aref(unicode_data,
         | 
| 117 | 
            +
                                    INT2FIX(PRIVATE_USE_FIRST));
         | 
| 118 | 
            +
            #endif
         | 
| 119 | 
            +
            #ifdef CJK_IDEOGRAPH_EXTENSION_B_FIRST
         | 
| 120 | 
            +
              else if (ucs >= CJK_IDEOGRAPH_EXTENSION_B_FIRST &&
         | 
| 121 | 
            +
                       ucs <= CJK_IDEOGRAPH_EXTENSION_B_LAST)
         | 
| 122 | 
            +
                return rb_hash_aref(unicode_data,
         | 
| 123 | 
            +
                                    INT2FIX(CJK_IDEOGRAPH_EXTENSION_B_FIRST));
         | 
| 124 | 
            +
            #endif
         | 
| 125 | 
            +
            #ifdef CJK_IDEOGRAPH_EXTENSION_C_FIRST
         | 
| 126 | 
            +
              else if (ucs >= CJK_IDEOGRAPH_EXTENSION_C_FIRST &&
         | 
| 127 | 
            +
                       ucs <= CJK_IDEOGRAPH_EXTENSION_C_LAST)
         | 
| 128 | 
            +
                return rb_hash_aref(unicode_data,
         | 
| 129 | 
            +
                                    INT2FIX(CJK_IDEOGRAPH_EXTENSION_C_FIRST));
         | 
| 130 | 
            +
            #endif
         | 
| 131 | 
            +
            #ifdef CJK_IDEOGRAPH_EXTENSION_D_FIRST
         | 
| 132 | 
            +
              else if (ucs >= CJK_IDEOGRAPH_EXTENSION_D_FIRST &&
         | 
| 133 | 
            +
                       ucs <= CJK_IDEOGRAPH_EXTENSION_D_LAST)
         | 
| 134 | 
            +
                return rb_hash_aref(unicode_data,
         | 
| 135 | 
            +
                                    INT2FIX(CJK_IDEOGRAPH_EXTENSION_D_FIRST));
         | 
| 136 | 
            +
            #endif
         | 
| 137 | 
            +
            #ifdef PLANE_15_PRIVATE_USE_FIRST
         | 
| 138 | 
            +
              else if (ucs >= PLANE_15_PRIVATE_USE_FIRST &&
         | 
| 139 | 
            +
                       ucs <= PLANE_15_PRIVATE_USE_LAST)
         | 
| 140 | 
            +
                return rb_hash_aref(unicode_data,
         | 
| 141 | 
            +
                                    INT2FIX(PLANE_15_PRIVATE_USE_FIRST));
         | 
| 142 | 
            +
            #endif
         | 
| 143 | 
            +
            #ifdef PLANE_16_PRIVATE_USE_FIRST
         | 
| 144 | 
            +
              else if (ucs >= PLANE_16_PRIVATE_USE_FIRST &&
         | 
| 145 | 
            +
                       ucs <= PLANE_16_PRIVATE_USE_LAST)
         | 
| 146 | 
            +
                return rb_hash_aref(unicode_data,
         | 
| 147 | 
            +
                                    INT2FIX(PLANE_16_PRIVATE_USE_FIRST));
         | 
| 148 | 
            +
            #endif
         | 
| 149 | 
            +
              return Qnil;
         | 
| 150 | 
            +
            }
         | 
| 151 | 
            +
             | 
| 69 152 | 
             
            static int
         | 
| 70 153 | 
             
            get_cc(int ucs)
         | 
| 71 154 | 
             
            {
         | 
| @@ -77,6 +160,28 @@ get_cc(int ucs) | |
| 77 160 | 
             
              return 0;
         | 
| 78 161 | 
             
            }
         | 
| 79 162 |  | 
| 163 | 
            +
            static int
         | 
| 164 | 
            +
            get_gencat(int ucs)
         | 
| 165 | 
            +
            {
         | 
| 166 | 
            +
              VALUE ch = get_unidata(ucs);
         | 
| 167 | 
            +
             | 
| 168 | 
            +
              if (!NIL_P(ch)) {
         | 
| 169 | 
            +
                return unidata[FIX2INT(ch)].general_category;
         | 
| 170 | 
            +
              }
         | 
| 171 | 
            +
              return c_Cn; /* Unassigned */
         | 
| 172 | 
            +
            }
         | 
| 173 | 
            +
             | 
| 174 | 
            +
            static int
         | 
| 175 | 
            +
            get_eawidth(int ucs)
         | 
| 176 | 
            +
            {
         | 
| 177 | 
            +
              VALUE ch = get_unidata(ucs);
         | 
| 178 | 
            +
             | 
| 179 | 
            +
              if (!NIL_P(ch)) {
         | 
| 180 | 
            +
                return unidata[FIX2INT(ch)].east_asian_width;
         | 
| 181 | 
            +
              }
         | 
| 182 | 
            +
              return w_N; /* Neutral */
         | 
| 183 | 
            +
            }
         | 
| 184 | 
            +
             | 
| 80 185 | 
             
            static const char*
         | 
| 81 186 | 
             
            get_canon(int ucs)
         | 
| 82 187 | 
             
            {
         | 
| @@ -538,8 +643,8 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2) | |
| 538 643 | 
             
              CONVERT_TO_UTF8(str1);
         | 
| 539 644 | 
             
              CONVERT_TO_UTF8(str2);
         | 
| 540 645 | 
             
            #endif
         | 
| 541 | 
            -
               | 
| 542 | 
            -
               | 
| 646 | 
            +
              WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
         | 
| 647 | 
            +
              WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
         | 
| 543 648 | 
             
              WStr_alloc(&result1);
         | 
| 544 649 | 
             
              WStr_alloc(&result2);
         | 
| 545 650 | 
             
              decompose_internal(&wstr1, &result1);
         | 
| @@ -580,8 +685,8 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2) | |
| 580 685 | 
             
              CONVERT_TO_UTF8(str1);
         | 
| 581 686 | 
             
              CONVERT_TO_UTF8(str2);
         | 
| 582 687 | 
             
            #endif
         | 
| 583 | 
            -
               | 
| 584 | 
            -
               | 
| 688 | 
            +
              WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
         | 
| 689 | 
            +
              WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
         | 
| 585 690 | 
             
              WStr_alloc(&result1);
         | 
| 586 691 | 
             
              WStr_alloc(&result2);
         | 
| 587 692 | 
             
              decompose_compat_internal(&wstr1, &result1);
         | 
| @@ -617,7 +722,7 @@ unicode_decompose(VALUE obj, VALUE str) | |
| 617 722 | 
             
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 618 723 | 
             
              CONVERT_TO_UTF8(str);
         | 
| 619 724 | 
             
            #endif
         | 
| 620 | 
            -
               | 
| 725 | 
            +
              WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
         | 
| 621 726 | 
             
              WStr_alloc(&result);
         | 
| 622 727 | 
             
              decompose_internal(&ustr, &result);
         | 
| 623 728 | 
             
              WStr_free(&ustr);
         | 
| @@ -643,7 +748,7 @@ unicode_decompose_safe(VALUE obj, VALUE str) | |
| 643 748 | 
             
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 644 749 | 
             
              CONVERT_TO_UTF8(str);
         | 
| 645 750 | 
             
            #endif
         | 
| 646 | 
            -
               | 
| 751 | 
            +
              WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
         | 
| 647 752 | 
             
              WStr_alloc(&result);
         | 
| 648 753 | 
             
              decompose_safe_internal(&ustr, &result);
         | 
| 649 754 | 
             
              WStr_free(&ustr);
         | 
| @@ -669,7 +774,7 @@ unicode_decompose_compat(VALUE obj, VALUE str) | |
| 669 774 | 
             
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 670 775 | 
             
              CONVERT_TO_UTF8(str);
         | 
| 671 776 | 
             
            #endif
         | 
| 672 | 
            -
               | 
| 777 | 
            +
              WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
         | 
| 673 778 | 
             
              WStr_alloc(&result);
         | 
| 674 779 | 
             
              decompose_compat_internal(&ustr, &result);
         | 
| 675 780 | 
             
              WStr_free(&ustr);
         | 
| @@ -695,7 +800,7 @@ unicode_compose(VALUE obj, VALUE str) | |
| 695 800 | 
             
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 696 801 | 
             
              CONVERT_TO_UTF8(str);
         | 
| 697 802 | 
             
            #endif
         | 
| 698 | 
            -
               | 
| 803 | 
            +
              WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
         | 
| 699 804 | 
             
              sort_canonical(&ustr);
         | 
| 700 805 | 
             
              WStr_alloc(&result);
         | 
| 701 806 | 
             
              compose_internal(&ustr, &result);
         | 
| @@ -722,7 +827,7 @@ unicode_normalize_C(VALUE obj, VALUE str) | |
| 722 827 | 
             
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 723 828 | 
             
              CONVERT_TO_UTF8(str);
         | 
| 724 829 | 
             
            #endif
         | 
| 725 | 
            -
               | 
| 830 | 
            +
              WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
         | 
| 726 831 | 
             
              WStr_alloc(&ustr2);
         | 
| 727 832 | 
             
              decompose_internal(&ustr1, &ustr2);
         | 
| 728 833 | 
             
              WStr_free(&ustr1);
         | 
| @@ -752,7 +857,7 @@ unicode_normalize_safe(VALUE obj, VALUE str) | |
| 752 857 | 
             
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 753 858 | 
             
              CONVERT_TO_UTF8(str);
         | 
| 754 859 | 
             
            #endif
         | 
| 755 | 
            -
               | 
| 860 | 
            +
              WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
         | 
| 756 861 | 
             
              WStr_alloc(&ustr2);
         | 
| 757 862 | 
             
              decompose_safe_internal(&ustr1, &ustr2);
         | 
| 758 863 | 
             
              WStr_free(&ustr1);
         | 
| @@ -782,7 +887,7 @@ unicode_normalize_KC(VALUE obj, VALUE str) | |
| 782 887 | 
             
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 783 888 | 
             
              CONVERT_TO_UTF8(str);
         | 
| 784 889 | 
             
            #endif
         | 
| 785 | 
            -
               | 
| 890 | 
            +
              WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
         | 
| 786 891 | 
             
              WStr_alloc(&ustr2);
         | 
| 787 892 | 
             
              decompose_compat_internal(&ustr1, &ustr2);
         | 
| 788 893 | 
             
              WStr_free(&ustr1);
         | 
| @@ -811,7 +916,7 @@ unicode_upcase(VALUE obj, VALUE str) | |
| 811 916 | 
             
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 812 917 | 
             
              CONVERT_TO_UTF8(str);
         | 
| 813 918 | 
             
            #endif
         | 
| 814 | 
            -
               | 
| 919 | 
            +
              WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
         | 
| 815 920 | 
             
              WStr_alloc(&result);
         | 
| 816 921 | 
             
              upcase_internal(&ustr, &result);
         | 
| 817 922 | 
             
              //sort_canonical(&result);
         | 
| @@ -837,7 +942,7 @@ unicode_downcase(VALUE obj, VALUE str) | |
| 837 942 | 
             
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 838 943 | 
             
              CONVERT_TO_UTF8(str);
         | 
| 839 944 | 
             
            #endif
         | 
| 840 | 
            -
               | 
| 945 | 
            +
              WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
         | 
| 841 946 | 
             
              WStr_alloc(&result);
         | 
| 842 947 | 
             
              downcase_internal(&ustr, &result);
         | 
| 843 948 | 
             
              //sort_canonical(&result);
         | 
| @@ -868,7 +973,7 @@ unicode_capitalize(VALUE obj, VALUE str) | |
| 868 973 | 
             
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 869 974 | 
             
              CONVERT_TO_UTF8(str);
         | 
| 870 975 | 
             
            #endif
         | 
| 871 | 
            -
               | 
| 976 | 
            +
              WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
         | 
| 872 977 | 
             
              WStr_alloc(&result);
         | 
| 873 978 | 
             
              capitalize_internal(&ustr, &result);
         | 
| 874 979 | 
             
              //sort_canonical(&result);
         | 
| @@ -882,6 +987,248 @@ unicode_capitalize(VALUE obj, VALUE str) | |
| 882 987 | 
             
              return vret;
         | 
| 883 988 | 
             
            }
         | 
| 884 989 |  | 
| 990 | 
            +
            typedef struct _get_categories_param {
         | 
| 991 | 
            +
              WString* wstr;
         | 
| 992 | 
            +
              VALUE str;
         | 
| 993 | 
            +
              VALUE* catname;
         | 
| 994 | 
            +
            } get_categories_param;
         | 
| 995 | 
            +
             | 
| 996 | 
            +
            static VALUE
         | 
| 997 | 
            +
            get_categories_internal(get_categories_param* param)
         | 
| 998 | 
            +
            {
         | 
| 999 | 
            +
              WString* wstr = param->wstr;
         | 
| 1000 | 
            +
              VALUE str = param->str;
         | 
| 1001 | 
            +
              VALUE* catname = param->catname;
         | 
| 1002 | 
            +
              int pos;
         | 
| 1003 | 
            +
              int block_p = rb_block_given_p();
         | 
| 1004 | 
            +
              volatile VALUE ret = str;
         | 
| 1005 | 
            +
             | 
| 1006 | 
            +
              if (!block_p)
         | 
| 1007 | 
            +
                ret = rb_ary_new();
         | 
| 1008 | 
            +
              for (pos = 0; pos < wstr->len; pos++) {
         | 
| 1009 | 
            +
                int gencat = get_gencat(wstr->str[pos]);
         | 
| 1010 | 
            +
                if (!block_p)
         | 
| 1011 | 
            +
                  rb_ary_push(ret, catname[gencat]);
         | 
| 1012 | 
            +
                else {
         | 
| 1013 | 
            +
                  rb_yield(catname[gencat]);
         | 
| 1014 | 
            +
                }
         | 
| 1015 | 
            +
              }
         | 
| 1016 | 
            +
             
         | 
| 1017 | 
            +
              return ret;
         | 
| 1018 | 
            +
            }
         | 
| 1019 | 
            +
             | 
| 1020 | 
            +
            VALUE
         | 
| 1021 | 
            +
            get_categories_ensure(WString* wstr)
         | 
| 1022 | 
            +
            {
         | 
| 1023 | 
            +
              WStr_free(wstr);
         | 
| 1024 | 
            +
              return Qnil;
         | 
| 1025 | 
            +
            }
         | 
| 1026 | 
            +
             | 
| 1027 | 
            +
            VALUE
         | 
| 1028 | 
            +
            unicode_get_categories(VALUE obj, VALUE str)
         | 
| 1029 | 
            +
            {
         | 
| 1030 | 
            +
              WString wstr;
         | 
| 1031 | 
            +
              get_categories_param param = { &wstr, str, catname_long };
         | 
| 1032 | 
            +
             | 
| 1033 | 
            +
              Check_Type(str, T_STRING);
         | 
| 1034 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 1035 | 
            +
              CONVERT_TO_UTF8(str);
         | 
| 1036 | 
            +
            #endif
         | 
| 1037 | 
            +
              WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
         | 
| 1038 | 
            +
             | 
| 1039 | 
            +
              return rb_ensure(get_categories_internal, (VALUE)&param,
         | 
| 1040 | 
            +
                               get_categories_ensure, (VALUE)&wstr);
         | 
| 1041 | 
            +
              /* wstr will be freed in get_categories_ensure() */
         | 
| 1042 | 
            +
            }
         | 
| 1043 | 
            +
             | 
| 1044 | 
            +
             | 
| 1045 | 
            +
            VALUE
         | 
| 1046 | 
            +
            unicode_get_abbr_categories(VALUE obj, VALUE str)
         | 
| 1047 | 
            +
            {
         | 
| 1048 | 
            +
              WString wstr;
         | 
| 1049 | 
            +
              get_categories_param param = { &wstr, str, catname_abbr };
         | 
| 1050 | 
            +
             | 
| 1051 | 
            +
              Check_Type(str, T_STRING);
         | 
| 1052 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 1053 | 
            +
              CONVERT_TO_UTF8(str);
         | 
| 1054 | 
            +
            #endif
         | 
| 1055 | 
            +
              WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
         | 
| 1056 | 
            +
             | 
| 1057 | 
            +
              return rb_ensure(get_categories_internal, (VALUE)&param,
         | 
| 1058 | 
            +
                               get_categories_ensure, (VALUE)&wstr);
         | 
| 1059 | 
            +
              /* wstr will be freed in get_categories_ensure() */
         | 
| 1060 | 
            +
            }
         | 
| 1061 | 
            +
             | 
| 1062 | 
            +
            VALUE
         | 
| 1063 | 
            +
            unicode_wcswidth(int argc, VALUE* argv, VALUE obj)
         | 
| 1064 | 
            +
            {
         | 
| 1065 | 
            +
              WString wstr;
         | 
| 1066 | 
            +
              int i, count;
         | 
| 1067 | 
            +
              int width = 0;
         | 
| 1068 | 
            +
              int cjk_p = 0;
         | 
| 1069 | 
            +
              VALUE str;
         | 
| 1070 | 
            +
              VALUE cjk;
         | 
| 1071 | 
            +
             | 
| 1072 | 
            +
              count = rb_scan_args(argc, argv, "11", &str, &cjk);
         | 
| 1073 | 
            +
              if (count > 1)
         | 
| 1074 | 
            +
                cjk_p = RTEST(cjk);
         | 
| 1075 | 
            +
              Check_Type(str, T_STRING);
         | 
| 1076 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 1077 | 
            +
              CONVERT_TO_UTF8(str);
         | 
| 1078 | 
            +
            #endif
         | 
| 1079 | 
            +
              WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
         | 
| 1080 | 
            +
              for (i = 0; i <wstr.len; i++) {
         | 
| 1081 | 
            +
                int c = wstr.str[i];
         | 
| 1082 | 
            +
                int cat = get_gencat(c);
         | 
| 1083 | 
            +
                int eaw = get_eawidth(c);
         | 
| 1084 | 
            +
                if ((c > 0 && c < 32) || (c >= 0x7f && c < 0xa0)) {
         | 
| 1085 | 
            +
                  /* Control Characters */
         | 
| 1086 | 
            +
                  width = -1;
         | 
| 1087 | 
            +
                  break;
         | 
| 1088 | 
            +
                }
         | 
| 1089 | 
            +
                else if (c != 0x00ad && /* SOFT HYPHEN */
         | 
| 1090 | 
            +
                         (cat == c_Mn || cat == c_Me || /* Non-spacing Marks */
         | 
| 1091 | 
            +
                          cat == c_Cf || /* Format */
         | 
| 1092 | 
            +
                          c == 0 || /* NUL */
         | 
| 1093 | 
            +
                          (c >= 0x1160 && c <= 0x11ff))) /* HANGUL JUNGSEONG/JONGSEONG */
         | 
| 1094 | 
            +
                  /* zero width */ ;
         | 
| 1095 | 
            +
                else if (eaw == w_F || eaw == w_W || /* Fullwidth or Wide */
         | 
| 1096 | 
            +
                         (c >= 0x4db6 && c <= 0x4dbf) || /* CJK Reserved */
         | 
| 1097 | 
            +
                         (c >= 0x9fcd && c <= 0x9fff) || /* CJK Reserved */
         | 
| 1098 | 
            +
                         (c >= 0xfa6e && c <= 0xfa6f) || /* CJK Reserved */
         | 
| 1099 | 
            +
                         (c >= 0xfada && c <= 0xfaff) || /* CJK Reserved */
         | 
| 1100 | 
            +
                         (c >= 0x2a6d7 && c <= 0x2a6ff) || /* CJK Reserved */
         | 
| 1101 | 
            +
                         (c >= 0x2b735 && c <= 0x2b73f) || /* CJK Reserved */
         | 
| 1102 | 
            +
                         (c >= 0x2b81e && c <= 0x2f7ff) || /* CJK Reserved */
         | 
| 1103 | 
            +
                         (c >= 0x2fa1e && c <= 0x2fffd) || /* CJK Reserved */
         | 
| 1104 | 
            +
                         (c >= 0x30000 && c <= 0x3fffd) || /* CJK Reserved */
         | 
| 1105 | 
            +
                         (cjk_p && eaw == w_A)) /* East Asian Ambiguous */
         | 
| 1106 | 
            +
                  width += 2;
         | 
| 1107 | 
            +
                else
         | 
| 1108 | 
            +
                  width++; /* Halfwidth or Neutral */
         | 
| 1109 | 
            +
              }
         | 
| 1110 | 
            +
              WStr_free(&wstr);
         | 
| 1111 | 
            +
             | 
| 1112 | 
            +
              return INT2FIX(width);
         | 
| 1113 | 
            +
            }
         | 
| 1114 | 
            +
             | 
| 1115 | 
            +
            VALUE
         | 
| 1116 | 
            +
            wstring_to_rstring(WString* wstr, int start, int len) {
         | 
| 1117 | 
            +
              UString ret;
         | 
| 1118 | 
            +
              volatile VALUE vret;
         | 
| 1119 | 
            +
             | 
| 1120 | 
            +
              UniStr_alloc(&ret);
         | 
| 1121 | 
            +
              WStr_convertIntoUString2(wstr, start, len, &ret);
         | 
| 1122 | 
            +
              vret = ENC_(rb_str_new((char*)ret.str, ret.len));
         | 
| 1123 | 
            +
              UniStr_free(&ret);
         | 
| 1124 | 
            +
             | 
| 1125 | 
            +
              return vret;
         | 
| 1126 | 
            +
            }
         | 
| 1127 | 
            +
             | 
| 1128 | 
            +
            typedef struct _get_text_elements_param {
         | 
| 1129 | 
            +
              WString* wstr;
         | 
| 1130 | 
            +
              VALUE str;
         | 
| 1131 | 
            +
            } get_text_elements_param;
         | 
| 1132 | 
            +
             | 
| 1133 | 
            +
            VALUE
         | 
| 1134 | 
            +
            get_text_elements_internal(get_text_elements_param* param)
         | 
| 1135 | 
            +
            {
         | 
| 1136 | 
            +
              WString* wstr = param->wstr;
         | 
| 1137 | 
            +
              VALUE str = param->str;
         | 
| 1138 | 
            +
              int start_pos;
         | 
| 1139 | 
            +
              int block_p = rb_block_given_p();
         | 
| 1140 | 
            +
              volatile VALUE ret = str;
         | 
| 1141 | 
            +
             | 
| 1142 | 
            +
              if (!block_p)
         | 
| 1143 | 
            +
                ret = rb_ary_new();
         | 
| 1144 | 
            +
              for (start_pos = 0; start_pos < wstr->len;) {
         | 
| 1145 | 
            +
                int c0 = wstr->str[start_pos];
         | 
| 1146 | 
            +
                int cat = get_gencat(c0);
         | 
| 1147 | 
            +
                int length = 1;
         | 
| 1148 | 
            +
                int j;
         | 
| 1149 | 
            +
             | 
| 1150 | 
            +
                if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
         | 
| 1151 | 
            +
                  volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
         | 
| 1152 | 
            +
                  if (!block_p)
         | 
| 1153 | 
            +
                    rb_ary_push(ret, rstr);
         | 
| 1154 | 
            +
                  else
         | 
| 1155 | 
            +
                    rb_yield(rstr);
         | 
| 1156 | 
            +
                  start_pos++;
         | 
| 1157 | 
            +
                  continue;
         | 
| 1158 | 
            +
                }
         | 
| 1159 | 
            +
             | 
| 1160 | 
            +
                for (j = start_pos + 1; j < wstr->len; j++) {
         | 
| 1161 | 
            +
                  int c1 = wstr->str[j];
         | 
| 1162 | 
            +
                  int cat = get_gencat(c1);
         | 
| 1163 | 
            +
                  if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
         | 
| 1164 | 
            +
                      j + 1 < wstr->len &&
         | 
| 1165 | 
            +
                      c1 >= VBASE && c1 < VBASE + VCOUNT &&
         | 
| 1166 | 
            +
                      wstr->str[j+1] >= TBASE && wstr->str[j+1] < TBASE + TCOUNT) {
         | 
| 1167 | 
            +
                    /* Hangul L+V+T */
         | 
| 1168 | 
            +
                    length += 2;
         | 
| 1169 | 
            +
                    j++;
         | 
| 1170 | 
            +
                  }
         | 
| 1171 | 
            +
                  else if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
         | 
| 1172 | 
            +
                           c1 >= VBASE && c1< VBASE + VCOUNT) {
         | 
| 1173 | 
            +
                    /* Hangul L+V */
         | 
| 1174 | 
            +
                    length++;
         | 
| 1175 | 
            +
                  }
         | 
| 1176 | 
            +
                  else if (c0 >= SBASE && c0 < SBASE + SCOUNT &&
         | 
| 1177 | 
            +
                           (c0 - SBASE) % TCOUNT == 0 &&
         | 
| 1178 | 
            +
                           c1 >= TBASE && c1 < TBASE + TCOUNT) {
         | 
| 1179 | 
            +
                    /* Hangul LV+T */
         | 
| 1180 | 
            +
                    length++;
         | 
| 1181 | 
            +
                  }
         | 
| 1182 | 
            +
                  else if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
         | 
| 1183 | 
            +
                    /* Mark */
         | 
| 1184 | 
            +
                    length++;
         | 
| 1185 | 
            +
                  }
         | 
| 1186 | 
            +
                  else {
         | 
| 1187 | 
            +
                    volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
         | 
| 1188 | 
            +
                    if (!block_p)
         | 
| 1189 | 
            +
                      rb_ary_push(ret, rstr);
         | 
| 1190 | 
            +
                    else
         | 
| 1191 | 
            +
                      rb_yield(rstr);
         | 
| 1192 | 
            +
                    length = 0;
         | 
| 1193 | 
            +
                    break;
         | 
| 1194 | 
            +
                  }
         | 
| 1195 | 
            +
                }
         | 
| 1196 | 
            +
                if (length > 0) {
         | 
| 1197 | 
            +
                  volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
         | 
| 1198 | 
            +
                  if (!block_p)
         | 
| 1199 | 
            +
                    rb_ary_push(ret, rstr);
         | 
| 1200 | 
            +
                  else
         | 
| 1201 | 
            +
                    rb_yield(rstr);
         | 
| 1202 | 
            +
                }
         | 
| 1203 | 
            +
                start_pos = j;
         | 
| 1204 | 
            +
              }
         | 
| 1205 | 
            +
              return ret;
         | 
| 1206 | 
            +
            }
         | 
| 1207 | 
            +
             | 
| 1208 | 
            +
            VALUE
         | 
| 1209 | 
            +
            get_text_elements_ensure(WString* wstr)
         | 
| 1210 | 
            +
            {
         | 
| 1211 | 
            +
              WStr_free(wstr);
         | 
| 1212 | 
            +
              return Qnil;
         | 
| 1213 | 
            +
            }
         | 
| 1214 | 
            +
             | 
| 1215 | 
            +
            VALUE
         | 
| 1216 | 
            +
            unicode_get_text_elements(VALUE obj, VALUE str)
         | 
| 1217 | 
            +
            {
         | 
| 1218 | 
            +
              WString wstr;
         | 
| 1219 | 
            +
              get_text_elements_param param = { &wstr, str };
         | 
| 1220 | 
            +
             | 
| 1221 | 
            +
              Check_Type(str, T_STRING);
         | 
| 1222 | 
            +
            #ifdef HAVE_RUBY_ENCODING_H
         | 
| 1223 | 
            +
              CONVERT_TO_UTF8(str);
         | 
| 1224 | 
            +
            #endif
         | 
| 1225 | 
            +
              WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
         | 
| 1226 | 
            +
             | 
| 1227 | 
            +
              return rb_ensure(get_text_elements_internal, (VALUE)¶m,
         | 
| 1228 | 
            +
                               get_text_elements_ensure, (VALUE)&wstr);
         | 
| 1229 | 
            +
              /* wstr will be freed in get_text_elements_ensure() */
         | 
| 1230 | 
            +
            }
         | 
| 1231 | 
            +
             | 
| 885 1232 | 
             
            void
         | 
| 886 1233 | 
             
            Init_unicode_native()
         | 
| 887 1234 | 
             
            {
         | 
| @@ -909,6 +1256,13 @@ Init_unicode_native() | |
| 909 1256 | 
             
                }
         | 
| 910 1257 | 
             
              }
         | 
| 911 1258 |  | 
| 1259 | 
            +
              for (i = 0; i < c_Cn + 1; i++) {
         | 
| 1260 | 
            +
                catname_abbr[i] = ID2SYM(rb_intern(gencat_abbr[i]));
         | 
| 1261 | 
            +
                catname_long[i] = ID2SYM(rb_intern(gencat_long[i]));
         | 
| 1262 | 
            +
                rb_global_variable(&catname_abbr[i]);
         | 
| 1263 | 
            +
                rb_global_variable(&catname_long[i]);
         | 
| 1264 | 
            +
              }
         | 
| 1265 | 
            +
             | 
| 912 1266 | 
             
              rb_define_module_function(mUnicode, "strcmp",
         | 
| 913 1267 | 
             
            			    unicode_strcmp, 2);
         | 
| 914 1268 | 
             
              rb_define_module_function(mUnicode, "strcmp_compat",
         | 
| @@ -957,6 +1311,15 @@ Init_unicode_native() | |
| 957 1311 | 
             
              rb_define_module_function(mUnicode, "capitalize",
         | 
| 958 1312 | 
             
            			    unicode_capitalize, 1);
         | 
| 959 1313 |  | 
| 1314 | 
            +
              rb_define_module_function(mUnicode, "categories",
         | 
| 1315 | 
            +
            			    unicode_get_categories, 1);
         | 
| 1316 | 
            +
              rb_define_module_function(mUnicode, "abbr_categories",
         | 
| 1317 | 
            +
            			    unicode_get_abbr_categories, 1);
         | 
| 1318 | 
            +
              rb_define_module_function(mUnicode, "width",
         | 
| 1319 | 
            +
            			    unicode_wcswidth, -1);
         | 
| 1320 | 
            +
              rb_define_module_function(mUnicode, "text_elements",
         | 
| 1321 | 
            +
            			    unicode_get_text_elements, 1);
         | 
| 1322 | 
            +
             | 
| 960 1323 | 
             
              rb_define_const(mUnicode, "VERSION",
         | 
| 961 1324 | 
             
            		  rb_str_new2(UNICODE_VERSION));
         | 
| 962 1325 | 
             
            }
         |