unicode 0.4.2-x86-mswin32-60 → 0.4.3-x86-mswin32-60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +29 -7
- data/ext/unicode/unicode.c +379 -16
- data/ext/unicode/unidata.map +24536 -24435
- data/ext/unicode/wstring.c +69 -1
- data/ext/unicode/wstring.h +2 -0
- data/lib/unicode/1.8/unicode_native.so +0 -0
- data/lib/unicode/1.9/unicode_native.so +0 -0
- data/tools/README +3 -2
- data/tools/mkunidata.rb +136 -12
- data/unicode.gemspec +2 -2
- metadata +5 -5
    
        data/ext/unicode/wstring.c
    CHANGED
    
    | @@ -43,7 +43,10 @@ WStr_free(WString* str) | |
| 43 43 | 
             
            {
         | 
| 44 44 | 
             
              str->size = 0;
         | 
| 45 45 | 
             
              str->len = 0;
         | 
| 46 | 
            -
               | 
| 46 | 
            +
              if (str->str) {
         | 
| 47 | 
            +
                free(str->str);
         | 
| 48 | 
            +
                str->str = NULL;
         | 
| 49 | 
            +
              }
         | 
| 47 50 | 
             
            }
         | 
| 48 51 |  | 
| 49 52 | 
             
            int
         | 
| @@ -164,6 +167,59 @@ WStr_allocWithUTF8(WString* s, const char* in) | |
| 164 167 | 
             
              return s;
         | 
| 165 168 | 
             
            }
         | 
| 166 169 |  | 
| 170 | 
            +
            WString*
         | 
| 171 | 
            +
            WStr_allocWithUTF8L(WString* s, const char* in, int len)
         | 
| 172 | 
            +
            {
         | 
| 173 | 
            +
              int i;
         | 
| 174 | 
            +
              int u = 0;
         | 
| 175 | 
            +
              int rest = 0;
         | 
| 176 | 
            +
             | 
| 177 | 
            +
              WStr_alloc(s);
         | 
| 178 | 
            +
              if (in == NULL)
         | 
| 179 | 
            +
                return s;
         | 
| 180 | 
            +
              for (i = 0; i < len; i++) {
         | 
| 181 | 
            +
                unsigned char c = in[i];
         | 
| 182 | 
            +
                if ((c & 0xc0) == 0x80) {
         | 
| 183 | 
            +
                  if (rest == 0)
         | 
| 184 | 
            +
            	return NULL;
         | 
| 185 | 
            +
                  u = (u << 6) | (c & 63);
         | 
| 186 | 
            +
                  rest--;
         | 
| 187 | 
            +
                  if (rest == 0) {
         | 
| 188 | 
            +
            	WStr_addWChar(s, u);
         | 
| 189 | 
            +
                  }
         | 
| 190 | 
            +
                }
         | 
| 191 | 
            +
                else if ((c & 0x80) == 0) {      /* 0b0nnnnnnn (7bit) */
         | 
| 192 | 
            +
                  WStr_addWChar(s, c);
         | 
| 193 | 
            +
                  rest = 0;
         | 
| 194 | 
            +
                }
         | 
| 195 | 
            +
                else if ((c & 0xe0) == 0xc0) {      /* 0b110nnnnn (11bit) */
         | 
| 196 | 
            +
                  rest = 1;
         | 
| 197 | 
            +
                  u = c & 31;
         | 
| 198 | 
            +
                }
         | 
| 199 | 
            +
                else if ((c & 0xf0) == 0xe0) {      /* 0b1110nnnn (16bit) */
         | 
| 200 | 
            +
                  rest = 2;
         | 
| 201 | 
            +
                  u = c & 15;
         | 
| 202 | 
            +
                }
         | 
| 203 | 
            +
                else if ((c & 0xf8) == 0xf0) {      /* 0b11110nnn (21bit) */
         | 
| 204 | 
            +
                  rest = 3;
         | 
| 205 | 
            +
                  u = c & 7;
         | 
| 206 | 
            +
                }
         | 
| 207 | 
            +
                else if ((c & 0xfc) == 0xf8) {      /* 0b111110nn (26bit) */
         | 
| 208 | 
            +
                  rest = 4;
         | 
| 209 | 
            +
                  u = c & 3;
         | 
| 210 | 
            +
                }
         | 
| 211 | 
            +
                else if ((c & 0xfe) == 0xfc) {      /* 0b1111110n (31bit) */
         | 
| 212 | 
            +
                  rest = 5;
         | 
| 213 | 
            +
                  u = c & 1;
         | 
| 214 | 
            +
                }
         | 
| 215 | 
            +
                else {
         | 
| 216 | 
            +
                  return NULL;
         | 
| 217 | 
            +
                }
         | 
| 218 | 
            +
              }
         | 
| 219 | 
            +
             | 
| 220 | 
            +
              return s;
         | 
| 221 | 
            +
            }
         | 
| 222 | 
            +
             | 
| 167 223 | 
             
            UString*
         | 
| 168 224 | 
             
            WStr_convertIntoUString(WString* wstr, UString* ustr)
         | 
| 169 225 | 
             
            {
         | 
| @@ -176,6 +232,18 @@ WStr_convertIntoUString(WString* wstr, UString* ustr) | |
| 176 232 | 
             
              return ustr;
         | 
| 177 233 | 
             
            }
         | 
| 178 234 |  | 
| 235 | 
            +
            UString*
         | 
| 236 | 
            +
            WStr_convertIntoUString2(WString* wstr, int start, int len, UString* ustr)
         | 
| 237 | 
            +
            {
         | 
| 238 | 
            +
              int i;
         | 
| 239 | 
            +
             | 
| 240 | 
            +
              for (i = start; i < wstr->len && i < start + len; i++) {
         | 
| 241 | 
            +
                UniStr_addWChar(ustr, wstr->str[i]);
         | 
| 242 | 
            +
              }
         | 
| 243 | 
            +
             | 
| 244 | 
            +
              return ustr;
         | 
| 245 | 
            +
            }
         | 
| 246 | 
            +
             | 
| 179 247 | 
             
            void
         | 
| 180 248 | 
             
            WStr_dump(WString* s)
         | 
| 181 249 | 
             
            {
         | 
    
        data/ext/unicode/wstring.h
    CHANGED
    
    | @@ -24,6 +24,7 @@ typedef struct _WString { | |
| 24 24 |  | 
| 25 25 | 
             
            WString* WStr_alloc(WString* str);
         | 
| 26 26 | 
             
            WString* WStr_allocWithUTF8(WString* s, const char* u);
         | 
| 27 | 
            +
            WString* WStr_allocWithUTF8L(WString* s, const char* u, int len);
         | 
| 27 28 | 
             
            WString* WStr_enlarge(WString* str, int size);
         | 
| 28 29 | 
             
            void WStr_free(WString* str);
         | 
| 29 30 | 
             
            int WStr_addWChars(WString* s, const int* a, int len);
         | 
| @@ -32,6 +33,7 @@ int WStr_pushWString(WString* s, const WString* add); | |
| 32 33 | 
             
            int WStr_addWChar2(WString* s, int a1, int a2);
         | 
| 33 34 | 
             
            int WStr_addWChar3(WString* s, int a1, int a2, int a3);
         | 
| 34 35 | 
             
            UString* WStr_convertIntoUString(WString* wstr, UString* ustr);
         | 
| 36 | 
            +
            UString* WStr_convertIntoUString2(WString* wstr, int start, int len, UString* ustr);
         | 
| 35 37 | 
             
            void WStr_dump(WString* s);
         | 
| 36 38 |  | 
| 37 39 | 
             
            #ifdef __cplusplus
         | 
| Binary file | 
| Binary file | 
    
        data/tools/README
    CHANGED
    
    | @@ -1,6 +1,7 @@ | |
| 1 1 | 
             
            The bundled unidata.map is created from UnicodeData.txt,
         | 
| 2 | 
            -
            DerivedNormalizationProps.txt  | 
| 2 | 
            +
            DerivedNormalizationProps.txt, SpecialCasing.txt and EastAsianWidth.txt
         | 
| 3 | 
            +
            of Unicode 6.0.
         | 
| 3 4 |  | 
| 4 5 | 
             
            To update unidata.map,
         | 
| 5 6 |  | 
| 6 | 
            -
              ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt > unidata.map
         | 
| 7 | 
            +
              ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt EastAsianWidth.txt  > unidata.map
         | 
    
        data/tools/mkunidata.rb
    CHANGED
    
    | @@ -7,22 +7,102 @@ | |
| 7 7 | 
             
            HEAD=<<EOS
         | 
| 8 8 | 
             
            /*
         | 
| 9 9 | 
             
             * UnicodeData
         | 
| 10 | 
            -
             * Copyright 1999, 2004, 2010 by yoshidam
         | 
| 10 | 
            +
             * Copyright 1999, 2004, 2010, 2012 by yoshidam
         | 
| 11 11 | 
             
             *
         | 
| 12 12 | 
             
             */
         | 
| 13 13 |  | 
| 14 14 | 
             
            #ifndef _UNIDATA_MAP
         | 
| 15 15 | 
             
            #define _UNIDATA_MAP
         | 
| 16 16 |  | 
| 17 | 
            +
            EOS
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            HEAD1=<<EOS
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            enum GeneralCategory {
         | 
| 22 | 
            +
              /* Letter */
         | 
| 23 | 
            +
              c_Lu = 1, c_Ll, c_Lt, c_LC, c_Lm, c_Lo,
         | 
| 24 | 
            +
              /* Mark */
         | 
| 25 | 
            +
              c_Mn, c_Mc, c_Me,
         | 
| 26 | 
            +
              /* Number */
         | 
| 27 | 
            +
              c_Nd, c_Nl, c_No,
         | 
| 28 | 
            +
              /* Punctuation */
         | 
| 29 | 
            +
              c_Pc, c_Pd, c_Ps, c_Pe, c_Pi, c_Pf, c_Po,
         | 
| 30 | 
            +
              /* Symbol */
         | 
| 31 | 
            +
              c_Sm, c_Sc, c_Sk, c_So,
         | 
| 32 | 
            +
              /* Separator */
         | 
| 33 | 
            +
              c_Zs, c_Zl, c_Zp,
         | 
| 34 | 
            +
              /* Other */
         | 
| 35 | 
            +
              c_Cc, c_Cf, c_Cs, c_Co, c_Cn
         | 
| 36 | 
            +
            };
         | 
| 37 | 
            +
             | 
| 38 | 
            +
            const char* const gencat_abbr[] = {
         | 
| 39 | 
            +
              "", /* 0 */
         | 
| 40 | 
            +
              /* Letter */
         | 
| 41 | 
            +
              "Lu", "Ll", "Lt", "LC", "Lm", "Lo",
         | 
| 42 | 
            +
              /* Mark */
         | 
| 43 | 
            +
              "Mn", "Mc", "Me",
         | 
| 44 | 
            +
              /* Number */
         | 
| 45 | 
            +
              "Nd", "Nl", "No",
         | 
| 46 | 
            +
              /* Punctuation */
         | 
| 47 | 
            +
              "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
         | 
| 48 | 
            +
              /* Symbol */
         | 
| 49 | 
            +
              "Sm", "Sc", "Sk", "So",
         | 
| 50 | 
            +
              /* Separator */
         | 
| 51 | 
            +
              "Zs", "Zl", "Zp",
         | 
| 52 | 
            +
              /* Other */
         | 
| 53 | 
            +
              "Cc", "Cf", "Cs", "Co", "Cn"
         | 
| 54 | 
            +
            };
         | 
| 55 | 
            +
             | 
| 56 | 
            +
            const char* const gencat_long[] = {
         | 
| 57 | 
            +
              "",
         | 
| 58 | 
            +
              "Uppercase_Letter",
         | 
| 59 | 
            +
              "Lowercase_Letter",
         | 
| 60 | 
            +
              "Titlecase_Letter",
         | 
| 61 | 
            +
              "Cased_Letter",
         | 
| 62 | 
            +
              "Modifier_Letter",
         | 
| 63 | 
            +
              "Other_Letter",
         | 
| 64 | 
            +
              "Nonspacing_Mark",
         | 
| 65 | 
            +
              "Spacing_Mark",
         | 
| 66 | 
            +
              "Enclosing_Mark",
         | 
| 67 | 
            +
              "Decimal_Number",
         | 
| 68 | 
            +
              "Letter_Number",
         | 
| 69 | 
            +
              "Other_Number",
         | 
| 70 | 
            +
              "Connector_Punctuation",
         | 
| 71 | 
            +
              "Dash_Punctuation",
         | 
| 72 | 
            +
              "Open_Punctuation",
         | 
| 73 | 
            +
              "Close_Punctuation",
         | 
| 74 | 
            +
              "Initial_Punctuation",
         | 
| 75 | 
            +
              "Final_Punctuation",
         | 
| 76 | 
            +
              "Other_Punctuation",
         | 
| 77 | 
            +
              "Math_Symbol",
         | 
| 78 | 
            +
              "Currency_Symbol",
         | 
| 79 | 
            +
              "Modifier_Symbol",
         | 
| 80 | 
            +
              "Other_Symbol",
         | 
| 81 | 
            +
              "Space_Separator",
         | 
| 82 | 
            +
              "Line_Separator",
         | 
| 83 | 
            +
              "Paragraph_Separator",
         | 
| 84 | 
            +
              "Control",
         | 
| 85 | 
            +
              "Format",
         | 
| 86 | 
            +
              "Surrogate",
         | 
| 87 | 
            +
              "Private_Use",
         | 
| 88 | 
            +
              "Unassigned"
         | 
| 89 | 
            +
            };
         | 
| 90 | 
            +
             | 
| 91 | 
            +
            enum EastAsianWidth {
         | 
| 92 | 
            +
              w_N = 1, w_A, w_H, w_W, w_F, w_Na
         | 
| 93 | 
            +
            };
         | 
| 94 | 
            +
             | 
| 17 95 | 
             
            struct unicode_data {
         | 
| 18 96 | 
             
              const int code;
         | 
| 19 | 
            -
              const int combining_class;
         | 
| 20 | 
            -
              const int exclusion;
         | 
| 21 97 | 
             
              const char* const canon;
         | 
| 22 98 | 
             
              const char* const compat;
         | 
| 23 | 
            -
              const char* uppercase;
         | 
| 24 | 
            -
              const char* lowercase;
         | 
| 25 | 
            -
              const char* titlecase;
         | 
| 99 | 
            +
              const char* const uppercase;
         | 
| 100 | 
            +
              const char* const lowercase;
         | 
| 101 | 
            +
              const char* const titlecase;
         | 
| 102 | 
            +
              const unsigned char combining_class;
         | 
| 103 | 
            +
              const unsigned char exclusion;
         | 
| 104 | 
            +
              const unsigned char general_category;
         | 
| 105 | 
            +
              const unsigned char east_asian_width;
         | 
| 26 106 | 
             
            };
         | 
| 27 107 |  | 
| 28 108 | 
             
            static const struct unicode_data unidata[] = {
         | 
| @@ -81,6 +161,11 @@ def printstr(str) | |
| 81 161 | 
             
              return '"' + ret + '"'
         | 
| 82 162 | 
             
            end
         | 
| 83 163 |  | 
| 164 | 
            +
            if ARGV.length != 4
         | 
| 165 | 
            +
              puts "Usage: #{$0} <UnicodeData.txt> <DerivedNormalizationProps.txt> <SpecialCasing.txt> <EastAsianWidth.txt>"
         | 
| 166 | 
            +
              exit 0
         | 
| 167 | 
            +
            end
         | 
| 168 | 
            +
             | 
| 84 169 | 
             
            ## scan Composition Exclusions
         | 
| 85 170 | 
             
            exclusion = {}
         | 
| 86 171 | 
             
            open(ARGV[1]) do |f|
         | 
| @@ -123,6 +208,7 @@ end | |
| 123 208 |  | 
| 124 209 | 
             
            ## scan UnicodeData
         | 
| 125 210 | 
             
            udata = {}
         | 
| 211 | 
            +
            range_data = []
         | 
| 126 212 | 
             
            open(ARGV[0]) do |f|
         | 
| 127 213 | 
             
              while l = f.gets
         | 
| 128 214 | 
             
                l.chomp!
         | 
| @@ -135,13 +221,46 @@ open(ARGV[0]) do |f| | |
| 135 221 | 
             
                upcase = hex_or_nil(upcase)
         | 
| 136 222 | 
             
                lowcase = hex_or_nil(lowcase)
         | 
| 137 223 | 
             
                titlecase = hex_or_nil(titlecase)
         | 
| 138 | 
            -
                udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase]
         | 
| 224 | 
            +
                udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase, gencat]
         | 
| 225 | 
            +
                if charname =~ /^<(.*, (First|Last))>$/
         | 
| 226 | 
            +
                  charname = $1.upcase.gsub(/,? /, '_')
         | 
| 227 | 
            +
                  range_data << [charname, code]
         | 
| 228 | 
            +
                end
         | 
| 229 | 
            +
              end
         | 
| 230 | 
            +
            end
         | 
| 231 | 
            +
             | 
| 232 | 
            +
            ## scan EastAsianWidth
         | 
| 233 | 
            +
            ea_width = {}
         | 
| 234 | 
            +
            open(ARGV[3]) do |f|
         | 
| 235 | 
            +
              while l = f.gets
         | 
| 236 | 
            +
                l.chomp!
         | 
| 237 | 
            +
                next if l =~ /^\#/ || l =~ /^$/
         | 
| 238 | 
            +
                l =~ /^(.*)\s+#\s*(.*)$/
         | 
| 239 | 
            +
                l = $1
         | 
| 240 | 
            +
                comment = $2
         | 
| 241 | 
            +
                code,width = l.split(/;/)
         | 
| 242 | 
            +
                if code =~ /\.\./
         | 
| 243 | 
            +
                  start_code, end_code = code.split('..')
         | 
| 244 | 
            +
                  start_code = start_code.hex
         | 
| 245 | 
            +
                  end_code = end_code.hex
         | 
| 246 | 
            +
                  (start_code..end_code).each do |code|
         | 
| 247 | 
            +
                    ea_width[code] = width
         | 
| 248 | 
            +
                  end
         | 
| 249 | 
            +
                  next
         | 
| 250 | 
            +
                end
         | 
| 251 | 
            +
                code = code.hex
         | 
| 252 | 
            +
                ea_width[code] = width
         | 
| 139 253 | 
             
              end
         | 
| 140 254 | 
             
            end
         | 
| 141 255 |  | 
| 142 256 | 
             
            print HEAD
         | 
| 257 | 
            +
            range_data.each do |charname, code|
         | 
| 258 | 
            +
              printf("#define %s\t(0x%04x)\n", charname, code)
         | 
| 259 | 
            +
            end
         | 
| 260 | 
            +
             | 
| 261 | 
            +
            print HEAD1
         | 
| 143 262 | 
             
            udata.sort.each do |code, data|
         | 
| 144 | 
            -
              ccclass, canon, compat, upcase, lowcase, titlecase = data
         | 
| 263 | 
            +
              ccclass, canon, compat, upcase, lowcase, titlecase, gencat = data
         | 
| 145 264 | 
             
              ## Exclusions
         | 
| 146 265 | 
             
              ex = 0
         | 
| 147 266 | 
             
              if exclusion[code]  ## Script-specifics or Post Composition Version
         | 
| @@ -160,10 +279,15 @@ udata.sort.each do |code, data| | |
| 160 279 | 
             
                titlecase = casing[code][1] if casing[code][1]
         | 
| 161 280 | 
             
                upcase = casing[code][2] if casing[code][2]
         | 
| 162 281 | 
             
              end
         | 
| 163 | 
            -
               | 
| 164 | 
            -
             | 
| 282 | 
            +
              width = 'N'
         | 
| 283 | 
            +
              if ea_width[code]
         | 
| 284 | 
            +
                width = ea_width[code]
         | 
| 285 | 
            +
              end
         | 
| 286 | 
            +
             | 
| 287 | 
            +
              printf("  { 0x%04x, %s, %s, %s, %s, %s, %d, %d, c_%s, w_%s }, \n",
         | 
| 288 | 
            +
                     code, printstr(canon),
         | 
| 165 289 | 
             
                     printstr(compat), printstr(upcase), printstr(lowcase),
         | 
| 166 | 
            -
                     printstr(titlecase))
         | 
| 290 | 
            +
                     printstr(titlecase), ccclass, ex, gencat, width)
         | 
| 167 291 | 
             
            end
         | 
| 168 | 
            -
            printf("  { -1, 0, 0, NULL, NULL, NULL, NULL, NULL }\n")
         | 
| 292 | 
            +
            printf("  { -1, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0 }\n")
         | 
| 169 293 | 
             
            print TAIL
         | 
    
        data/unicode.gemspec
    CHANGED
    
    | @@ -2,11 +2,11 @@ | |
| 2 2 |  | 
| 3 3 | 
             
            Gem::Specification.new do |s|
         | 
| 4 4 | 
             
              s.name = %q{unicode}
         | 
| 5 | 
            -
              s.version = "0.4.2"
         | 
| 5 | 
            +
              s.version = "0.4.3"
         | 
| 6 6 |  | 
| 7 7 | 
             
              s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
         | 
| 8 8 | 
             
              s.authors = [%q{Yoshida Masato}]
         | 
| 9 | 
            -
              s.date = %q{ | 
| 9 | 
            +
              s.date = %q{2012-08-07}
         | 
| 10 10 | 
             
              s.email = %q{yoshidam@yoshidam.net}
         | 
| 11 11 | 
             
              s.extensions = [%q{ext/unicode/extconf.rb}]
         | 
| 12 12 | 
             
              s.extra_rdoc_files = [%q{README}]
         | 
    
        metadata
    CHANGED
    
    | @@ -1,13 +1,13 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: unicode
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
              hash:  | 
| 4 | 
            +
              hash: 9
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
              segments: 
         | 
| 7 7 | 
             
              - 0
         | 
| 8 8 | 
             
              - 4
         | 
| 9 | 
            -
              - 2
         | 
| 10 | 
            -
              version: 0.4.2
         | 
| 9 | 
            +
              - 3
         | 
| 10 | 
            +
              version: 0.4.3
         | 
| 11 11 | 
             
            platform: x86-mswin32-60
         | 
| 12 12 | 
             
            authors: 
         | 
| 13 13 | 
             
            - Yoshida Masato
         | 
| @@ -15,7 +15,7 @@ autorequire: | |
| 15 15 | 
             
            bindir: bin
         | 
| 16 16 | 
             
            cert_chain: []
         | 
| 17 17 |  | 
| 18 | 
            -
            date:  | 
| 18 | 
            +
            date: 2012-08-07 00:00:00 Z
         | 
| 19 19 | 
             
            dependencies: []
         | 
| 20 20 |  | 
| 21 21 | 
             
            description: Unicode normalization library.
         | 
| @@ -73,7 +73,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 73 73 | 
             
            requirements: []
         | 
| 74 74 |  | 
| 75 75 | 
             
            rubyforge_project: 
         | 
| 76 | 
            -
            rubygems_version: 1.8. | 
| 76 | 
            +
            rubygems_version: 1.8.24
         | 
| 77 77 | 
             
            signing_key: 
         | 
| 78 78 | 
             
            specification_version: 3
         | 
| 79 79 | 
             
            summary: Unicode normalization library.
         |