RubyGems - unicode - Versions diffs - 0.4.2-x86-mingw32 → 0.4.3-x86-mingw32 - Mend

unicode 0.4.2-x86-mingw32 → 0.4.3-x86-mingw32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

data/README +29 -7
data/ext/unicode/unicode.c +379 -16
data/ext/unicode/unidata.map +24536 -24435
data/ext/unicode/wstring.c +69 -1
data/ext/unicode/wstring.h +2 -0
data/lib/unicode/1.8/unicode_native.so +0 -0
data/lib/unicode/1.9/unicode_native.so +0 -0
data/tools/README +3 -2
data/tools/mkunidata.rb +136 -12
data/unicode.gemspec +2 -2
metadata +5 -5

data/ext/unicode/wstring.c CHANGED Viewed

@@ -43,7 +43,10 @@ WStr_free(WString* str)
 {
   str->size = 0;
   str->len = 0;
-  free(str->str);
+  if (str->str) {
+    free(str->str);
+    str->str = NULL;
+  }
 }
 int
@@ -164,6 +167,59 @@ WStr_allocWithUTF8(WString* s, const char* in)
   return s;
 }
+WString*
+WStr_allocWithUTF8L(WString* s, const char* in, int len)
+{
+  int i;
+  int u = 0;
+  int rest = 0;
+  WStr_alloc(s);
+  if (in == NULL)
+    return s;
+  for (i = 0; i < len; i++) {
+    unsigned char c = in[i];
+    if ((c & 0xc0) == 0x80) {
+      if (rest == 0)
+	return NULL;
+      u = (u << 6) | (c & 63);
+      rest--;
+      if (rest == 0) {
+	WStr_addWChar(s, u);
+      }
+    }
+    else if ((c & 0x80) == 0) {      /* 0b0nnnnnnn (7bit) */
+      WStr_addWChar(s, c);
+      rest = 0;
+    }
+    else if ((c & 0xe0) == 0xc0) {      /* 0b110nnnnn (11bit) */
+      rest = 1;
+      u = c & 31;
+    }
+    else if ((c & 0xf0) == 0xe0) {      /* 0b1110nnnn (16bit) */
+      rest = 2;
+      u = c & 15;
+    }
+    else if ((c & 0xf8) == 0xf0) {      /* 0b11110nnn (21bit) */
+      rest = 3;
+      u = c & 7;
+    }
+    else if ((c & 0xfc) == 0xf8) {      /* 0b111110nn (26bit) */
+      rest = 4;
+      u = c & 3;
+    }
+    else if ((c & 0xfe) == 0xfc) {      /* 0b1111110n (31bit) */
+      rest = 5;
+      u = c & 1;
+    }
+    else {
+      return NULL;
+    }
+  }
+  return s;
+}
 UString*
 WStr_convertIntoUString(WString* wstr, UString* ustr)
 {
@@ -176,6 +232,18 @@ WStr_convertIntoUString(WString* wstr, UString* ustr)
   return ustr;
 }
+UString*
+WStr_convertIntoUString2(WString* wstr, int start, int len, UString* ustr)
+{
+  int i;
+  for (i = start; i < wstr->len && i < start + len; i++) {
+    UniStr_addWChar(ustr, wstr->str[i]);
+  }
+  return ustr;
+}
 void
 WStr_dump(WString* s)
 {

data/ext/unicode/wstring.h CHANGED Viewed

@@ -24,6 +24,7 @@ typedef struct _WString {
 WString* WStr_alloc(WString* str);
 WString* WStr_allocWithUTF8(WString* s, const char* u);
+WString* WStr_allocWithUTF8L(WString* s, const char* u, int len);
 WString* WStr_enlarge(WString* str, int size);
 void WStr_free(WString* str);
 int WStr_addWChars(WString* s, const int* a, int len);
@@ -32,6 +33,7 @@ int WStr_pushWString(WString* s, const WString* add);
 int WStr_addWChar2(WString* s, int a1, int a2);
 int WStr_addWChar3(WString* s, int a1, int a2, int a3);
 UString* WStr_convertIntoUString(WString* wstr, UString* ustr);
+UString* WStr_convertIntoUString2(WString* wstr, int start, int len, UString* ustr);
 void WStr_dump(WString* s);
 #ifdef __cplusplus

data/lib/unicode/1.8/unicode_native.so CHANGED Viewed

Binary file

data/lib/unicode/1.9/unicode_native.so CHANGED Viewed

Binary file

data/tools/README CHANGED Viewed

@@ -1,6 +1,7 @@
 The bundled unidata.map is created from UnicodeData.txt,
-DerivedNormalizationProps.txt and SpecialCasing.txt of Unicode 6.0.
+DerivedNormalizationProps.txt, SpecialCasing.txt and EastAsianWidth.txt
+of Unicode 6.0.
 To update unidata.map,
-  ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt > unidata.map
+  ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt EastAsianWidth.txt  > unidata.map

data/tools/mkunidata.rb CHANGED Viewed

@@ -7,22 +7,102 @@
 HEAD=<<EOS
 /*
  * UnicodeData
- * Copyright 1999, 2004, 2010 by yoshidam
+ * Copyright 1999, 2004, 2010, 2012 by yoshidam
  *
  */
 #ifndef _UNIDATA_MAP
 #define _UNIDATA_MAP
+EOS
+HEAD1=<<EOS
+enum GeneralCategory {
+  /* Letter */
+  c_Lu = 1, c_Ll, c_Lt, c_LC, c_Lm, c_Lo,
+  /* Mark */
+  c_Mn, c_Mc, c_Me,
+  /* Number */
+  c_Nd, c_Nl, c_No,
+  /* Punctuation */
+  c_Pc, c_Pd, c_Ps, c_Pe, c_Pi, c_Pf, c_Po,
+  /* Symbol */
+  c_Sm, c_Sc, c_Sk, c_So,
+  /* Separator */
+  c_Zs, c_Zl, c_Zp,
+  /* Other */
+  c_Cc, c_Cf, c_Cs, c_Co, c_Cn
+};
+const char* const gencat_abbr[] = {
+  "", /* 0 */
+  /* Letter */
+  "Lu", "Ll", "Lt", "LC", "Lm", "Lo",
+  /* Mark */
+  "Mn", "Mc", "Me",
+  /* Number */
+  "Nd", "Nl", "No",
+  /* Punctuation */
+  "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
+  /* Symbol */
+  "Sm", "Sc", "Sk", "So",
+  /* Separator */
+  "Zs", "Zl", "Zp",
+  /* Other */
+  "Cc", "Cf", "Cs", "Co", "Cn"
+};
+const char* const gencat_long[] = {
+  "",
+  "Uppercase_Letter",
+  "Lowercase_Letter",
+  "Titlecase_Letter",
+  "Cased_Letter",
+  "Modifier_Letter",
+  "Other_Letter",
+  "Nonspacing_Mark",
+  "Spacing_Mark",
+  "Enclosing_Mark",
+  "Decimal_Number",
+  "Letter_Number",
+  "Other_Number",
+  "Connector_Punctuation",
+  "Dash_Punctuation",
+  "Open_Punctuation",
+  "Close_Punctuation",
+  "Initial_Punctuation",
+  "Final_Punctuation",
+  "Other_Punctuation",
+  "Math_Symbol",
+  "Currency_Symbol",
+  "Modifier_Symbol",
+  "Other_Symbol",
+  "Space_Separator",
+  "Line_Separator",
+  "Paragraph_Separator",
+  "Control",
+  "Format",
+  "Surrogate",
+  "Private_Use",
+  "Unassigned"
+};
+enum EastAsianWidth {
+  w_N = 1, w_A, w_H, w_W, w_F, w_Na
+};
 struct unicode_data {
   const int code;
-  const int combining_class;
-  const int exclusion;
   const char* const canon;
   const char* const compat;
-  const char* uppercase;
-  const char* lowercase;
-  const char* titlecase;
+  const char* const uppercase;
+  const char* const lowercase;
+  const char* const titlecase;
+  const unsigned char combining_class;
+  const unsigned char exclusion;
+  const unsigned char general_category;
+  const unsigned char east_asian_width;
 };
 static const struct unicode_data unidata[] = {
@@ -81,6 +161,11 @@ def printstr(str)
   return '"' + ret + '"'
 end
+if ARGV.length != 4
+  puts "Usage: #{$0} <UnicodeData.txt> <DerivedNormalizationProps.txt> <SpecialCasing.txt> <EastAsianWidth.txt>"
+  exit 0
+end
 ## scan Composition Exclusions
 exclusion = {}
 open(ARGV[1]) do |f|
@@ -123,6 +208,7 @@ end
 ## scan UnicodeData
 udata = {}
+range_data = []
 open(ARGV[0]) do |f|
   while l = f.gets
     l.chomp!
@@ -135,13 +221,46 @@ open(ARGV[0]) do |f|
     upcase = hex_or_nil(upcase)
     lowcase = hex_or_nil(lowcase)
     titlecase = hex_or_nil(titlecase)
-    udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase]
+    udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase, gencat]
+    if charname =~ /^<(.*, (First|Last))>$/
+      charname = $1.upcase.gsub(/,? /, '_')
+      range_data << [charname, code]
+    end
+  end
+end
+## scan EastAsianWidth
+ea_width = {}
+open(ARGV[3]) do |f|
+  while l = f.gets
+    l.chomp!
+    next if l =~ /^\#/ || l =~ /^$/
+    l =~ /^(.*)\s+#\s*(.*)$/
+    l = $1
+    comment = $2
+    code,width = l.split(/;/)
+    if code =~ /\.\./
+      start_code, end_code = code.split('..')
+      start_code = start_code.hex
+      end_code = end_code.hex
+      (start_code..end_code).each do |code|
+        ea_width[code] = width
+      end
+      next
+    end
+    code = code.hex
+    ea_width[code] = width
   end
 end
 print HEAD
+range_data.each do |charname, code|
+  printf("#define %s\t(0x%04x)\n", charname, code)
+end
+print HEAD1
 udata.sort.each do |code, data|
-  ccclass, canon, compat, upcase, lowcase, titlecase = data
+  ccclass, canon, compat, upcase, lowcase, titlecase, gencat = data
   ## Exclusions
   ex = 0
   if exclusion[code]  ## Script-specifics or Post Composition Version
@@ -160,10 +279,15 @@ udata.sort.each do |code, data|
     titlecase = casing[code][1] if casing[code][1]
     upcase = casing[code][2] if casing[code][2]
   end
-  printf("  { 0x%04x, %d, %d, %s, %s, %s, %s, %s }, \n",
-         code, ccclass, ex, printstr(canon),
+  width = 'N'
+  if ea_width[code]
+    width = ea_width[code]
+  end
+  printf("  { 0x%04x, %s, %s, %s, %s, %s, %d, %d, c_%s, w_%s }, \n",
+         code, printstr(canon),
          printstr(compat), printstr(upcase), printstr(lowcase),
-         printstr(titlecase))
+         printstr(titlecase), ccclass, ex, gencat, width)
 end
-printf("  { -1, 0, 0, NULL, NULL, NULL, NULL, NULL }\n")
+printf("  { -1, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0 }\n")
 print TAIL

data/unicode.gemspec CHANGED Viewed

@@ -2,11 +2,11 @@
 Gem::Specification.new do |s|
   s.name = %q{unicode}
-  s.version = "0.4.2"
+  s.version = "0.4.3"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = [%q{Yoshida Masato}]
-  s.date = %q{2011-02-03}
+  s.date = %q{2012-08-07}
   s.email = %q{yoshidam@yoshidam.net}
   s.extensions = [%q{ext/unicode/extconf.rb}]
   s.extra_rdoc_files = [%q{README}]

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: unicode
 version: !ruby/object:Gem::Version
-  hash: 11
+  hash: 9
   prerelease:
   segments:
   - 0
   - 4
-  - 2
-  version: 0.4.2
+  - 3
+  version: 0.4.3
 platform: x86-mingw32
 authors:
 - Yoshida Masato
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-02-03 00:00:00 Z
+date: 2012-08-07 00:00:00 Z
 dependencies: []
 description: Unicode normalization library.
@@ -73,7 +73,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.17
+rubygems_version: 1.8.24
 signing_key:
 specification_version: 3
 summary: Unicode normalization library.