RubyGems - unicode - Versions diffs - 0.4.2 → 0.4.3 - Mend

unicode 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

data/README +29 -7
data/tools/README +3 -2
data/tools/mkunidata.rb +136 -12
data/unicode.c +379 -16
data/unidata.map +24536 -24435
data/wstring.c +69 -1
data/wstring.h +2 -0
metadata +20 -38

data/README CHANGED Viewed

@@ -1,5 +1,5 @@
 		   Unicode Library for Ruby
-			Version 0.4.2
+			Version 0.4.3
 		       Yoshida Masato
@@ -7,14 +7,14 @@
 - Introduction
   Unicode string manipulation library for Ruby.
-  This library is based on UTR #15 Unicode Normalization Forms(*1).
+  This library is based on UAX #15 Unicode Normalization Forms(*1).
     *1 <URL:http://www.unicode.org/unicode/reports/tr15/>
 - Install
-  This can work with ruby-1.8 or later. I recommend you to
+  This can work with ruby-1.8.7 or later. I recommend you to
   use ruby-1.9.3 or later.
   Make and install usually.
@@ -79,7 +79,7 @@
     These are aliases of decompose/decompose_compat.
   Unicode::normalize_D_safe(str)  (Unicode::nfd_safe(str))
-    This is an aliase of decompose_safe.
+    This is an alias of decompose_safe.
   Unicode::normalize_C(str) (Unicode::nfc(str))
   Unicode::normalize_KC(str) (Unicode::nfkc(str))
@@ -98,14 +98,35 @@
     The mappings that are used by these functions are not normative
     in UnicodeData.txt.
+  Unicode::categories(str)
+  Unicode::abbr_categories(str)
+    Get an array of general category names of the string.
+    abbr_categories returns abbreviated names.
+    These can be called with a block.
+      Unicode.categories(str) do |category| p category end
+  Unicode::text_elements(str)
+    Get an array of text elements.
+    A text element is a unit that is displayed as a single character.
+    These can be called with a block.
+  Unicode::width(str[, cjk])
+    Estimate the display width on a fixed-pitch text terminal.
+    It is based on Markus Kuhn's mk_wcwidth.
+    If the optional argument 'cjk' is true, East Asian
+    Ambiguous characters are treated as wide characters.
+      Unicode.width("\u03b1") #=> 1
+      Unicode.width("\u03b1", true) #=> 2
 - Bugs
-  UTR #15 suggests that the look up for Normalization Form C
+  UAX #15 suggests that the look up for Normalization Form C
   should not be implemented with a hash of string for better
   performance.
-  Case conversion functions should reflecte UTR #21.
 - Copying
@@ -123,6 +144,7 @@
 - History
+  Aug  8, 2012 version 0.4.3 add categories, text_elements and width
   Feb 29, 2012 version 0.4.2 add decompose_safe
   Feb  3, 2012 version 0.4.1 update unidata.map for Unicode 6.1
   Oct 14, 2010 version 0.4.0 fix the composition algorithm, and support Unicode 6.0

data/tools/README CHANGED Viewed

@@ -1,6 +1,7 @@
 The bundled unidata.map is created from UnicodeData.txt,
-DerivedNormalizationProps.txt and SpecialCasing.txt of Unicode 6.0.
+DerivedNormalizationProps.txt, SpecialCasing.txt and EastAsianWidth.txt
+of Unicode 6.0.
 To update unidata.map,
-  ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt > unidata.map
+  ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt EastAsianWidth.txt  > unidata.map

data/tools/mkunidata.rb CHANGED Viewed

@@ -7,22 +7,102 @@
 HEAD=<<EOS
 /*
  * UnicodeData
- * Copyright 1999, 2004, 2010 by yoshidam
+ * Copyright 1999, 2004, 2010, 2012 by yoshidam
  *
  */
 #ifndef _UNIDATA_MAP
 #define _UNIDATA_MAP
+EOS
+HEAD1=<<EOS
+enum GeneralCategory {
+  /* Letter */
+  c_Lu = 1, c_Ll, c_Lt, c_LC, c_Lm, c_Lo,
+  /* Mark */
+  c_Mn, c_Mc, c_Me,
+  /* Number */
+  c_Nd, c_Nl, c_No,
+  /* Punctuation */
+  c_Pc, c_Pd, c_Ps, c_Pe, c_Pi, c_Pf, c_Po,
+  /* Symbol */
+  c_Sm, c_Sc, c_Sk, c_So,
+  /* Separator */
+  c_Zs, c_Zl, c_Zp,
+  /* Other */
+  c_Cc, c_Cf, c_Cs, c_Co, c_Cn
+};
+const char* const gencat_abbr[] = {
+  "", /* 0 */
+  /* Letter */
+  "Lu", "Ll", "Lt", "LC", "Lm", "Lo",
+  /* Mark */
+  "Mn", "Mc", "Me",
+  /* Number */
+  "Nd", "Nl", "No",
+  /* Punctuation */
+  "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
+  /* Symbol */
+  "Sm", "Sc", "Sk", "So",
+  /* Separator */
+  "Zs", "Zl", "Zp",
+  /* Other */
+  "Cc", "Cf", "Cs", "Co", "Cn"
+};
+const char* const gencat_long[] = {
+  "",
+  "Uppercase_Letter",
+  "Lowercase_Letter",
+  "Titlecase_Letter",
+  "Cased_Letter",
+  "Modifier_Letter",
+  "Other_Letter",
+  "Nonspacing_Mark",
+  "Spacing_Mark",
+  "Enclosing_Mark",
+  "Decimal_Number",
+  "Letter_Number",
+  "Other_Number",
+  "Connector_Punctuation",
+  "Dash_Punctuation",
+  "Open_Punctuation",
+  "Close_Punctuation",
+  "Initial_Punctuation",
+  "Final_Punctuation",
+  "Other_Punctuation",
+  "Math_Symbol",
+  "Currency_Symbol",
+  "Modifier_Symbol",
+  "Other_Symbol",
+  "Space_Separator",
+  "Line_Separator",
+  "Paragraph_Separator",
+  "Control",
+  "Format",
+  "Surrogate",
+  "Private_Use",
+  "Unassigned"
+};
+enum EastAsianWidth {
+  w_N = 1, w_A, w_H, w_W, w_F, w_Na
+};
 struct unicode_data {
   const int code;
-  const int combining_class;
-  const int exclusion;
   const char* const canon;
   const char* const compat;
-  const char* uppercase;
-  const char* lowercase;
-  const char* titlecase;
+  const char* const uppercase;
+  const char* const lowercase;
+  const char* const titlecase;
+  const unsigned char combining_class;
+  const unsigned char exclusion;
+  const unsigned char general_category;
+  const unsigned char east_asian_width;
 };
 static const struct unicode_data unidata[] = {
@@ -81,6 +161,11 @@ def printstr(str)
   return '"' + ret + '"'
 end
+if ARGV.length != 4
+  puts "Usage: #{$0} <UnicodeData.txt> <DerivedNormalizationProps.txt> <SpecialCasing.txt> <EastAsianWidth.txt>"
+  exit 0
+end
 ## scan Composition Exclusions
 exclusion = {}
 open(ARGV[1]) do |f|
@@ -123,6 +208,7 @@ end
 ## scan UnicodeData
 udata = {}
+range_data = []
 open(ARGV[0]) do |f|
   while l = f.gets
     l.chomp!
@@ -135,13 +221,46 @@ open(ARGV[0]) do |f|
     upcase = hex_or_nil(upcase)
     lowcase = hex_or_nil(lowcase)
     titlecase = hex_or_nil(titlecase)
-    udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase]
+    udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase, gencat]
+    if charname =~ /^<(.*, (First|Last))>$/
+      charname = $1.upcase.gsub(/,? /, '_')
+      range_data << [charname, code]
+    end
+  end
+end
+## scan EastAsianWidth
+ea_width = {}
+open(ARGV[3]) do |f|
+  while l = f.gets
+    l.chomp!
+    next if l =~ /^\#/ || l =~ /^$/
+    l =~ /^(.*)\s+#\s*(.*)$/
+    l = $1
+    comment = $2
+    code,width = l.split(/;/)
+    if code =~ /\.\./
+      start_code, end_code = code.split('..')
+      start_code = start_code.hex
+      end_code = end_code.hex
+      (start_code..end_code).each do |code|
+        ea_width[code] = width
+      end
+      next
+    end
+    code = code.hex
+    ea_width[code] = width
   end
 end
 print HEAD
+range_data.each do |charname, code|
+  printf("#define %s\t(0x%04x)\n", charname, code)
+end
+print HEAD1
 udata.sort.each do |code, data|
-  ccclass, canon, compat, upcase, lowcase, titlecase = data
+  ccclass, canon, compat, upcase, lowcase, titlecase, gencat = data
   ## Exclusions
   ex = 0
   if exclusion[code]  ## Script-specifics or Post Composition Version
@@ -160,10 +279,15 @@ udata.sort.each do |code, data|
     titlecase = casing[code][1] if casing[code][1]
     upcase = casing[code][2] if casing[code][2]
   end
-  printf("  { 0x%04x, %d, %d, %s, %s, %s, %s, %s }, \n",
-         code, ccclass, ex, printstr(canon),
+  width = 'N'
+  if ea_width[code]
+    width = ea_width[code]
+  end
+  printf("  { 0x%04x, %s, %s, %s, %s, %s, %d, %d, c_%s, w_%s }, \n",
+         code, printstr(canon),
          printstr(compat), printstr(upcase), printstr(lowcase),
-         printstr(titlecase))
+         printstr(titlecase), ccclass, ex, gencat, width)
 end
-printf("  { -1, 0, 0, NULL, NULL, NULL, NULL, NULL }\n")
+printf("  { -1, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0 }\n")
 print TAIL

data/unicode.c CHANGED Viewed

@@ -1,5 +1,6 @@
 /*
- * Unicode Library version 0.4
+ * Unicode Library version 0.4.3
+ * Aug  8, 2012: version 0.4.3
  * Oct 14, 2010: version 0.4
  * Feb 26, 2010: version 0.3
  * Dec 29, 2009: version 0.2
@@ -7,7 +8,7 @@
  *
  */
-#define UNICODE_VERSION "0.4.2"
+#define UNICODE_VERSION "0.4.3"
 #include "ruby.h"
 #ifdef HAVE_RUBY_IO_H
@@ -54,6 +55,8 @@ taintObject(VALUE src, VALUE obj) {
 static VALUE mUnicode;
 static VALUE unicode_data;
 static VALUE composition_table;
+static VALUE catname_long[c_Cn+1];
+static VALUE catname_abbr[c_Cn+1];
 /* Hangul */
 #define SBASE   (0xac00)
@@ -66,6 +69,86 @@ static VALUE composition_table;
 #define NCOUNT  (VCOUNT * TCOUNT) /* 588 */
 #define SCOUNT  (LCOUNT * NCOUNT) /* 11172 */
+VALUE
+get_unidata(int ucs) {
+  VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
+  if (!NIL_P(ch))
+    return ch;
+#ifdef CJK_IDEOGRAPH_EXTENSION_A_FIRST
+  else if (ucs >= CJK_IDEOGRAPH_EXTENSION_A_FIRST &&
+           ucs <= CJK_IDEOGRAPH_EXTENSION_A_LAST)
+    return rb_hash_aref(unicode_data,
+                        INT2FIX(CJK_IDEOGRAPH_EXTENSION_A_FIRST));
+#endif
+#ifdef CJK_IDEOGRAPH_FIRST
+  else if (ucs >= CJK_IDEOGRAPH_FIRST &&
+           ucs <= CJK_IDEOGRAPH_LAST)
+    return rb_hash_aref(unicode_data,
+                        INT2FIX(CJK_IDEOGRAPH_FIRST));
+#endif
+#ifdef HANGUL_SYLLABLE_FIRST
+  else if (ucs >= HANGUL_SYLLABLE_FIRST &&
+           ucs <= HANGUL_SYLLABLE_LAST)
+    return rb_hash_aref(unicode_data,
+                        INT2FIX(HANGUL_SYLLABLE_FIRST));
+#endif
+#ifdef NON_PRIVATE_USE_HIGH_SURROGATE_FIRST
+  else if (ucs >= NON_PRIVATE_USE_HIGH_SURROGATE_FIRST &&
+           ucs <= NON_PRIVATE_USE_HIGH_SURROGATE_LAST)
+    return rb_hash_aref(unicode_data,
+                        INT2FIX(NON_PRIVATE_USE_HIGH_SURROGATE_FIRST));
+#endif
+#ifdef PRIVATE_USE_HIGH_SURROGATE_FIRST
+  else if (ucs >= PRIVATE_USE_HIGH_SURROGATE_FIRST &&
+           ucs <= PRIVATE_USE_HIGH_SURROGATE_LAST)
+    return rb_hash_aref(unicode_data,
+                        INT2FIX(PRIVATE_USE_HIGH_SURROGATE_FIRST));
+#endif
+#ifdef LOW_SURROGATE_FIRST
+  else if (ucs >= LOW_SURROGATE_FIRST &&
+           ucs <= LOW_SURROGATE_LAST)
+    return rb_hash_aref(unicode_data,
+                        INT2FIX(LOW_SURROGATE_FIRST));
+#endif
+#ifdef PRIVATE_USE_FIRST
+  else if (ucs >= PRIVATE_USE_FIRST &&
+           ucs <= PRIVATE_USE_LAST)
+    return rb_hash_aref(unicode_data,
+                        INT2FIX(PRIVATE_USE_FIRST));
+#endif
+#ifdef CJK_IDEOGRAPH_EXTENSION_B_FIRST
+  else if (ucs >= CJK_IDEOGRAPH_EXTENSION_B_FIRST &&
+           ucs <= CJK_IDEOGRAPH_EXTENSION_B_LAST)
+    return rb_hash_aref(unicode_data,
+                        INT2FIX(CJK_IDEOGRAPH_EXTENSION_B_FIRST));
+#endif
+#ifdef CJK_IDEOGRAPH_EXTENSION_C_FIRST
+  else if (ucs >= CJK_IDEOGRAPH_EXTENSION_C_FIRST &&
+           ucs <= CJK_IDEOGRAPH_EXTENSION_C_LAST)
+    return rb_hash_aref(unicode_data,
+                        INT2FIX(CJK_IDEOGRAPH_EXTENSION_C_FIRST));
+#endif
+#ifdef CJK_IDEOGRAPH_EXTENSION_D_FIRST
+  else if (ucs >= CJK_IDEOGRAPH_EXTENSION_D_FIRST &&
+           ucs <= CJK_IDEOGRAPH_EXTENSION_D_LAST)
+    return rb_hash_aref(unicode_data,
+                        INT2FIX(CJK_IDEOGRAPH_EXTENSION_D_FIRST));
+#endif
+#ifdef PLANE_15_PRIVATE_USE_FIRST
+  else if (ucs >= PLANE_15_PRIVATE_USE_FIRST &&
+           ucs <= PLANE_15_PRIVATE_USE_LAST)
+    return rb_hash_aref(unicode_data,
+                        INT2FIX(PLANE_15_PRIVATE_USE_FIRST));
+#endif
+#ifdef PLANE_16_PRIVATE_USE_FIRST
+  else if (ucs >= PLANE_16_PRIVATE_USE_FIRST &&
+           ucs <= PLANE_16_PRIVATE_USE_LAST)
+    return rb_hash_aref(unicode_data,
+                        INT2FIX(PLANE_16_PRIVATE_USE_FIRST));
+#endif
+  return Qnil;
+}
 static int
 get_cc(int ucs)
 {
@@ -77,6 +160,28 @@ get_cc(int ucs)
   return 0;
 }
+static int
+get_gencat(int ucs)
+{
+  VALUE ch = get_unidata(ucs);
+  if (!NIL_P(ch)) {
+    return unidata[FIX2INT(ch)].general_category;
+  }
+  return c_Cn; /* Unassigned */
+}
+static int
+get_eawidth(int ucs)
+{
+  VALUE ch = get_unidata(ucs);
+  if (!NIL_P(ch)) {
+    return unidata[FIX2INT(ch)].east_asian_width;
+  }
+  return w_N; /* Neutral */
+}
 static const char*
 get_canon(int ucs)
 {
@@ -538,8 +643,8 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
   CONVERT_TO_UTF8(str1);
   CONVERT_TO_UTF8(str2);
 #endif
-  WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
-  WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
+  WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
+  WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
   WStr_alloc(&result1);
   WStr_alloc(&result2);
   decompose_internal(&wstr1, &result1);
@@ -580,8 +685,8 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
   CONVERT_TO_UTF8(str1);
   CONVERT_TO_UTF8(str2);
 #endif
-  WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
-  WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
+  WStr_allocWithUTF8L(&wstr1, RSTRING_PTR(str1), RSTRING_LEN(str1));
+  WStr_allocWithUTF8L(&wstr2, RSTRING_PTR(str2), RSTRING_LEN(str2));
   WStr_alloc(&result1);
   WStr_alloc(&result2);
   decompose_compat_internal(&wstr1, &result1);
@@ -617,7 +722,7 @@ unicode_decompose(VALUE obj, VALUE str)
 #ifdef HAVE_RUBY_ENCODING_H
   CONVERT_TO_UTF8(str);
 #endif
-  WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
+  WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
   WStr_alloc(&result);
   decompose_internal(&ustr, &result);
   WStr_free(&ustr);
@@ -643,7 +748,7 @@ unicode_decompose_safe(VALUE obj, VALUE str)
 #ifdef HAVE_RUBY_ENCODING_H
   CONVERT_TO_UTF8(str);
 #endif
-  WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
+  WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
   WStr_alloc(&result);
   decompose_safe_internal(&ustr, &result);
   WStr_free(&ustr);
@@ -669,7 +774,7 @@ unicode_decompose_compat(VALUE obj, VALUE str)
 #ifdef HAVE_RUBY_ENCODING_H
   CONVERT_TO_UTF8(str);
 #endif
-  WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
+  WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
   WStr_alloc(&result);
   decompose_compat_internal(&ustr, &result);
   WStr_free(&ustr);
@@ -695,7 +800,7 @@ unicode_compose(VALUE obj, VALUE str)
 #ifdef HAVE_RUBY_ENCODING_H
   CONVERT_TO_UTF8(str);
 #endif
-  WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
+  WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
   sort_canonical(&ustr);
   WStr_alloc(&result);
   compose_internal(&ustr, &result);
@@ -722,7 +827,7 @@ unicode_normalize_C(VALUE obj, VALUE str)
 #ifdef HAVE_RUBY_ENCODING_H
   CONVERT_TO_UTF8(str);
 #endif
-  WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
+  WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
   WStr_alloc(&ustr2);
   decompose_internal(&ustr1, &ustr2);
   WStr_free(&ustr1);
@@ -752,7 +857,7 @@ unicode_normalize_safe(VALUE obj, VALUE str)
 #ifdef HAVE_RUBY_ENCODING_H
   CONVERT_TO_UTF8(str);
 #endif
-  WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
+  WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
   WStr_alloc(&ustr2);
   decompose_safe_internal(&ustr1, &ustr2);
   WStr_free(&ustr1);
@@ -782,7 +887,7 @@ unicode_normalize_KC(VALUE obj, VALUE str)
 #ifdef HAVE_RUBY_ENCODING_H
   CONVERT_TO_UTF8(str);
 #endif
-  WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
+  WStr_allocWithUTF8L(&ustr1, RSTRING_PTR(str), RSTRING_LEN(str));
   WStr_alloc(&ustr2);
   decompose_compat_internal(&ustr1, &ustr2);
   WStr_free(&ustr1);
@@ -811,7 +916,7 @@ unicode_upcase(VALUE obj, VALUE str)
 #ifdef HAVE_RUBY_ENCODING_H
   CONVERT_TO_UTF8(str);
 #endif
-  WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
+  WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
   WStr_alloc(&result);
   upcase_internal(&ustr, &result);
   //sort_canonical(&result);
@@ -837,7 +942,7 @@ unicode_downcase(VALUE obj, VALUE str)
 #ifdef HAVE_RUBY_ENCODING_H
   CONVERT_TO_UTF8(str);
 #endif
-  WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
+  WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
   WStr_alloc(&result);
   downcase_internal(&ustr, &result);
   //sort_canonical(&result);
@@ -868,7 +973,7 @@ unicode_capitalize(VALUE obj, VALUE str)
 #ifdef HAVE_RUBY_ENCODING_H
   CONVERT_TO_UTF8(str);
 #endif
-  WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
+  WStr_allocWithUTF8L(&ustr, RSTRING_PTR(str), RSTRING_LEN(str));
   WStr_alloc(&result);
   capitalize_internal(&ustr, &result);
   //sort_canonical(&result);
@@ -882,6 +987,248 @@ unicode_capitalize(VALUE obj, VALUE str)
   return vret;
 }
+typedef struct _get_categories_param {
+  WString* wstr;
+  VALUE str;
+  VALUE* catname;
+} get_categories_param;
+static VALUE
+get_categories_internal(get_categories_param* param)
+{
+  WString* wstr = param->wstr;
+  VALUE str = param->str;
+  VALUE* catname = param->catname;
+  int pos;
+  int block_p = rb_block_given_p();
+  volatile VALUE ret = str;
+  if (!block_p)
+    ret = rb_ary_new();
+  for (pos = 0; pos < wstr->len; pos++) {
+    int gencat = get_gencat(wstr->str[pos]);
+    if (!block_p)
+      rb_ary_push(ret, catname[gencat]);
+    else {
+      rb_yield(catname[gencat]);
+    }
+  }
+  return ret;
+}
+VALUE
+get_categories_ensure(WString* wstr)
+{
+  WStr_free(wstr);
+  return Qnil;
+}
+VALUE
+unicode_get_categories(VALUE obj, VALUE str)
+{
+  WString wstr;
+  get_categories_param param = { &wstr, str, catname_long };
+  Check_Type(str, T_STRING);
+#ifdef HAVE_RUBY_ENCODING_H
+  CONVERT_TO_UTF8(str);
+#endif
+  WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
+  return rb_ensure(get_categories_internal, (VALUE)&param,
+                   get_categories_ensure, (VALUE)&wstr);
+  /* wstr will be freed in get_categories_ensure() */
+}
+VALUE
+unicode_get_abbr_categories(VALUE obj, VALUE str)
+{
+  WString wstr;
+  get_categories_param param = { &wstr, str, catname_abbr };
+  Check_Type(str, T_STRING);
+#ifdef HAVE_RUBY_ENCODING_H
+  CONVERT_TO_UTF8(str);
+#endif
+  WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
+  return rb_ensure(get_categories_internal, (VALUE)&param,
+                   get_categories_ensure, (VALUE)&wstr);
+  /* wstr will be freed in get_categories_ensure() */
+}
+VALUE
+unicode_wcswidth(int argc, VALUE* argv, VALUE obj)
+{
+  WString wstr;
+  int i, count;
+  int width = 0;
+  int cjk_p = 0;
+  VALUE str;
+  VALUE cjk;
+  count = rb_scan_args(argc, argv, "11", &str, &cjk);
+  if (count > 1)
+    cjk_p = RTEST(cjk);
+  Check_Type(str, T_STRING);
+#ifdef HAVE_RUBY_ENCODING_H
+  CONVERT_TO_UTF8(str);
+#endif
+  WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
+  for (i = 0; i <wstr.len; i++) {
+    int c = wstr.str[i];
+    int cat = get_gencat(c);
+    int eaw = get_eawidth(c);
+    if ((c > 0 && c < 32) || (c >= 0x7f && c < 0xa0)) {
+      /* Control Characters */
+      width = -1;
+      break;
+    }
+    else if (c != 0x00ad && /* SOFT HYPHEN */
+             (cat == c_Mn || cat == c_Me || /* Non-spacing Marks */
+              cat == c_Cf || /* Format */
+              c == 0 || /* NUL */
+              (c >= 0x1160 && c <= 0x11ff))) /* HANGUL JUNGSEONG/JONGSEONG */
+      /* zero width */ ;
+    else if (eaw == w_F || eaw == w_W || /* Fullwidth or Wide */
+             (c >= 0x4db6 && c <= 0x4dbf) || /* CJK Reserved */
+             (c >= 0x9fcd && c <= 0x9fff) || /* CJK Reserved */
+             (c >= 0xfa6e && c <= 0xfa6f) || /* CJK Reserved */
+             (c >= 0xfada && c <= 0xfaff) || /* CJK Reserved */
+             (c >= 0x2a6d7 && c <= 0x2a6ff) || /* CJK Reserved */
+             (c >= 0x2b735 && c <= 0x2b73f) || /* CJK Reserved */
+             (c >= 0x2b81e && c <= 0x2f7ff) || /* CJK Reserved */
+             (c >= 0x2fa1e && c <= 0x2fffd) || /* CJK Reserved */
+             (c >= 0x30000 && c <= 0x3fffd) || /* CJK Reserved */
+             (cjk_p && eaw == w_A)) /* East Asian Ambiguous */
+      width += 2;
+    else
+      width++; /* Halfwidth or Neutral */
+  }
+  WStr_free(&wstr);
+  return INT2FIX(width);
+}
+VALUE
+wstring_to_rstring(WString* wstr, int start, int len) {
+  UString ret;
+  volatile VALUE vret;
+  UniStr_alloc(&ret);
+  WStr_convertIntoUString2(wstr, start, len, &ret);
+  vret = ENC_(rb_str_new((char*)ret.str, ret.len));
+  UniStr_free(&ret);
+  return vret;
+}
+typedef struct _get_text_elements_param {
+  WString* wstr;
+  VALUE str;
+} get_text_elements_param;
+VALUE
+get_text_elements_internal(get_text_elements_param* param)
+{
+  WString* wstr = param->wstr;
+  VALUE str = param->str;
+  int start_pos;
+  int block_p = rb_block_given_p();
+  volatile VALUE ret = str;
+  if (!block_p)
+    ret = rb_ary_new();
+  for (start_pos = 0; start_pos < wstr->len;) {
+    int c0 = wstr->str[start_pos];
+    int cat = get_gencat(c0);
+    int length = 1;
+    int j;
+    if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
+      volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
+      if (!block_p)
+        rb_ary_push(ret, rstr);
+      else
+        rb_yield(rstr);
+      start_pos++;
+      continue;
+    }
+    for (j = start_pos + 1; j < wstr->len; j++) {
+      int c1 = wstr->str[j];
+      int cat = get_gencat(c1);
+      if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
+          j + 1 < wstr->len &&
+          c1 >= VBASE && c1 < VBASE + VCOUNT &&
+          wstr->str[j+1] >= TBASE && wstr->str[j+1] < TBASE + TCOUNT) {
+        /* Hangul L+V+T */
+        length += 2;
+        j++;
+      }
+      else if (c0 >= LBASE && c0 < LBASE + LCOUNT &&
+               c1 >= VBASE && c1< VBASE + VCOUNT) {
+        /* Hangul L+V */
+        length++;
+      }
+      else if (c0 >= SBASE && c0 < SBASE + SCOUNT &&
+               (c0 - SBASE) % TCOUNT == 0 &&
+               c1 >= TBASE && c1 < TBASE + TCOUNT) {
+        /* Hangul LV+T */
+        length++;
+      }
+      else if (cat == c_Mn || cat == c_Mc || cat == c_Me) {
+        /* Mark */
+        length++;
+      }
+      else {
+        volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
+        if (!block_p)
+          rb_ary_push(ret, rstr);
+        else
+          rb_yield(rstr);
+        length = 0;
+        break;
+      }
+    }
+    if (length > 0) {
+      volatile VALUE rstr = TO_(str, wstring_to_rstring(wstr, start_pos, length));
+      if (!block_p)
+        rb_ary_push(ret, rstr);
+      else
+        rb_yield(rstr);
+    }
+    start_pos = j;
+  }
+  return ret;
+}
+VALUE
+get_text_elements_ensure(WString* wstr)
+{
+  WStr_free(wstr);
+  return Qnil;
+}
+VALUE
+unicode_get_text_elements(VALUE obj, VALUE str)
+{
+  WString wstr;
+  get_text_elements_param param = { &wstr, str };
+  Check_Type(str, T_STRING);
+#ifdef HAVE_RUBY_ENCODING_H
+  CONVERT_TO_UTF8(str);
+#endif
+  WStr_allocWithUTF8L(&wstr, RSTRING_PTR(str), RSTRING_LEN(str));
+  return rb_ensure(get_text_elements_internal, (VALUE)&param,
+                   get_text_elements_ensure, (VALUE)&wstr);
+  /* wstr will be freed in get_text_elements_ensure() */
+}
 void
 Init_unicode()
 {
@@ -909,6 +1256,13 @@ Init_unicode()
     }
   }
+  for (i = 0; i < c_Cn + 1; i++) {
+    catname_abbr[i] = ID2SYM(rb_intern(gencat_abbr[i]));
+    catname_long[i] = ID2SYM(rb_intern(gencat_long[i]));
+    rb_global_variable(&catname_abbr[i]);
+    rb_global_variable(&catname_long[i]);
+  }
   rb_define_module_function(mUnicode, "strcmp",
 			    unicode_strcmp, 2);
   rb_define_module_function(mUnicode, "strcmp_compat",
@@ -957,6 +1311,15 @@ Init_unicode()
   rb_define_module_function(mUnicode, "capitalize",
 			    unicode_capitalize, 1);
+  rb_define_module_function(mUnicode, "categories",
+			    unicode_get_categories, 1);
+  rb_define_module_function(mUnicode, "abbr_categories",
+			    unicode_get_abbr_categories, 1);
+  rb_define_module_function(mUnicode, "width",
+			    unicode_wcswidth, -1);
+  rb_define_module_function(mUnicode, "text_elements",
+			    unicode_get_text_elements, 1);
   rb_define_const(mUnicode, "VERSION",
 		  rb_str_new2(UNICODE_VERSION));
 }