RubyGems - unicode - Versions diffs - 0.1.1 → 0.2.0 - Mend

unicode 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

data/README +16 -11
data/test.rb +9 -8
data/tools/README +2 -2
data/tools/mkunidata.rb +20 -10
data/unicode.c +132 -54
data/unidata.map +12976 -1764
data/ustring.c +27 -25
data/ustring.h +12 -12
data/wstring.c +11 -11
metadata +4 -4

data/README CHANGED

@@ -1,5 +1,5 @@
 		   Unicode Library for Ruby
-			Version 0.1
+			Version 0.2.0
 		       Yoshida Masato
@@ -14,8 +14,8 @@
 - Install
-  This can work with ruby-1.4 or later. I recommend you to
-  use ruby-1.4.2 or later.
+  This can work with ruby-1.8 or later. I recommend you to
+  use ruby-1.8.1 or later.
   Make and install usually.
   For example, when Ruby supports dynamic linking on your OS,
@@ -36,16 +36,16 @@
 - Module Functions
-  All parameters of functions must be UTF-8.
+  All parameters of functions must be UTF-8 strings.
   Unicode::strcmp(str1, str2)
   Unicode::strcmp_compat(str1, str2)
-    Compares Unicode strings with normalization.
-    strcmp uses Normalization Form D, strcmp_compat uses
+    Compare Unicode strings with a normalization.
+    strcmp uses the Normalization Form D, strcmp_compat uses
     Normalization Form KD.
-  Unicode::decopose(str)
-  Unicode::decopose_compat(str)
+  Unicode::decompose(str)
+  Unicode::decompose_compat(str)
     Decompose Unicode string. Then the trailing characters
     are sorted in canonical order.
     decompose uses the canonical decomposition,
@@ -65,12 +65,12 @@
   Unicode::normalize_D(str)
   Unicode::normalize_KD(str)
-    Normalizes Unicode string in form D or form KD.
+    Normalize Unicode string in form D or form KD.
     These are aliases of decompose/decompose_compat.
   Unicode::normalize_C(str)
   Unicode::normalize_KC(str)
-    Normalizes Unicode string in form C or form KC.
+    Normalize Unicode string in form C or form KC.
       normalize_C  = decompose + compose
       normalize_KC = decompose_compat + compose
@@ -78,7 +78,7 @@
   Unicode::downcase(str)
   Unicode::capitalize(str)
     Case conversion functions.
-    The mappings which these functions use are not normative
+    The mappings that are used by these functions are not normative
     in UnicodeData.txt.
 - Bugs
@@ -87,6 +87,8 @@
   should not be implemented with a hash of string for better
   performance.
+  Case conversion functions should reflecte UTR #21.
 - Copying
@@ -104,4 +106,7 @@
 - History
+  Dec 29, 2009 version 0.2.0 update for Ruby 1.9.1 and Unicode 5.2
+  Sep 10, 2005 version 0.1.2 update unidata.map for Unicode 4.1.0
+  Aug 26, 2004 version 0.1.1 update unidata.map for Unicode 4.0.1
   Nov 23, 1999 version 0.1

data/test.rb CHANGED

@@ -1,4 +1,5 @@
 #! /usr/local/bin/ruby -KU
+# -*- coding: utf-8 -*-
 require 'unicode'
@@ -29,12 +30,12 @@ p Unicode::strcmp("ガ", "ｶﾞ")
 p Unicode::strcmp_compat("ｶﾞ", "ガ")
 print "Decomposition/composition\n"
-p Unicode::normalize_D([?c, 0x301, 0x327].pack("U*")).udump
-p Unicode::normalize_D([?c, 0x327, 0x301].pack("U*")).udump
+p Unicode::normalize_D([0x63, 0x301, 0x327].pack("U*")).udump
+p Unicode::normalize_D([0x63, 0x327, 0x301].pack("U*")).udump
 p Unicode::normalize_D([0x107, 0x327].pack("U*")).udump
 p Unicode::normalize_D([0xe7, 0x301].pack("U*")).udump
-p Unicode::normalize_C([?c, 0x301, 0x327].pack("U*")).udump
-p Unicode::normalize_C([?c, 0x327, 0x301].pack("U*")).udump
+p Unicode::normalize_C([0x63, 0x301, 0x327].pack("U*")).udump
+p Unicode::normalize_C([0x63, 0x327, 0x301].pack("U*")).udump
 p Unicode::normalize_C([0x107, 0x327].pack("U*")).udump
 p Unicode::normalize_C([0xe7, 0x301].pack("U*")).udump
@@ -50,7 +51,7 @@ p Unicode::normalize_D("요시담").udump
 p Unicode::normalize_C("요시담").udump
 print "Composition Exclusion\n"
-print "   ANGSTROM SIGN [U+221B]\n"
+print "   ANGSTROM SIGN [U+212B]\n"
 p Unicode::normalize_D([0x212b].pack("U")).udump
 p Unicode::normalize_C([0x212b].pack("U")).udump
 print "   LATIN CAPITAL LETTER A WITH RING ABOVE [U+00C5]\n"
@@ -58,9 +59,9 @@ p Unicode::normalize_D([0x00c5].pack("U")).udump
 p Unicode::normalize_C([0x00c5].pack("U")).udump
 print "Case conversion\n"
-p Unicode::normalize_C(Unicode::upcase([?c, 0x301, 0x327, 0xff41].pack("U*"))).udump
-p Unicode::normalize_C(Unicode::downcase([?C, 0x301, 0x327, 0xff21].pack("U*"))).udump
-p Unicode::capitalize([0x1f1, ?A, ?a, 0xff21].pack("U*")).udump
+p Unicode::normalize_C(Unicode::upcase([0x63, 0x301, 0x327, 0xff41].pack("U*"))).udump
+p Unicode::normalize_C(Unicode::downcase([0x43, 0x301, 0x327, 0xff21].pack("U*"))).udump
+p Unicode::capitalize([0x1f1, 0x41, 0x61, 0xff21].pack("U*")).udump
 ## Local variables:

data/tools/README CHANGED

@@ -1,6 +1,6 @@
 The unidata.map is created from UnicodeData.txt and
-CompositionExclusions.txt of Unicode 3.0.0.
+DerivedNormalizationProps.txt of Unicode 4.1.0
 To update unidata.map,
-  ruby mkunidata.rb UnicodeData.txt CompositionExclusions.txt > unidata.map
+  ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt > unidata.map

data/tools/mkunidata.rb CHANGED

@@ -1,13 +1,13 @@
 #! /usr/local/bin/ruby -KU
-if $KCODE != 'UTF8'
-  raise "$KCODE must be UTF8"
-end
+#if $KCODE != 'UTF8'
+#  raise "$KCODE must be UTF8"
+#end
 HEAD=<<EOS
 /*
  * UnicodeData
- * 1999 by yoshidam
+ * Copyright 1999, 2004 by yoshidam
  *
  */
@@ -25,7 +25,7 @@ struct unicode_data {
   const int titlecase;
 };
-const static struct unicode_data unidata[] = {
+static const struct unicode_data unidata[] = {
 EOS
 TAIL=<<EOS
@@ -41,7 +41,7 @@ def hex2str(hex)
   canon = ""
   compat = ""
   chars = hex.split(" ")
-  if chars[0] =~ /^[0-9A-F]{4}$/
+  if chars[0] =~ /^[0-9A-F]{4,6}$/
     chars.each do |c|
       canon << [c.hex].pack("U")
     end
@@ -59,7 +59,7 @@ def hex2str(hex)
 end
 def hex_or_nil(str)
-  return "-1" if str.nil?
+  return "-1" if str.nil? || str == ''
   return format("0x%04x", str.hex)
 end
@@ -81,9 +81,19 @@ exclusion = {}
 open(ARGV[1]) do |f|
   while l = f.gets
     next if l =~ /^\#/ || l =~ /^$/
+    next if l !~ /Full_Composition_Exclusion/
     code, = l.split(/\s/)
-    code = code.hex
-    exclusion[code] = true
+    if code =~ /^[0-9A-F]+$/
+      code = code.hex
+      exclusion[code] = true
+    elsif code =~ /^([0-9A-F]+)\.\.([0-9A-F]+)$/
+#      p [$1, $2]
+      scode = $1.hex
+      ecode = $2.hex
+      for code in scode..ecode
+        exclusion[code] = true
+      end
+    end
   end
 end
@@ -94,7 +104,7 @@ open(ARGV[0]) do |f|
     l.chomp!
     code, charname, gencat, ccclass, bidicat,decomp,
       dec, digit, num, mirror, uni1_0, comment, upcase,
-      lowcase, titlecase = l.split(";");
+      lowcase, titlecase = l.split(";", 15);
     code = code.hex
     ccclass = ccclass.to_i
     canon, compat = hex2str(decomp)

data/unicode.c CHANGED

@@ -1,15 +1,52 @@
 /*
- * Unicode Library version 0.1
+ * Unicode Library version 0.2
+ * Dec 29, 2009: version 0.2
  * Nov 23, 1999 yoshidam
  *
  */
 #include "ruby.h"
-#include "rubyio.h"
+#ifdef HAVE_RUBY_IO_H
+#  include "ruby/io.h"
+#else
+#  include "rubyio.h"
+#endif
 #include <stdio.h>
 #include "wstring.h"
 #include "unidata.map"
+#ifndef RSTRING_PTR
+#  define RSTRING_PTR(s) (RSTRING(s)->ptr)
+#  define RSTRING_LEN(s) (RSTRING(s)->len)
+#endif
+#ifdef HAVE_RUBY_ENCODING_H
+static rb_encoding* enc_out;
+#  define ENC_(o) (rb_enc_associate(o, enc_out))
+#else
+#  define ENC_(o) (o)
+#endif
+inline static VALUE
+taintObject(VALUE src, VALUE obj) {
+  if (OBJ_TAINTED(src))
+    OBJ_TAINT(obj);
+  return obj;
+}
+#define TO_(src, obj) (taintObject(src, obj))
+#ifdef HAVE_RUBY_ENCODING_H
+#  define CONVERT_TO_UTF8(str) do { \
+    int encindex = ENCODING_GET(str); \
+    volatile VALUE encobj; \
+    if (encindex != rb_utf8_encindex() && \
+        encindex != rb_usascii_encindex()) { \
+      encobj = rb_enc_from_encoding(enc_out); \
+      str = rb_str_encode(str, encobj, 0, Qnil); \
+    } \
+  } while (0)
+#endif
 static VALUE mUnicode;
 static VALUE unicode_data;
 static VALUE composition_table;
@@ -58,7 +95,7 @@ get_compat(int ucs)
   return NULL;
 }
-static const int
+static int
 get_uppercase(int ucs)
 {
   VALUE ch = rb_hash_aref(unicode_data, INT2FIX(ucs));
@@ -252,7 +289,7 @@ decompose_compat_internal(WString* ustr, WString* result)
   } while (0)
 static int
-compose_pair(int c1, int c2)
+compose_pair(unsigned int c1, unsigned int c2)
 {
   int ret;
   char ustr[13]; /* stored two UTF-8 chars */
@@ -370,8 +407,12 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
   Check_Type(str1, T_STRING);
   Check_Type(str2, T_STRING);
-  WStr_allocWithUTF8(&wstr1, RSTRING(str1)->ptr);
-  WStr_allocWithUTF8(&wstr2, RSTRING(str2)->ptr);
+#ifdef HAVE_RUBY_ENCODING_H
+  CONVERT_TO_UTF8(str1);
+  CONVERT_TO_UTF8(str2);
+#endif
+  WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
+  WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
   WStr_alloc(&result1);
   WStr_alloc(&result2);
   decompose_internal(&wstr1, &result1);
@@ -380,17 +421,17 @@ unicode_strcmp(VALUE obj, VALUE str1, VALUE str2)
   WStr_free(&wstr2);
   sort_canonical(&result1);
   sort_canonical(&result2);
-  UStr_alloc(&ustr1);
-  UStr_alloc(&ustr2);
+  UniStr_alloc(&ustr1);
+  UniStr_alloc(&ustr2);
   WStr_convertIntoUString(&result1, &ustr1);
   WStr_convertIntoUString(&result2, &ustr2);
   WStr_free(&result1);
   WStr_free(&result2);
-  UStr_addChar(&ustr1, '\0');
-  UStr_addChar(&ustr2, '\0');
-  ret = strcmp(ustr1.str, ustr2.str);
-  UStr_free(&ustr1);
-  UStr_free(&ustr2);
+  UniStr_addChar(&ustr1, '\0');
+  UniStr_addChar(&ustr2, '\0');
+  ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
+  UniStr_free(&ustr1);
+  UniStr_free(&ustr2);
   return INT2FIX(ret);
 }
@@ -408,8 +449,12 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
   Check_Type(str1, T_STRING);
   Check_Type(str2, T_STRING);
-  WStr_allocWithUTF8(&wstr1, RSTRING(str1)->ptr);
-  WStr_allocWithUTF8(&wstr2, RSTRING(str2)->ptr);
+#ifdef HAVE_RUBY_ENCODING_H
+  CONVERT_TO_UTF8(str1);
+  CONVERT_TO_UTF8(str2);
+#endif
+  WStr_allocWithUTF8(&wstr1, RSTRING_PTR(str1));
+  WStr_allocWithUTF8(&wstr2, RSTRING_PTR(str2));
   WStr_alloc(&result1);
   WStr_alloc(&result2);
   decompose_compat_internal(&wstr1, &result1);
@@ -418,17 +463,17 @@ unicode_strcmp_compat(VALUE obj, VALUE str1, VALUE str2)
   WStr_free(&wstr2);
   sort_canonical(&result1);
   sort_canonical(&result2);
-  UStr_alloc(&ustr1);
-  UStr_alloc(&ustr2);
+  UniStr_alloc(&ustr1);
+  UniStr_alloc(&ustr2);
   WStr_convertIntoUString(&result1, &ustr1);
   WStr_convertIntoUString(&result2, &ustr2);
   WStr_free(&result1);
   WStr_free(&result2);
-  UStr_addChar(&ustr1, '\0');
-  UStr_addChar(&ustr2, '\0');
-  ret = strcmp(ustr1.str, ustr2.str);
-  UStr_free(&ustr1);
-  UStr_free(&ustr2);
+  UniStr_addChar(&ustr1, '\0');
+  UniStr_addChar(&ustr2, '\0');
+  ret = strcmp((char*)ustr1.str, (char*)ustr2.str);
+  UniStr_free(&ustr1);
+  UniStr_free(&ustr2);
   return INT2FIX(ret);
 }
@@ -442,16 +487,19 @@ unicode_decompose(VALUE obj, VALUE str)
   VALUE vret;
   Check_Type(str, T_STRING);
-  WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
+#ifdef HAVE_RUBY_ENCODING_H
+  CONVERT_TO_UTF8(str);
+#endif
+  WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
   WStr_alloc(&result);
   decompose_internal(&ustr, &result);
   WStr_free(&ustr);
   sort_canonical(&result);
-  UStr_alloc(&ret);
+  UniStr_alloc(&ret);
   WStr_convertIntoUString(&result, &ret);
   WStr_free(&result);
-  vret = rb_str_new(ret.str, ret.len);
-  UStr_free(&ret);
+  vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
+  UniStr_free(&ret);
   return vret;
 }
@@ -465,16 +513,19 @@ unicode_decompose_compat(VALUE obj, VALUE str)
   VALUE vret;
   Check_Type(str, T_STRING);
-  WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
+#ifdef HAVE_RUBY_ENCODING_H
+  CONVERT_TO_UTF8(str);
+#endif
+  WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
   WStr_alloc(&result);
   decompose_compat_internal(&ustr, &result);
   WStr_free(&ustr);
   sort_canonical(&result);
-  UStr_alloc(&ret);
+  UniStr_alloc(&ret);
   WStr_convertIntoUString(&result, &ret);
   WStr_free(&result);
-  vret = rb_str_new(ret.str, ret.len);
-  UStr_free(&ret);
+  vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
+  UniStr_free(&ret);
   return vret;
 }
@@ -488,16 +539,19 @@ unicode_compose(VALUE obj, VALUE str)
   VALUE vret;
   Check_Type(str, T_STRING);
-  WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
+#ifdef HAVE_RUBY_ENCODING_H
+  CONVERT_TO_UTF8(str);
+#endif
+  WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
   sort_canonical(&ustr);
   WStr_alloc(&result);
   compose_internal(&ustr, &result);
   WStr_free(&ustr);
-  UStr_alloc(&ret);
+  UniStr_alloc(&ret);
   WStr_convertIntoUString(&result, &ret);
   WStr_free(&result);
-  vret = rb_str_new(ret.str, ret.len);
-  UStr_free(&ret);
+  vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
+  UniStr_free(&ret);
   return vret;
 }
@@ -512,7 +566,10 @@ unicode_normalize_C(VALUE obj, VALUE str)
   VALUE vret;
   Check_Type(str, T_STRING);
-  WStr_allocWithUTF8(&ustr1, RSTRING(str)->ptr);
+#ifdef HAVE_RUBY_ENCODING_H
+  CONVERT_TO_UTF8(str);
+#endif
+  WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
   WStr_alloc(&ustr2);
   decompose_internal(&ustr1, &ustr2);
   WStr_free(&ustr1);
@@ -520,11 +577,11 @@ unicode_normalize_C(VALUE obj, VALUE str)
   WStr_alloc(&result);
   compose_internal(&ustr2, &result);
   WStr_free(&ustr2);
-  UStr_alloc(&ret);
+  UniStr_alloc(&ret);
   WStr_convertIntoUString(&result, &ret);
   WStr_free(&result);
-  vret = rb_str_new(ret.str, ret.len);
-  UStr_free(&ret);
+  vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
+  UniStr_free(&ret);
   return vret;
 }
@@ -539,7 +596,10 @@ unicode_normalize_KC(VALUE obj, VALUE str)
   VALUE vret;
   Check_Type(str, T_STRING);
-  WStr_allocWithUTF8(&ustr1, RSTRING(str)->ptr);
+#ifdef HAVE_RUBY_ENCODING_H
+  CONVERT_TO_UTF8(str);
+#endif
+  WStr_allocWithUTF8(&ustr1, RSTRING_PTR(str));
   WStr_alloc(&ustr2);
   decompose_compat_internal(&ustr1, &ustr2);
   WStr_free(&ustr1);
@@ -547,11 +607,11 @@ unicode_normalize_KC(VALUE obj, VALUE str)
   WStr_alloc(&result);
   compose_internal(&ustr2, &result);
   WStr_free(&ustr2);
-  UStr_alloc(&ret);
+  UniStr_alloc(&ret);
   WStr_convertIntoUString(&result, &ret);
   WStr_free(&result);
-  vret = rb_str_new(ret.str, ret.len);
-  UStr_free(&ret);
+  vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
+  UniStr_free(&ret);
   return vret;
 }
@@ -564,13 +624,16 @@ unicode_upcase(VALUE obj, VALUE str)
   VALUE vret;
   Check_Type(str, T_STRING);
-  WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
+#ifdef HAVE_RUBY_ENCODING_H
+  CONVERT_TO_UTF8(str);
+#endif
+  WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
   upcase_internal(&ustr);
-  UStr_alloc(&ret);
+  UniStr_alloc(&ret);
   WStr_convertIntoUString(&ustr, &ret);
   WStr_free(&ustr);
-  vret = rb_str_new(ret.str, ret.len);
-  UStr_free(&ret);
+  vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
+  UniStr_free(&ret);
   return vret;
 }
@@ -583,17 +646,25 @@ unicode_downcase(VALUE obj, VALUE str)
   VALUE vret;
   Check_Type(str, T_STRING);
-  WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
+#ifdef HAVE_RUBY_ENCODING_H
+  CONVERT_TO_UTF8(str);
+#endif
+  WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
   downcase_internal(&ustr);
-  UStr_alloc(&ret);
+  UniStr_alloc(&ret);
   WStr_convertIntoUString(&ustr, &ret);
   WStr_free(&ustr);
-  vret = rb_str_new(ret.str, ret.len);
-  UStr_free(&ret);
+  vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
+  UniStr_free(&ret);
   return vret;
 }
+#ifdef HAVE_RUBY_ENCODING_H
+#endif
 static VALUE
 unicode_capitalize(VALUE obj, VALUE str)
 {
@@ -602,13 +673,16 @@ unicode_capitalize(VALUE obj, VALUE str)
   VALUE vret;
   Check_Type(str, T_STRING);
-  WStr_allocWithUTF8(&ustr, RSTRING(str)->ptr);
+#ifdef HAVE_RUBY_ENCODING_H
+  CONVERT_TO_UTF8(str);
+#endif
+  WStr_allocWithUTF8(&ustr, RSTRING_PTR(str));
   capitalize_internal(&ustr);
-  UStr_alloc(&ret);
+  UniStr_alloc(&ret);
   WStr_convertIntoUString(&ustr, &ret);
   WStr_free(&ustr);
-  vret = rb_str_new(ret.str, ret.len);
-  UStr_free(&ret);
+  vret = TO_(str, ENC_(rb_str_new((char*)ret.str, ret.len)));
+  UniStr_free(&ret);
   return vret;
 }
@@ -618,6 +692,10 @@ Init_unicode()
 {
   int i;
+#ifdef HAVE_RUBY_ENCODING_H
+  enc_out = rb_utf8_encoding();
+#endif
   mUnicode = rb_define_module("Unicode");
   unicode_data = rb_hash_new();
   composition_table = rb_hash_new();