RubyGems - unicode - Versions diffs - 0.3.1-x86-mingw32 - Mend

unicode 0.3.1-x86-mingw32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

data/README +113 -0
data/Rakefile +16 -0
data/ext/unicode/extconf.rb +3 -0
data/ext/unicode/unicode.c +789 -0
data/ext/unicode/unidata.map +21854 -0
data/ext/unicode/ustring.c +208 -0
data/ext/unicode/ustring.h +48 -0
data/ext/unicode/wstring.c +189 -0
data/ext/unicode/wstring.h +41 -0
data/lib/unicode.rb +6 -0
data/lib/unicode/unicode_native.so +0 -0
data/test/test.rb +69 -0
data/tools/README +6 -0
data/tools/mkunidata.rb +169 -0
data/unicode.gemspec +13 -0
metadata +81 -0

data/lib/unicode.rb ADDED Viewed

@@ -0,0 +1,6 @@
+begin
+  RUBY_VERSION =~ /(\d+.\d+)/
+  require "unicode/#{$1}/unicode_native"
+rescue LoadError
+  require 'unicode/unicode_native'
+end

data/lib/unicode/unicode_native.so ADDED Viewed

Binary file

data/test/test.rb ADDED Viewed

@@ -0,0 +1,69 @@
+#! /usr/local/bin/ruby -KU
+# -*- coding: utf-8 -*-
+require 'unicode'
+## dump Unicode string
+class String
+  def udump
+    ustr = self.unpack("U*")
+    ret = []
+    ustr.each do |e|
+      if e.is_a?(Integer)
+        ret << "U+%04X" % e
+      else
+        ret << e
+      end
+    end
+    ret
+  end
+end
+## Demonstration script: each section prints a heading and then the result of
+## calling Unicode module functions, for manual inspection. Nothing is asserted.
+print "Canonical decomposition vs compatibility decomposition\n"
+## "\xef\xac\x83" is the UTF-8 byte sequence of U+FB03.
+p Unicode::decompose("⑽ o\xef\xac\x83ce").udump
+p Unicode::decompose_compat("⑽ o\xef\xac\x83ce")
+print "Canonical equivalent vs Compatibility equivalent\n"
+## Compares a two-character half-width kana sequence with a single full-width
+## kana, in both argument orders, then with the compatibility comparison.
+p Unicode::strcmp("ｶﾞ", "ガ")
+p Unicode::strcmp("ガ", "ｶﾞ")
+p Unicode::strcmp_compat("ｶﾞ", "ガ")
+print "Decomposition/composition\n"
+## The same four inputs are fed to normalize_D and then normalize_C:
+## U+0063 followed by U+0301 and U+0327 in both orders, and the single code
+## points U+0107 / U+00E7 each followed by the other mark.
+p Unicode::normalize_D([0x63, 0x301, 0x327].pack("U*")).udump
+p Unicode::normalize_D([0x63, 0x327, 0x301].pack("U*")).udump
+p Unicode::normalize_D([0x107, 0x327].pack("U*")).udump
+p Unicode::normalize_D([0xe7, 0x301].pack("U*")).udump
+p Unicode::normalize_C([0x63, 0x301, 0x327].pack("U*")).udump
+p Unicode::normalize_C([0x63, 0x327, 0x301].pack("U*")).udump
+p Unicode::normalize_C([0x107, 0x327].pack("U*")).udump
+p Unicode::normalize_C([0xe7, 0x301].pack("U*")).udump
+print "Kana Normalization\n"
+## One mixed half-width/full-width kana string through all four forms.
+p Unicode::normalize_D("ｶﾞガ").udump
+p Unicode::normalize_C("ｶﾞガ").udump
+p Unicode::normalize_KD("ｶﾞガ").udump
+p Unicode::normalize_KC("ｶﾞガ").udump
+print "Hangul\n"
+## The raw string first, then its D and C normalizations.
+p "요시담".udump
+p Unicode::normalize_D("요시담").udump
+p Unicode::normalize_C("요시담").udump
+print "Composition Exclusion\n"
+print "   ANGSTROM SIGN [U+212B]\n"
+p Unicode::normalize_D([0x212b].pack("U")).udump
+p Unicode::normalize_C([0x212b].pack("U")).udump
+print "   LATIN CAPITAL LETTER A WITH RING ABOVE [U+00C5]\n"
+p Unicode::normalize_D([0x00c5].pack("U")).udump
+p Unicode::normalize_C([0x00c5].pack("U")).udump
+print "Case conversion\n"
+## upcase/downcase results are passed through normalize_C before dumping;
+## the capitalize result is dumped directly.
+p Unicode::normalize_C(Unicode::upcase([0x63, 0x301, 0x327, 0xff41].pack("U*"))).udump
+p Unicode::normalize_C(Unicode::downcase([0x43, 0x301, 0x327, 0xff21].pack("U*"))).udump
+p Unicode::capitalize([0x1f1, 0x41, 0x61, 0xff21].pack("U*")).udump
+## Local variables:
+## coding: utf-8
+## End:

data/tools/README ADDED Viewed

@@ -0,0 +1,6 @@
+The unidata.map is created from UnicodeData.txt,
+DerivedNormalizationProps.txt and SpecialCasing.txt of Unicode 5.2.
+To update unidata.map, run:
+  ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt > unidata.map

data/tools/mkunidata.rb ADDED Viewed

@@ -0,0 +1,169 @@
+#! /usr/local/bin/ruby -KU
+#if $KCODE != 'UTF8'
+#  raise "$KCODE must be UTF8"
+#end
+## C source text printed before the table rows: a comment banner, the opening
+## of the _UNIDATA_MAP include guard, the declaration of struct unicode_data
+## and the opening of the unidata[] array initializer.
+HEAD=<<EOS
+/*
+ * UnicodeData
+ * Copyright 1999, 2004, 2010 by yoshidam
+ *
+ */
+#ifndef _UNIDATA_MAP
+#define _UNIDATA_MAP
+struct unicode_data {
+  const int code;
+  const int combining_class;
+  const int exclusion;
+  const char* const canon;
+  const char* const compat;
+  const char* uppercase;
+  const char* lowercase;
+  const char* titlecase;
+};
+static const struct unicode_data unidata[] = {
+EOS
+## C source text printed after the table rows: closes the array initializer
+## and the include guard.
+TAIL=<<EOS
+};
+#endif
+EOS
+def hex2str(hex)
+  if hex.nil? || hex == ''
+    return [nil, nil]
+  end
+  canon = ""
+  compat = ""
+  chars = hex.split(" ")
+  if chars[0] =~ /^[0-9A-F]{4,6}$/
+    chars.each do |c|
+      canon << [c.hex].pack("U")
+    end
+    compat = canon
+  elsif chars[0] =~ /^<.+>$/
+    chars.shift
+    chars.each do |c|
+      compat << [c.hex].pack("U")
+    end
+    canon = nil
+  else
+    raise "unknown value: " + hex
+  end
+  [canon, compat]
+end
+def hex_or_nil(str)
+  return nil if str.nil? || str == ''
+  ret = ""
+  chars = str.split(" ")
+  chars.each do |c|
+    ret << [c.hex].pack("U")
+  end
+  return ret
+end
+def printstr(str)
+  return "NULL" if !str
+  ret = ""
+  str.each_byte do |c|
+    if c >= 32 && c < 127 && c != 34 && c != 92
+      ret << c
+    else
+      ret << format("\\%03o", c)
+    end
+  end
+  return '"' + ret + '"'
+end
+## scan Composition Exclusions
+exclusion = {}
+open(ARGV[1]) do |f|
+  while l = f.gets
+    next if l =~ /^\#/ || l =~ /^$/
+    next if l !~ /Full_Composition_Exclusion/
+    code, = l.split(/\s/)
+    if code =~ /^[0-9A-F]+$/
+      code = code.hex
+      exclusion[code] = true
+    elsif code =~ /^([0-9A-F]+)\.\.([0-9A-F]+)$/
+#      p [$1, $2]
+      scode = $1.hex
+      ecode = $2.hex
+      for code in scode..ecode
+        exclusion[code] = true
+      end
+    end
+  end
+end
+## scan Special Casing
+casing = {}
+open(ARGV[2]) do |f|
+  while l = f.gets
+    l.chomp!
+    next if l =~ /^\#/ || l =~ /^$/
+    l =~ /^(.*)#\s*(.*)$/
+    l = $1
+    comment = $2
+    code,lower,title,upper,cond = l.split(/;\s/)
+    next if cond
+    lower = nil if code == lower
+    title = nil if code == title
+    upper = nil if code == upper
+    code = code.hex
+    casing[code] = [hex_or_nil(lower), hex_or_nil(title), hex_or_nil(upper)]
+  end
+end
+## scan UnicodeData
+udata = {}
+open(ARGV[0]) do |f|
+  while l = f.gets
+    l.chomp!
+    code, charname, gencat, ccclass, bidicat,decomp,
+      dec, digit, num, mirror, uni1_0, comment, upcase,
+      lowcase, titlecase = l.split(";", 15);
+    code = code.hex
+    ccclass = ccclass.to_i
+    canon, compat = hex2str(decomp)
+    upcase = hex_or_nil(upcase)
+    lowcase = hex_or_nil(lowcase)
+    titlecase = hex_or_nil(titlecase)
+    udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase]
+  end
+end
+## Write the generated C table to standard output: HEAD, one initializer row
+## per entry of udata in sorted order, a final row with code -1, then TAIL.
+print HEAD
+udata.sort.each do |code, data|
+  ccclass, canon, compat, upcase, lowcase, titlecase = data
+  ## Exclusions
+  ## ex values: 0 = none, 1 = present in the exclusion table,
+  ## 2 = canonical decomposition is a single character,
+  ## 3 = first character of the decomposition has a non-zero combining class.
+  ex = 0
+  if exclusion[code]  ## Script-specifics or Post Composition Version
+    ex = 1
+  elsif canon =~ /^.$/ ## Singletons
+    ex = 2
+  elsif !canon.nil?
+    starter = canon.unpack("U*")[0]
+    if udata[starter][0] != 0 ## Non-starter decompositions
+      ex = 3
+    end
+  end
+  ## Special Casing
+  ## Non-nil entries from the casing table replace the UnicodeData mappings.
+  if casing[code]
+    lowcase = casing[code][0] if casing[code][0]
+    titlecase = casing[code][1] if casing[code][1]
+    upcase = casing[code][2] if casing[code][2]
+  end
+  printf("  { 0x%04x, %d, %d, %s, %s, %s, %s, %s }, \n",
+         code, ccclass, ex, printstr(canon),
+         printstr(compat), printstr(upcase), printstr(lowcase),
+         printstr(titlecase))
+end
+## Last row uses code -1 -- presumably the end-of-table marker for the C
+## reader; confirm against unicode.c.
+printf("  { -1, 0, 0, NULL, NULL, NULL, NULL, NULL }\n")
+print TAIL

data/unicode.gemspec ADDED Viewed

@@ -0,0 +1,13 @@
+Gem::Specification.new { |s|
+  s.name             = %q{unicode}
+  s.version          = %q{0.3.1}
+  s.date             = %q{2010-02-26}
+  s.summary          = %q{Unicode normalization library.}
+  s.require_paths    = %w[lib]
+  s.author           = %q{Yoshida Masato}
+  s.email            = %q{yoshidam@yoshidam.net}
+  s.homepage         = %q{http://www.yoshidam.net/Ruby.html#unicode}
+  s.files            = `git ls-files`.split("\n").reject {|f| f =~ /^\./}
+  s.extra_rdoc_files = %w[README]
+  s.extensions       = %w[ext/unicode/extconf.rb]
+}

metadata ADDED Viewed

@@ -0,0 +1,81 @@
+--- !ruby/object:Gem::Specification
+name: unicode
+version: !ruby/object:Gem::Version
+  hash: 17
+  prerelease: false
+  segments:
+  - 0
+  - 3
+  - 1
+  version: 0.3.1
+platform: x86-mingw32
+authors:
+- Yoshida Masato
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2010-02-26 00:00:00 -03:00
+default_executable:
+dependencies: []
+description:
+email: yoshidam@yoshidam.net
+executables: []
+extensions: []
+extra_rdoc_files:
+- README
+files:
+- README
+- Rakefile
+- ext/unicode/extconf.rb
+- ext/unicode/unicode.c
+- ext/unicode/unidata.map
+- ext/unicode/ustring.c
+- ext/unicode/ustring.h
+- ext/unicode/wstring.c
+- ext/unicode/wstring.h
+- lib/unicode.rb
+- test/test.rb
+- tools/README
+- tools/mkunidata.rb
+- unicode.gemspec
+- lib/unicode/unicode_native.so
+has_rdoc: true
+homepage: http://www.yoshidam.net/Ruby.html#unicode
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.7
+signing_key:
+specification_version: 3
+summary: Unicode normalization library.
+test_files: []