unicode 0.4.4-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README +156 -0
- data/Rakefile +117 -0
- data/lib/unicode.rb +13 -0
- data/test/test.rb +69 -0
- data/tools/README +7 -0
- data/tools/mkunidata.rb +293 -0
- data/tools/normtest.rb +111 -0
- data/unicode.gemspec +30 -0
- metadata +52 -0
    
        checksums.yaml
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
| 1 | 
            +
            ---
         | 
| 2 | 
            +
            SHA1:
         | 
| 3 | 
            +
              metadata.gz: 314559dfe96e14c8d0f640f1aa8498fb76fd1b23
         | 
| 4 | 
            +
              data.tar.gz: 972889e38f2b9641c5d926b33ee5bd49a3b0df19
         | 
| 5 | 
            +
            SHA512:
         | 
| 6 | 
            +
              metadata.gz: 11aefd96309397148db1cfa440e520775bbc75696eb040375e7ce994781294768f6ed594123f2f131ca90bc215670681472d8aff05e54b8743932f45607160f0
         | 
| 7 | 
            +
              data.tar.gz: db4c1e76b612de731e5868fd737bf53e7bf9a55ce12430e47e08f3bc94c8320363d066b1235a4178cd0406aabebe70edccd600198a4582d2c1d73a96951647ec
         | 
    
        data/README
    ADDED
    
    | @@ -0,0 +1,156 @@ | |
| 1 | 
            +
            		   Unicode Library for Ruby
         | 
| 2 | 
            +
            			Version 0.4.4
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            		       Yoshida Masato
         | 
| 5 | 
            +
             | 
| 6 | 
            +
             | 
| 7 | 
            +
            - Introduction
         | 
| 8 | 
            +
             | 
| 9 | 
            +
              Unicode string manipulation library for Ruby.
         | 
| 10 | 
            +
              This library is based on UAX #15 Unicode Normalization Forms(*1).
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                *1 <URL:http://www.unicode.org/unicode/reports/tr15/>
         | 
| 13 | 
            +
             | 
| 14 | 
            +
             | 
| 15 | 
            +
            - Install
         | 
| 16 | 
            +
             | 
| 17 | 
            +
              This works with ruby-1.8.7 or later. I recommend that you
         | 
| 18 | 
            +
              use ruby-1.9.3 or later.
         | 
| 19 | 
            +
             | 
| 20 | 
            +
              Build and install in the usual way.
         | 
| 21 | 
            +
              For example, when Ruby supports dynamic linking on your OS,
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                ruby extconf.rb
         | 
| 24 | 
            +
                make
         | 
| 25 | 
            +
                make install
         | 
| 26 | 
            +
             | 
| 27 | 
            +
              To install using gem, for example:
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                gem build unicode.gemspec
         | 
| 30 | 
            +
                gem install unicode
         | 
| 31 | 
            +
             | 
| 32 | 
            +
             | 
| 33 | 
            +
            - Usage
         | 
| 34 | 
            +
             | 
| 35 | 
            +
              If you do not link this module with Ruby statically, 
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                require "unicode"
         | 
| 38 | 
            +
             | 
| 39 | 
            +
              before using.
         | 
| 40 | 
            +
             | 
| 41 | 
            +
             | 
| 42 | 
            +
            - Module Functions
         | 
| 43 | 
            +
             | 
| 44 | 
            +
              All parameters of functions must be UTF-8 strings.
         | 
| 45 | 
            +
             | 
| 46 | 
            +
              Unicode::strcmp(str1, str2)
         | 
| 47 | 
            +
              Unicode::strcmp_compat(str1, str2)
         | 
| 48 | 
            +
                Compare Unicode strings with a normalization.
         | 
| 49 | 
            +
                strcmp uses the Normalization Form D, strcmp_compat uses
         | 
| 50 | 
            +
                Normalization Form KD.
         | 
| 51 | 
            +
             | 
| 52 | 
            +
              Unicode::decompose(str)
         | 
| 53 | 
            +
              Unicode::decompose_compat(str)
         | 
| 54 | 
            +
                Decompose Unicode string. Then the trailing characters
         | 
| 55 | 
            +
                are sorted in canonical order.
         | 
| 56 | 
            +
                decompose uses the canonical decomposition,
         | 
| 57 | 
            +
                decompose_compat uses the compatibility decomposition.
         | 
| 58 | 
            +
                The decomposition is based on the character decomposition
         | 
| 59 | 
            +
                mapping in UnicodeData.txt and the Hangul decomposition
         | 
| 60 | 
            +
                algorithm.
         | 
| 61 | 
            +
             | 
| 62 | 
            +
              Unicode::decompose_safe(str)
         | 
| 63 | 
            +
                Decompose Unicode string with a non-standard mapping.
         | 
| 64 | 
            +
                It does not decompose the characters in
         | 
| 65 | 
            +
                CompositionExclusions.txt.
         | 
| 66 | 
            +
             | 
| 67 | 
            +
              Unicode::compose(str)
         | 
| 68 | 
            +
                Compose Unicode string. Before composing, the trailing
         | 
| 69 | 
            +
                characters are sorted in canonical order.
         | 
| 70 | 
            +
                The parameter must be decomposed.
         | 
| 71 | 
            +
                The composition is based on the reverse of the
         | 
| 72 | 
            +
                character decomposition mapping in UnicodeData.txt,
         | 
| 73 | 
            +
                CompositionExclusions.txt and the Hangul composition
         | 
| 74 | 
            +
                algorithm.
         | 
| 75 | 
            +
             | 
| 76 | 
            +
              Unicode::normalize_D(str)  (Unicode::nfd(str))
         | 
| 77 | 
            +
              Unicode::normalize_KD(str)  (Unicode::nfkd(str))
         | 
| 78 | 
            +
                Normalize Unicode string in form D or form KD.
         | 
| 79 | 
            +
                These are aliases of decompose/decompose_compat.
         | 
| 80 | 
            +
             | 
| 81 | 
            +
              Unicode::normalize_D_safe(str)  (Unicode::nfd_safe(str))
         | 
| 82 | 
            +
                This is an alias of decompose_safe.
         | 
| 83 | 
            +
             | 
| 84 | 
            +
              Unicode::normalize_C(str) (Unicode::nfc(str))
         | 
| 85 | 
            +
              Unicode::normalize_KC(str) (Unicode::nfkc(str))
         | 
| 86 | 
            +
                Normalize Unicode string in form C or form KC.
         | 
| 87 | 
            +
                  normalize_C  = decompose + compose
         | 
| 88 | 
            +
                  normalize_KC = decompose_compat + compose
         | 
| 89 | 
            +
             | 
| 90 | 
            +
              Unicode::normalize_C_safe(str) (Unicode::nfc_safe(str))
         | 
| 91 | 
            +
                Normalize Unicode string with decompose_safe.
         | 
| 92 | 
            +
                  normalize_C_safe  = decompose_safe + compose
         | 
| 93 | 
            +
             | 
| 94 | 
            +
              Unicode::upcase(str)
         | 
| 95 | 
            +
              Unicode::downcase(str)
         | 
| 96 | 
            +
              Unicode::capitalize(str)
         | 
| 97 | 
            +
                Case conversion functions.
         | 
| 98 | 
            +
                The mappings that are used by these functions are not normative
         | 
| 99 | 
            +
                in UnicodeData.txt.
         | 
| 100 | 
            +
             | 
| 101 | 
            +
              Unicode::categories(str)
         | 
| 102 | 
            +
              Unicode::abbr_categories(str)
         | 
| 103 | 
            +
                Get an array of general category names of the string.
         | 
| 104 | 
            +
                abbr_categories returns abbreviated names.
         | 
| 105 | 
            +
                These can be called with a block.
         | 
| 106 | 
            +
             | 
| 107 | 
            +
                  Unicode.categories(str) do |category| p category end
         | 
| 108 | 
            +
             | 
| 109 | 
            +
              Unicode::text_elements(str)
         | 
| 110 | 
            +
                Get an array of text elements.
         | 
| 111 | 
            +
                A text element is a unit that is displayed as a single character.
         | 
| 112 | 
            +
                These can be called with a block.
         | 
| 113 | 
            +
             | 
| 114 | 
            +
              Unicode::width(str[, cjk])
         | 
| 115 | 
            +
                Estimate the display width on the fixed pitch text terminal.
         | 
| 116 | 
            +
                It is based on Markus Kuhn's mk_wcwidth.
         | 
| 117 | 
            +
                If the optional argument 'cjk' is true, East Asian
         | 
| 118 | 
            +
                Ambiguous characters are treated as wide characters.
         | 
| 119 | 
            +
             | 
| 120 | 
            +
                  Unicode.width("\u03b1") #=> 1
         | 
| 121 | 
            +
                  Unicode.width("\u03b1", true) #=> 2
         | 
| 122 | 
            +
             | 
| 123 | 
            +
             | 
| 124 | 
            +
            - Bugs
         | 
| 125 | 
            +
             | 
| 126 | 
            +
              UAX #15 suggests that the look up for Normalization Form C
         | 
| 127 | 
            +
              should not be implemented with a hash of string for better
         | 
| 128 | 
            +
              performance.
         | 
| 129 | 
            +
             | 
| 130 | 
            +
             | 
| 131 | 
            +
            - Copying
         | 
| 132 | 
            +
             | 
| 133 | 
            +
              This extension module is copyrighted free software by
         | 
| 134 | 
            +
              Yoshida Masato.
         | 
| 135 | 
            +
             | 
| 136 | 
            +
              You can redistribute it and/or modify it under the same
         | 
| 137 | 
            +
              term as Ruby.
         | 
| 138 | 
            +
             | 
| 139 | 
            +
             | 
| 140 | 
            +
            - Author
         | 
| 141 | 
            +
             | 
| 142 | 
            +
              Yoshida Masato <yoshidam@yoshidam.net>
         | 
| 143 | 
            +
             | 
| 144 | 
            +
             | 
| 145 | 
            +
            - History
         | 
| 146 | 
            +
             | 
| 147 | 
            +
              Feb  7, 2013 version 0.4.4 update unidata.map for Unicode 6.2
         | 
| 148 | 
            +
              Aug  8, 2012 version 0.4.3 add categories, text_elements and width
         | 
| 149 | 
            +
              Feb 29, 2012 version 0.4.2 add decompose_safe
         | 
| 150 | 
            +
              Feb  3, 2012 version 0.4.1 update unidata.map for Unicode 6.1
         | 
| 151 | 
            +
              Oct 14, 2010 version 0.4.0 fix the composition algorithm, and support Unicode 6.0
         | 
| 152 | 
            +
              Feb 26, 2010 version 0.3.0 fix a capitalize bug and support SpecialCasing
         | 
| 153 | 
            +
              Dec 29, 2009 version 0.2.0 update for Ruby 1.9.1 and Unicode 5.2
         | 
| 154 | 
            +
              Sep 10, 2005 version 0.1.2 update unidata.map for Unicode 4.1.0
         | 
| 155 | 
            +
              Aug 26, 2004 version 0.1.1 update unidata.map for Unicode 4.0.1
         | 
| 156 | 
            +
              Nov 23, 1999 version 0.1
         | 
    
        data/Rakefile
    ADDED
    
    | @@ -0,0 +1,117 @@ | |
| 1 | 
            +
            require "rake/clean"
         | 
| 2 | 
            +
            require "rake/extensiontask"
         | 
| 3 | 
            +
            require "rubygems/package_task"
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            CLEAN << "pkg" << "tmp" << "lib/unicode"
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            UPSTREAM_URL = 'http://www.yoshidam.net/unicode-%s.tar.gz'
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            gem_spec = eval(File.read(File.expand_path("../unicode.gemspec", __FILE__)))
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            gem_task = Gem::PackageTask.new(gem_spec) {|pkg|}
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            Rake::ExtensionTask.new('unicode_native', gem_spec) do |ext|
         | 
| 14 | 
            +
              ext.cross_compile = true
         | 
| 15 | 
            +
              ext.cross_platform = ['x86-mingw32', 'x86-mswin32-60']
         | 
| 16 | 
            +
              ext.ext_dir = "ext/unicode"
         | 
| 17 | 
            +
              ext.lib_dir = "lib/unicode"
         | 
| 18 | 
            +
            end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            namespace :gem do
         | 
| 21 | 
            +
             | 
| 22 | 
            +
              desc 'Build all gem files'
         | 
| 23 | 
            +
              task :all => %w[clean gem gem:java gem:windows]
         | 
| 24 | 
            +
             | 
| 25 | 
            +
              java_gem_spec = gem_spec.dup
         | 
| 26 | 
            +
              java_gem_spec.platform = 'java'
         | 
| 27 | 
            +
              java_gem_spec.extensions.clear
         | 
| 28 | 
            +
              java_gem_spec.files.delete_if { |f| f.start_with?('ext/') }
         | 
| 29 | 
            +
             | 
| 30 | 
            +
              directory java_gem_dir = gem_task.package_dir
         | 
| 31 | 
            +
             | 
| 32 | 
            +
              java_gem_file = File.basename(java_gem_spec.cache_file)
         | 
| 33 | 
            +
              java_gem_path = File.join(java_gem_dir, java_gem_file)
         | 
| 34 | 
            +
             | 
| 35 | 
            +
              desc "Build the gem file #{java_gem_file}"
         | 
| 36 | 
            +
              task :java => java_gem_path
         | 
| 37 | 
            +
             | 
| 38 | 
            +
              file java_gem_path => [java_gem_dir] + java_gem_spec.files do
         | 
| 39 | 
            +
                lib_file = 'lib/unicode.rb'
         | 
| 40 | 
            +
                tmp_file = "#{lib_file}.tmp-#{$$}"
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                begin
         | 
| 43 | 
            +
                  mv lib_file, tmp_file
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                  File.write(lib_file, <<-EOT)
         | 
| 46 | 
            +
              module Unicode
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                extend self
         | 
| 49 | 
            +
             | 
| 50 | 
            +
                def upcase(str)
         | 
| 51 | 
            +
                  str.to_java.to_upper_case
         | 
| 52 | 
            +
                end
         | 
| 53 | 
            +
             | 
| 54 | 
            +
                def downcase(str)
         | 
| 55 | 
            +
                  str.to_java.to_lower_case
         | 
| 56 | 
            +
                end
         | 
| 57 | 
            +
             | 
| 58 | 
            +
              end
         | 
| 59 | 
            +
                  EOT
         | 
| 60 | 
            +
             | 
| 61 | 
            +
                  Gem::Package.build(java_gem_spec)
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                  mv java_gem_file, java_gem_dir
         | 
| 64 | 
            +
                ensure
         | 
| 65 | 
            +
                  mv tmp_file, lib_file if File.exist?(tmp_file)
         | 
| 66 | 
            +
                end
         | 
| 67 | 
            +
              end
         | 
| 68 | 
            +
             | 
| 69 | 
            +
              desc "Build native gems for Windows"
         | 
| 70 | 
            +
              task :windows do
         | 
| 71 | 
            +
                ENV["RUBY_CC_VERSION"] = "1.8.7:1.9.3"
         | 
| 72 | 
            +
                sh "rake cross compile"
         | 
| 73 | 
            +
                sh "rake cross native gem"
         | 
| 74 | 
            +
              end
         | 
| 75 | 
            +
             | 
| 76 | 
            +
            end
         | 
| 77 | 
            +
             | 
| 78 | 
            +
            desc "Update from upstream"
         | 
| 79 | 
            +
            task :update, [:version] do |t, args|
         | 
| 80 | 
            +
              require 'zlib'
         | 
| 81 | 
            +
              require 'open-uri'
         | 
| 82 | 
            +
              require 'archive/tar/minitar'
         | 
| 83 | 
            +
             | 
| 84 | 
            +
              unless version = args.version || ENV['UPSTREAM_VERSION']
         | 
| 85 | 
            +
                abort "Please specify UPSTREAM_VERSION. See #{gem_spec.homepage}."
         | 
| 86 | 
            +
              end
         | 
| 87 | 
            +
             | 
| 88 | 
            +
              io = begin
         | 
| 89 | 
            +
                open(url = UPSTREAM_URL % version)
         | 
| 90 | 
            +
              rescue OpenURI::HTTPError
         | 
| 91 | 
            +
                abort "Upstream version not found: #{url}. See #{gem_spec.homepage}."
         | 
| 92 | 
            +
              end
         | 
| 93 | 
            +
             | 
| 94 | 
            +
              Archive::Tar::Minitar.open(Zlib::GzipReader.new(io)) { |tar|
         | 
| 95 | 
            +
                basedir = File.expand_path('..', __FILE__)
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                extract = lambda { |entry, name, dir|
         | 
| 98 | 
            +
                  puts "Extracting `#{name}' to `#{dir || '.'}'..."
         | 
| 99 | 
            +
                  tar.extract_entry(dir ? File.join(basedir, dir) : basedir, entry)
         | 
| 100 | 
            +
                }
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                tar.each { |entry|
         | 
| 103 | 
            +
                  entry.name.sub!(/\Aunicode\//, '')
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                  case name = entry.full_name
         | 
| 106 | 
            +
                    when /\Atools\/|\.gemspec\z/, 'README'
         | 
| 107 | 
            +
                      extract[entry, name, nil]
         | 
| 108 | 
            +
                    when /\.(?:[ch]|map)\z/, 'extconf.rb'
         | 
| 109 | 
            +
                      extract[entry, name, 'ext/unicode']
         | 
| 110 | 
            +
                    when /\Atest/
         | 
| 111 | 
            +
                      extract[entry, name, 'test']
         | 
| 112 | 
            +
                    else
         | 
| 113 | 
            +
                      puts "Skipping `#{name}'..."
         | 
| 114 | 
            +
                  end
         | 
| 115 | 
            +
                }
         | 
| 116 | 
            +
              }
         | 
| 117 | 
            +
            end
         | 
    
        data/lib/unicode.rb
    ADDED
    
    
    
        data/test/test.rb
    ADDED
    
    | @@ -0,0 +1,69 @@ | |
| 1 | 
            +
            #! /usr/local/bin/ruby -KU
         | 
| 2 | 
            +
            # -*- coding: utf-8 -*-
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            require 'unicode'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            ## dump Unicode string
         | 
| 7 | 
            +
            class String
         | 
| 8 | 
            +
              def udump
         | 
| 9 | 
            +
                ustr = self.unpack("U*")
         | 
| 10 | 
            +
                ret = []
         | 
| 11 | 
            +
                ustr.each do |e|
         | 
| 12 | 
            +
                  if e.is_a?(Integer)
         | 
| 13 | 
            +
                    ret << "U+%04X" % e
         | 
| 14 | 
            +
                  else
         | 
| 15 | 
            +
                    ret << e
         | 
| 16 | 
            +
                  end
         | 
| 17 | 
            +
                end
         | 
| 18 | 
            +
                ret
         | 
| 19 | 
            +
              end
         | 
| 20 | 
            +
            end
         | 
| 21 | 
            +
             | 
| 22 | 
            +
             | 
| 23 | 
            +
            print "Canonical decomposition vs compatibility decomposition\n"
         | 
| 24 | 
            +
            p Unicode::decompose("⑽ o\xef\xac\x83ce").udump
         | 
| 25 | 
            +
            p Unicode::decompose_compat("⑽ o\xef\xac\x83ce")
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            print "Canonical equivalent vs Compatibility equivalent\n"
         | 
| 28 | 
            +
            p Unicode::strcmp("ガ", "ガ")
         | 
| 29 | 
            +
            p Unicode::strcmp("ガ", "ガ")
         | 
| 30 | 
            +
            p Unicode::strcmp_compat("ガ", "ガ")
         | 
| 31 | 
            +
             | 
| 32 | 
            +
            print "Decomposition/composition\n"
         | 
| 33 | 
            +
            p Unicode::normalize_D([0x63, 0x301, 0x327].pack("U*")).udump
         | 
| 34 | 
            +
            p Unicode::normalize_D([0x63, 0x327, 0x301].pack("U*")).udump
         | 
| 35 | 
            +
            p Unicode::normalize_D([0x107, 0x327].pack("U*")).udump
         | 
| 36 | 
            +
            p Unicode::normalize_D([0xe7, 0x301].pack("U*")).udump
         | 
| 37 | 
            +
            p Unicode::normalize_C([0x63, 0x301, 0x327].pack("U*")).udump
         | 
| 38 | 
            +
            p Unicode::normalize_C([0x63, 0x327, 0x301].pack("U*")).udump
         | 
| 39 | 
            +
            p Unicode::normalize_C([0x107, 0x327].pack("U*")).udump
         | 
| 40 | 
            +
            p Unicode::normalize_C([0xe7, 0x301].pack("U*")).udump
         | 
| 41 | 
            +
             | 
| 42 | 
            +
            print "Kana Normalization\n"
         | 
| 43 | 
            +
            p Unicode::normalize_D("ガガ").udump
         | 
| 44 | 
            +
            p Unicode::normalize_C("ガガ").udump
         | 
| 45 | 
            +
            p Unicode::normalize_KD("ガガ").udump
         | 
| 46 | 
            +
            p Unicode::normalize_KC("ガガ").udump
         | 
| 47 | 
            +
             | 
| 48 | 
            +
            print "Hangul\n"
         | 
| 49 | 
            +
            p "요시담".udump
         | 
| 50 | 
            +
            p Unicode::normalize_D("요시담").udump
         | 
| 51 | 
            +
            p Unicode::normalize_C("요시담").udump
         | 
| 52 | 
            +
             | 
| 53 | 
            +
            print "Composition Exclusion\n"
         | 
| 54 | 
            +
            print "   ANGSTROM SIGN [U+212B]\n"
         | 
| 55 | 
            +
            p Unicode::normalize_D([0x212b].pack("U")).udump
         | 
| 56 | 
            +
            p Unicode::normalize_C([0x212b].pack("U")).udump
         | 
| 57 | 
            +
            print "   LATIN CAPITAL LETTER A WITH RING ABOVE [U+00C5]\n"
         | 
| 58 | 
            +
            p Unicode::normalize_D([0x00c5].pack("U")).udump
         | 
| 59 | 
            +
            p Unicode::normalize_C([0x00c5].pack("U")).udump
         | 
| 60 | 
            +
             | 
| 61 | 
            +
            print "Case conversion\n"
         | 
| 62 | 
            +
            p Unicode::normalize_C(Unicode::upcase([0x63, 0x301, 0x327, 0xff41].pack("U*"))).udump
         | 
| 63 | 
            +
            p Unicode::normalize_C(Unicode::downcase([0x43, 0x301, 0x327, 0xff21].pack("U*"))).udump
         | 
| 64 | 
            +
            p Unicode::capitalize([0x1f1, 0x41, 0x61, 0xff21].pack("U*")).udump
         | 
| 65 | 
            +
             | 
| 66 | 
            +
             | 
| 67 | 
            +
            ## Local variables:
         | 
| 68 | 
            +
            ## coding: utf-8
         | 
| 69 | 
            +
            ## End:
         | 
    
        data/tools/README
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
| 1 | 
            +
            The bundled unidata.map is created from UnicodeData.txt,
         | 
| 2 | 
            +
            DerivedNormalizationProps.txt, SpecialCasing.txt and EastAsianWidth.txt
         | 
| 3 | 
            +
            of Unicode 6.0.
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            To update unidata.map,
         | 
| 6 | 
            +
             | 
| 7 | 
            +
              ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt EastAsianWidth.txt  > unidata.map
         | 
    
        data/tools/mkunidata.rb
    ADDED
    
    | @@ -0,0 +1,293 @@ | |
| 1 | 
            +
            #! /usr/local/bin/ruby -KU
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            #if $KCODE != 'UTF8'
         | 
| 4 | 
            +
            #  raise "$KCODE must be UTF8"
         | 
| 5 | 
            +
            #end
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            HEAD=<<EOS
         | 
| 8 | 
            +
            /*
         | 
| 9 | 
            +
             * UnicodeData
         | 
| 10 | 
            +
             * Copyright 1999, 2004, 2010, 2012 by yoshidam
         | 
| 11 | 
            +
             *
         | 
| 12 | 
            +
             */
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            #ifndef _UNIDATA_MAP
         | 
| 15 | 
            +
            #define _UNIDATA_MAP
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            EOS
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            HEAD1=<<EOS
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            enum GeneralCategory {
         | 
| 22 | 
            +
              /* Letter */
         | 
| 23 | 
            +
              c_Lu = 1, c_Ll, c_Lt, c_LC, c_Lm, c_Lo,
         | 
| 24 | 
            +
              /* Mark */
         | 
| 25 | 
            +
              c_Mn, c_Mc, c_Me,
         | 
| 26 | 
            +
              /* Number */
         | 
| 27 | 
            +
              c_Nd, c_Nl, c_No,
         | 
| 28 | 
            +
              /* Punctuation */
         | 
| 29 | 
            +
              c_Pc, c_Pd, c_Ps, c_Pe, c_Pi, c_Pf, c_Po,
         | 
| 30 | 
            +
              /* Symbol */
         | 
| 31 | 
            +
              c_Sm, c_Sc, c_Sk, c_So,
         | 
| 32 | 
            +
              /* Separator */
         | 
| 33 | 
            +
              c_Zs, c_Zl, c_Zp,
         | 
| 34 | 
            +
              /* Other */
         | 
| 35 | 
            +
              c_Cc, c_Cf, c_Cs, c_Co, c_Cn
         | 
| 36 | 
            +
            };
         | 
| 37 | 
            +
             | 
| 38 | 
            +
            const char* const gencat_abbr[] = {
         | 
| 39 | 
            +
              "", /* 0 */
         | 
| 40 | 
            +
              /* Letter */
         | 
| 41 | 
            +
              "Lu", "Ll", "Lt", "LC", "Lm", "Lo",
         | 
| 42 | 
            +
              /* Mark */
         | 
| 43 | 
            +
              "Mn", "Mc", "Me",
         | 
| 44 | 
            +
              /* Number */
         | 
| 45 | 
            +
              "Nd", "Nl", "No",
         | 
| 46 | 
            +
              /* Punctuation */
         | 
| 47 | 
            +
              "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
         | 
| 48 | 
            +
              /* Symbol */
         | 
| 49 | 
            +
              "Sm", "Sc", "Sk", "So",
         | 
| 50 | 
            +
              /* Separator */
         | 
| 51 | 
            +
              "Zs", "Zl", "Zp",
         | 
| 52 | 
            +
              /* Other */
         | 
| 53 | 
            +
              "Cc", "Cf", "Cs", "Co", "Cn"
         | 
| 54 | 
            +
            };
         | 
| 55 | 
            +
             | 
| 56 | 
            +
            const char* const gencat_long[] = {
         | 
| 57 | 
            +
              "",
         | 
| 58 | 
            +
              "Uppercase_Letter",
         | 
| 59 | 
            +
              "Lowercase_Letter",
         | 
| 60 | 
            +
              "Titlecase_Letter",
         | 
| 61 | 
            +
              "Cased_Letter",
         | 
| 62 | 
            +
              "Modifier_Letter",
         | 
| 63 | 
            +
              "Other_Letter",
         | 
| 64 | 
            +
              "Nonspacing_Mark",
         | 
| 65 | 
            +
              "Spacing_Mark",
         | 
| 66 | 
            +
              "Enclosing_Mark",
         | 
| 67 | 
            +
              "Decimal_Number",
         | 
| 68 | 
            +
              "Letter_Number",
         | 
| 69 | 
            +
              "Other_Number",
         | 
| 70 | 
            +
              "Connector_Punctuation",
         | 
| 71 | 
            +
              "Dash_Punctuation",
         | 
| 72 | 
            +
              "Open_Punctuation",
         | 
| 73 | 
            +
              "Close_Punctuation",
         | 
| 74 | 
            +
              "Initial_Punctuation",
         | 
| 75 | 
            +
              "Final_Punctuation",
         | 
| 76 | 
            +
              "Other_Punctuation",
         | 
| 77 | 
            +
              "Math_Symbol",
         | 
| 78 | 
            +
              "Currency_Symbol",
         | 
| 79 | 
            +
              "Modifier_Symbol",
         | 
| 80 | 
            +
              "Other_Symbol",
         | 
| 81 | 
            +
              "Space_Separator",
         | 
| 82 | 
            +
              "Line_Separator",
         | 
| 83 | 
            +
              "Paragraph_Separator",
         | 
| 84 | 
            +
              "Control",
         | 
| 85 | 
            +
              "Format",
         | 
| 86 | 
            +
              "Surrogate",
         | 
| 87 | 
            +
              "Private_Use",
         | 
| 88 | 
            +
              "Unassigned"
         | 
| 89 | 
            +
            };
         | 
| 90 | 
            +
             | 
| 91 | 
            +
            enum EastAsianWidth {
         | 
| 92 | 
            +
              w_N = 1, w_A, w_H, w_W, w_F, w_Na
         | 
| 93 | 
            +
            };
         | 
| 94 | 
            +
             | 
| 95 | 
            +
            struct unicode_data {
         | 
| 96 | 
            +
              const int code;
         | 
| 97 | 
            +
              const char* const canon;
         | 
| 98 | 
            +
              const char* const compat;
         | 
| 99 | 
            +
              const char* const uppercase;
         | 
| 100 | 
            +
              const char* const lowercase;
         | 
| 101 | 
            +
              const char* const titlecase;
         | 
| 102 | 
            +
              const unsigned char combining_class;
         | 
| 103 | 
            +
              const unsigned char exclusion;
         | 
| 104 | 
            +
              const unsigned char general_category;
         | 
| 105 | 
            +
              const unsigned char east_asian_width;
         | 
| 106 | 
            +
            };
         | 
| 107 | 
            +
             | 
| 108 | 
            +
            static const struct unicode_data unidata[] = {
         | 
| 109 | 
            +
            EOS
         | 
| 110 | 
            +
             | 
| 111 | 
            +
            TAIL=<<EOS
         | 
| 112 | 
            +
            };
         | 
| 113 | 
            +
             | 
| 114 | 
            +
            #endif
         | 
| 115 | 
            +
            EOS
         | 
| 116 | 
            +
             | 
| 117 | 
            +
            ## Convert a UnicodeData.txt decomposition field into UTF-8 strings.
            ##
            ## hex:: raw field text: space-separated hexadecimal code points,
            ##       optionally preceded by a formatting tag such as "<compat>".
            ##
            ## Returns a pair [canon, compat]:
            ##   [nil, nil]  when the field is nil or empty
            ##   [str, str]  for an untagged (canonical) mapping
            ##   [nil, str]  for a tagged (compatibility) mapping
            ##
            ## Raises RuntimeError "unknown value: ..." for a malformed field.
            ## Every token is validated, not just the first one, because
            ## String#hex quietly returns 0 for text that is not hexadecimal.
            def hex2str(hex)
              return [nil, nil] if hex.nil? || hex == ''

              tokens = hex.split(" ")
              tagged = tokens[0] =~ /\A<.+>\z/ ? true : false
              tokens.shift if tagged   # drop the "<tag>" marker itself

              well_formed = tokens.all? { |t| t =~ /\A[0-9A-F]{4,6}\z/ }
              # An untagged field must carry at least one code point.
              well_formed = false if !tagged && tokens.empty?
              raise "unknown value: " + hex unless well_formed

              decoded = tokens.map { |t| t.hex }.pack("U*")
              tagged ? [nil, decoded] : [decoded, decoded]
            end
         | 
| 140 | 
            +
             | 
| 141 | 
            +
            ## Decode a space-separated list of hexadecimal code points into a
            ## UTF-8 string.  A nil or empty argument yields nil.
            def hex_or_nil(str)
              if str.nil? || str == ''
                nil
              else
                code_points = str.split(" ").map { |token| token.hex }
                code_points.pack("U*")
              end
            end
         | 
| 150 | 
            +
             | 
| 151 | 
            +
            ## Render a string as a C string literal for the generated table.
            ## A nil/false argument becomes the bare token NULL.  Printable
            ## ASCII bytes are copied as-is, except the double quote and the
            ## backslash; every other byte is written as a three-digit octal
            ## escape.
            def printstr(str)
              return "NULL" unless str

              pieces = str.bytes.map do |byte|
                literal = byte >= 32 && byte < 127 && byte != 34 && byte != 92
                literal ? byte.chr : format("\\%03o", byte)
              end
              '"' + pieces.join + '"'
            end
         | 
| 163 | 
            +
             | 
| 164 | 
            +
            ## Exactly four input files are required.  The usage text goes to
            ## stderr and the exit status is non-zero: this script prints the
            ## generated table on stdout, so a usage message there (with a
            ## success status) would pass for valid output.
            if ARGV.length != 4
              $stderr.puts "Usage: #{$0} <UnicodeData.txt> <DerivedNormalizationProps.txt> <SpecialCasing.txt> <EastAsianWidth.txt>"
              exit 1
            end
         | 
| 168 | 
            +
             | 
| 169 | 
            +
            ## scan Composition Exclusions
            ## Builds exclusion: code point => true for every code point (or
            ## range of code points) that the file named by ARGV[1] lists on a
            ## line mentioning Full_Composition_Exclusion.
            exclusion = {}
            # File.open rather than Kernel#open: the argument is always
            # treated as a file path, never as a "|command" pipe.
            File.open(ARGV[1]) do |f|
              f.each_line do |l|
                next if l =~ /^\#/ || l =~ /^$/
                next if l !~ /Full_Composition_Exclusion/
                field, = l.split(/\s/)
                if field =~ /^[0-9A-F]+$/
                  # single code point
                  exclusion[field.hex] = true
                elsif field =~ /^([0-9A-F]+)\.\.([0-9A-F]+)$/
                  # inclusive range "XXXX..YYYY"
                  ($1.hex..$2.hex).each { |cp| exclusion[cp] = true }
                end
              end
            end
         | 
| 189 | 
            +
             | 
| 190 | 
            +
            ## scan Special Casing
            ## Builds casing: code point => [lower, title, upper], each entry a
            ## UTF-8 string or nil when the mapping is the character itself.
            ## Lines carrying a condition field are skipped.
            casing = {}
            # File.open rather than Kernel#open: the argument is always
            # treated as a file path, never as a "|command" pipe.
            File.open(ARGV[2]) do |f|
              f.each_line do |l|
                l.chomp!
                next if l =~ /^\#/ || l =~ /^$/
                # Drop the trailing "# ..." comment.  A line without a comment
                # is kept whole instead of turning into nil.
                l = l.sub(/#.*\z/, '')
                next if l.strip.empty?
                code, lower, title, upper, cond = l.split(/;\s/)
                next if cond
                lower = nil if code == lower
                title = nil if code == title
                upper = nil if code == upper
                casing[code.hex] = [hex_or_nil(lower), hex_or_nil(title), hex_or_nil(upper)]
              end
            end
         | 
| 208 | 
            +
             | 
| 209 | 
            +
            ## scan UnicodeData
            ## Builds:
            ##   udata      code point => [ccclass, canon, compat, upcase,
            ##                             lowcase, titlecase, gencat]
            ##   range_data [macro name, code point] for every "<..., First>" /
            ##              "<..., Last>" range marker, in file order
            udata = {}
            range_data = []
            # File.open rather than Kernel#open: the argument is always
            # treated as a file path, never as a "|command" pipe.
            File.open(ARGV[0]) do |f|
              f.each_line do |l|
                l.chomp!
                next if l.empty?   # a blank line would otherwise fail on nil fields
                # 15 semicolon-separated fields; the underscore-prefixed ones
                # are not used by this tool.
                code, charname, gencat, ccclass, _bidicat, decomp,
                  _dec, _digit, _num, _mirror, _uni1_0, _comment, upcase,
                  lowcase, titlecase = l.split(";", 15)
                code = code.hex
                ccclass = ccclass.to_i
                canon, compat = hex2str(decomp)
                upcase = hex_or_nil(upcase)
                lowcase = hex_or_nil(lowcase)
                titlecase = hex_or_nil(titlecase)
                udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase, gencat]
                if charname =~ /^<(.*, (First|Last))>$/
                  # "<Name Of Range, First>" becomes the macro name NAME_OF_RANGE_FIRST
                  range_data << [$1.upcase.gsub(/,? /, '_'), code]
                end
              end
            end
         | 
| 231 | 
            +
             | 
| 232 | 
            +
            ## scan EastAsianWidth
            ## Builds ea_width: code point => width class name as written in
            ## the file named by ARGV[3].  Ranges "XXXX..YYYY" are expanded.
            ## Fields are stripped, so both "0020;Na" and a padded
            ## "0020  ; Na" layout produce the same key and value.
            ea_width = {}
            # File.open rather than Kernel#open: the argument is always
            # treated as a file path, never as a "|command" pipe.
            File.open(ARGV[3]) do |f|
              f.each_line do |l|
                l.chomp!
                next if l =~ /^\#/ || l =~ /^$/
                # Drop the trailing "# ..." comment; a line without a comment
                # is kept whole instead of turning into nil.
                l = l.sub(/\s*#.*\z/, '')
                next if l.strip.empty?
                code, width = l.split(/;/).map { |field| field.strip }
                if code.nil? || code.empty? || width.nil? || width.empty?
                  raise "unknown value: " + l
                end
                if code =~ /\.\./
                  first, last = code.split('..')
                  (first.hex..last.hex).each { |cp| ea_width[cp] = width }
                else
                  ea_width[code.hex] = width
                end
              end
            end
         | 
| 255 | 
            +
             | 
| 256 | 
            +
            ## Emit the generated header: HEAD, one #define per range marker,
            ## HEAD1, one table row per code point in ascending order, the
            ## terminating sentinel row, then TAIL.
            print HEAD
            range_data.each { |name, value| printf("#define %s\t(0x%04x)\n", name, value) }

            print HEAD1
            udata.sort.each do |code, data|
              ccclass, canon, compat, upcase, lowcase, titlecase, gencat = data

              ## Exclusions
              ##   1: listed in the exclusion table
              ##      (Script-specifics or Post Composition Version)
              ##   2: Singletons (canonical mapping is a single character)
              ##   3: Non-starter decompositions (the first character of the
              ##      mapping has a non-zero combining class)
              ex =
                if exclusion[code]
                  1
                elsif canon =~ /^.$/
                  2
                elsif !canon.nil? && udata[canon.unpack("U*")[0]][0] != 0
                  3
                else
                  0
                end

              ## Special Casing: every mapping present there overrides the
              ## simple mapping read from UnicodeData.
              special = casing[code]
              if special
                lowcase = special[0] || lowcase
                titlecase = special[1] || titlecase
                upcase = special[2] || upcase
              end

              ## Width class defaults to N when the code point is not listed.
              width = ea_width[code] || 'N'

              printf("  { 0x%04x, %s, %s, %s, %s, %s, %d, %d, c_%s, w_%s }, \n",
                     code, printstr(canon),
                     printstr(compat), printstr(upcase), printstr(lowcase),
                     printstr(titlecase), ccclass, ex, gencat, width)
            end
            ## Sentinel row (code == -1) closing the table.
            print "  { -1, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0 }\n"
            print TAIL
         | 
    
        data/tools/normtest.rb
    ADDED
    
    | @@ -0,0 +1,111 @@ | |
| 1 | 
            +
            #! /usr/local/bin/ruby -KU
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            ## Conformance test with NormalizationTest.txt
         | 
| 4 | 
            +
            ## Copyright 2010 yoshidam
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            require 'unicode'
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            TESTFILE = "NormalizationTest.txt"
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            ## Decode a space-separated list of hexadecimal code points
            ## (one column of the test file) into a UTF-8 string.
            def from_hex(str)
              code_points = str.split(" ").map { |token| token.hex }
              code_points.pack("U*")
            end
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            ## Format a UTF-8 string as upper-case hexadecimal code points
            ## (at least four digits each), every one followed by a space.
            def to_hex(str)
              str.unpack('U*').map { |cp| sprintf("%04X ", cp) }.join
            end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            ## Run every test line of TESTFILE through the four normalization
            ## forms and print a "<form> NG" report for each broken invariant.
            ## Nothing is printed for a passing line.  "@Part" lines are
            ## echoed; "#" comment lines are skipped.
            ##
            ## Each test line has five semicolon-separated columns c1..c5.
            ## Invariants checked:
            ##   NFC : c2 == NFC(c1) == NFC(c2) == NFC(c3)
            ##         c4 == NFC(c4) == NFC(c5)
            ##   NFD : c3 == NFD(c1) == NFD(c2) == NFD(c3)
            ##         c5 == NFD(c4) == NFD(c5)
            ##   NFKC: c4 == NFKC(c1) == ... == NFKC(c5)
            ##   NFKD: c5 == NFKD(c1) == ... == NFKD(c5)
            open(TESTFILE) do |f|
              while l = f.gets
                next if l =~ /^#/
                l.chomp!   # in-place; a plain chomp would discard its result
                if l =~ /^@/
                  puts l
                  next
                end
                c1, c2, c3, c4, c5 = l.split(';')
                c1 = from_hex(c1)
                c2 = from_hex(c2)
                c3 = from_hex(c3)
                c4 = from_hex(c4)
                c5 = from_hex(c5)

                ## NFC TEST
                ## The last term checks NFC(c5); the report below prints
                ## NFC(c5) as well, so the check and the report now agree.
                unless c2 == Unicode.nfc(c1) && c2 == Unicode.nfc(c2) &&
                    c2 == Unicode.nfc(c3) &&
                    c4 == Unicode.nfc(c4) && c4 == Unicode.nfc(c5)
                  puts "NFC NG: " + to_hex(c1)
                  printf("  c2=%s NFC(c1)=%s NFC(c2)=%s NFC(c3)=%s\n",
                         to_hex(c2),
                         to_hex(Unicode.nfc(c1)),
                         to_hex(Unicode.nfc(c2)),
                         to_hex(Unicode.nfc(c3)))
                  printf("  c4=%s NFC(c4)=%s NFC(c5)=%s\n",
                         to_hex(c4),
                         to_hex(Unicode.nfc(c4)),
                         to_hex(Unicode.nfc(c5)))
                end

                ## NFD TEST
                unless c3 == Unicode.nfd(c1) && c3 == Unicode.nfd(c2) &&
                    c3 == Unicode.nfd(c3) &&
                    c5 == Unicode.nfd(c4) && c5 == Unicode.nfd(c5)
                  puts "NFD NG: " + to_hex(c1)
                  printf("  c3=%s NFD(c1)=%s NFD(c2)=%s NFD(c3)=%s\n",
                         to_hex(c3),
                         to_hex(Unicode.nfd(c1)),
                         to_hex(Unicode.nfd(c2)),
                         to_hex(Unicode.nfd(c3)))
                  printf("  c5=%s NFD(c4)=%s NFD(c5)=%s\n",
                         to_hex(c5),
                         to_hex(Unicode.nfd(c4)),
                         to_hex(Unicode.nfd(c5)))
                end

                ## NFKC TEST
                unless c4 == Unicode.nfkc(c1) && c4 == Unicode.nfkc(c2) &&
                    c4 == Unicode.nfkc(c3) &&
                    c4 == Unicode.nfkc(c4) && c4 == Unicode.nfkc(c5)
                  puts "NFKC NG: " + to_hex(c1)
                  printf("  c4=%s NFKC(c1)=%s NFKC(c2)=%s NFKC(c3)=%s NFKC(c4)=%s NFKC(c5)=%s\n",
                         to_hex(c4),
                         to_hex(Unicode.nfkc(c1)),
                         to_hex(Unicode.nfkc(c2)),
                         to_hex(Unicode.nfkc(c3)),
                         to_hex(Unicode.nfkc(c4)),
                         to_hex(Unicode.nfkc(c5)))
                end

                ## NFKD TEST
                unless c5 == Unicode.nfkd(c1) && c5 == Unicode.nfkd(c2) &&
                    c5 == Unicode.nfkd(c3) &&
                    c5 == Unicode.nfkd(c4) && c5 == Unicode.nfkd(c5)
                  puts "NFKD NG: " + to_hex(c1)
                  printf("  c5=%s NFKD(c1)=%s NFKD(c2)=%s NFKD(c3)=%s NFKD(c4)=%s NFKD(c5)=%s\n",
                         to_hex(c5),
                         to_hex(Unicode.nfkd(c1)),
                         to_hex(Unicode.nfkd(c2)),
                         to_hex(Unicode.nfkd(c3)),
                         to_hex(Unicode.nfkd(c4)),
                         to_hex(Unicode.nfkd(c5)))
                end
              end
            end
         | 
    
        data/unicode.gemspec
    ADDED
    
    | @@ -0,0 +1,30 @@ | |
| 1 | 
            +
            # -*- encoding: utf-8 -*-
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            # Gem specification for the unicode library: package identity,
            # author contact, the native extension entry point and the list
            # of packaged files.
            Gem::Specification.new do |spec|
              spec.name = 'unicode'
              spec.version = "0.4.4"

              if spec.respond_to? :required_rubygems_version=
                spec.required_rubygems_version = Gem::Requirement.new(">= 0")
              end
              spec.authors = ['Yoshida Masato']
              spec.date = '2013-02-07'
              spec.email = 'yoshidam@yoshidam.net'
              spec.licenses = ['Ruby']
              # extconf.rb is the build script of the native extension.
              spec.extensions = ['ext/unicode/extconf.rb']
              spec.extra_rdoc_files = ['README']
              spec.files = [
                'README', 'Rakefile', 'unicode.gemspec', 'lib/unicode.rb',
                'test/test.rb', 'tools/README', 'tools/mkunidata.rb', 'tools/normtest.rb',
                'ext/unicode/extconf.rb', 'ext/unicode/unicode.c', 'ext/unicode/unidata.map',
                'ext/unicode/ustring.c', 'ext/unicode/ustring.h',
                'ext/unicode/wstring.c', 'ext/unicode/wstring.h'
              ]
              spec.homepage = 'http://www.yoshidam.net/Ruby.html#unicode'
              spec.require_paths = ['lib']
              spec.rubygems_version = '1.8.6'
              spec.summary = 'Unicode normalization library.'
              spec.description = 'Unicode normalization library.'

              spec.specification_version = 3 if spec.respond_to? :specification_version
            end
         | 
| 30 | 
            +
             | 
    
        metadata
    ADDED
    
    | @@ -0,0 +1,52 @@ | |
| 1 | 
            +
            --- !ruby/object:Gem::Specification
         | 
| 2 | 
            +
            name: unicode
         | 
| 3 | 
            +
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            +
              version: 0.4.4
         | 
| 5 | 
            +
            platform: java
         | 
| 6 | 
            +
            authors:
         | 
| 7 | 
            +
            - Yoshida Masato
         | 
| 8 | 
            +
            autorequire: 
         | 
| 9 | 
            +
            bindir: bin
         | 
| 10 | 
            +
            cert_chain: []
         | 
| 11 | 
            +
            date: 2013-02-07 00:00:00.000000000 Z
         | 
| 12 | 
            +
            dependencies: []
         | 
| 13 | 
            +
            description: Unicode normalization library.
         | 
| 14 | 
            +
            email: yoshidam@yoshidam.net
         | 
| 15 | 
            +
            executables: []
         | 
| 16 | 
            +
            extensions: []
         | 
| 17 | 
            +
            extra_rdoc_files:
         | 
| 18 | 
            +
            - README
         | 
| 19 | 
            +
            files:
         | 
| 20 | 
            +
            - README
         | 
| 21 | 
            +
            - Rakefile
         | 
| 22 | 
            +
            - lib/unicode.rb
         | 
| 23 | 
            +
            - test/test.rb
         | 
| 24 | 
            +
            - tools/README
         | 
| 25 | 
            +
            - tools/mkunidata.rb
         | 
| 26 | 
            +
            - tools/normtest.rb
         | 
| 27 | 
            +
            - unicode.gemspec
         | 
| 28 | 
            +
            homepage: http://www.yoshidam.net/Ruby.html#unicode
         | 
| 29 | 
            +
            licenses:
         | 
| 30 | 
            +
            - Ruby
         | 
| 31 | 
            +
            metadata: {}
         | 
| 32 | 
            +
            post_install_message: 
         | 
| 33 | 
            +
            rdoc_options: []
         | 
| 34 | 
            +
            require_paths:
         | 
| 35 | 
            +
            - lib
         | 
| 36 | 
            +
            required_ruby_version: !ruby/object:Gem::Requirement
         | 
| 37 | 
            +
              requirements:
         | 
| 38 | 
            +
              - - ">="
         | 
| 39 | 
            +
                - !ruby/object:Gem::Version
         | 
| 40 | 
            +
                  version: '0'
         | 
| 41 | 
            +
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 42 | 
            +
              requirements:
         | 
| 43 | 
            +
              - - ">="
         | 
| 44 | 
            +
                - !ruby/object:Gem::Version
         | 
| 45 | 
            +
                  version: '0'
         | 
| 46 | 
            +
            requirements: []
         | 
| 47 | 
            +
            rubyforge_project: 
         | 
| 48 | 
            +
            rubygems_version: 2.4.4
         | 
| 49 | 
            +
            signing_key: 
         | 
| 50 | 
            +
            specification_version: 3
         | 
| 51 | 
            +
            summary: Unicode normalization library.
         | 
| 52 | 
            +
            test_files: []
         |