camertron-eprun 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d9ad241156fd22c9d731cd199bcbed44c536335b
4
+ data.tar.gz: 336265b2dab02fbde594fbbd096b5658a4121be8
5
+ SHA512:
6
+ metadata.gz: 7cc4b9b77085815f16332187963520cf2155eb6f01fab8807209f0c685a8bd9c3f0f0549a83d6af4f336c1adb3164c27e5e97f906c495d8aed7fe88564ca7d4a
7
+ data.tar.gz: 9d03fab650391d0baf79c481b6ae1f3fbcf32be5a67b57edf310ecf96200ceeadc58a523810dec54680c46e72ac47d61005116dd8513d8812ce92f4b34d24f7f
data/Gemfile ADDED
@@ -0,0 +1,17 @@
1
# Gemfile: development/test dependencies; runtime dependencies come from the gemspec.
# Fix: use HTTPS for the gem source so downloads cannot be tampered with in transit.
source "https://rubygems.org"

gemspec

group :development do
  gem "rake"
  gem "pry-nav"
  gem "unicode"       # presumably for benchmark comparison — confirm against benchmark/
  gem "unf"           # presumably for benchmark comparison — confirm against benchmark/
  # gem "unicode_utils"
  gem "activesupport" # ActiveSupport::Multibyte appears in the benchmarks (see README)
end

group :test do
  gem "rspec"
  gem "rr"
end
@@ -0,0 +1,4 @@
1
+ == 1.0.0
2
+
3
+ * Repo converted into gem.
4
+ * Added MRI 1.8 compatibility.
data/LICENSE ADDED
@@ -0,0 +1,6 @@
1
+ Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
2
+ available under the same licence as Ruby itself
3
+ (see http://www.ruby-lang.org/en/LICENSE.txt)
4
+
5
+ The files in the 'data' subdirectory are © Unicode Consortium,
6
+ downloaded from http://www.unicode.org/Public/UCD/latest/ucd/.
@@ -0,0 +1,63 @@
1
+ Efficient Pure Ruby Unicode Normalization (eprun)
2
+ =================================================
3
+
4
+ (pronounced e-prune)
5
+
6
+ The Talk
7
+ --------
8
+
9
+ Please see the
10
+ [Internationalization & Unicode Conference 37](http://www.unicodeconference.org/)
11
+ talk on
12
+ [Implementing Normalization in Pure Ruby - the Fast and Easy Way](http://www.sw.it.aoyama.ac.jp/2013/pub/RubyNorm/).
13
+
14
+ Directories and Files
15
+ ---------------------
16
+
17
+ * lib/normalize.rb: The core normalization code.
18
+ * lib/string_normalize.rb: String#normalize.
19
+ * lib/generate.rb: Generation script, generates lib/normalize_tables.rb
20
+ from data/UnicodeData.txt and data/CompositionExclusions.txt.
21
+ This needs to be run only once when updating to a new Unicode version.
22
+ * lib/normalize_tables.rb: Data used for normalization,
23
+ automatically generated by lib/generate.rb.
24
+ * data/: All three files in this directory are downloaded from the
25
+ [Unicode Character Database](http://www.unicode.org/Public/UCD/latest/ucd/).
26
+ They are currently at Unicode version 6.3. They need to be updated for
27
+ a newer Unicode version (happens about once a year).
28
+ * test/test_normalize.rb: Tests for lib/string_normalize.rb,
29
+ using data/NormalizationTest.txt.
30
+ * benchmark/benchmark.rb: Runs the benchmark with example text files.
31
+ Automatically checks for existing gems/libraries; if e.g. the unicode_utils
32
+ gem is not available, that part of the benchmark is skipped.
33
+ This also applies to eprun, which will not be run on Ruby 1.8.
34
+ * benchmark/Deutsch_.txt, Japanese_.txt, Korean_.txt, Vietnamese_.txt:
35
+ example texts extracted from random Wikipedia pages
36
+ (see http://en.wikipedia.org/wiki/Wikipedia:Random).
37
+ The languages are chosen based on number of characters affected
38
+ by normalization (Deutsch < Japanese < Vietnamese < Korean).
39
+ These files have somewhat differing lengths,
40
+ so the results cannot directly be compared across languages.
41
+ Adding other files with ending "_.txt" will include them in
42
+ the benchmark.
43
+ * benchmark/benchmark_results.rb:
44
+ Results of benchmark for eprun, unicode_utils,
45
+ ActiveSupport::Multibyte (version 3.0.0), twitter_cldr, and the unicode gem.
46
+ Eprun, unicode_utils, and unicode normalizations are run 100 times each,
47
+ ActiveSupport::Multibyte is run 10 times each, and
48
+ twitter_cldr is run only 1 time (didn't want to wait any longer).
49
+ * benchmark/benchmark_results_jruby.txt:
50
+ Results of benchmark when using jruby (excludes unicode gem),
51
+ version 1.7.4 (1.9.3p392, 2013-05-16 2390d3b on Java HotSpot(TM) Client VM 1.7.0_07-b10 [Windows 7-x86]).
52
+ * benchmark/benchmark.pl: Runs the benchmark using Perl, both with
53
+ xsub (i.e. C) version (run 100 times) and pure Perl version
54
+ (run 10 times).
55
+ * benchmark/benchmark_results_pl.txt: Results of Perl benchmarks.
56
+
57
+ TODOs and Ideas
58
+ ---------------
59
+ * Publish as a gem, or several gems.
60
+ * Deal better with encodings other than UTF-8.
61
+ * Add methods such as String#nfc, String#nfd,...
62
+ * Add methods for normalization variants.
63
+ * See [talk](http://www.sw.it.aoyama.ac.jp/2013/pub/RubyNorm/) for more.
@@ -0,0 +1,40 @@
1
# encoding: utf-8

# Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
# available under the same licence as Ruby itself
# (see http://www.ruby-lang.org/en/LICENSE.txt)

require 'pathname' # Pathname was previously used without being required

# Project root (directory containing this Rakefile), also pushed onto the
# load path so `eprun/...` and `tasks/...` can be required directly.
ROOT_DIR = Pathname.new(File.join(File.dirname(__FILE__)))
$:.push(ROOT_DIR.to_s)

require 'eprun/helpers'
require 'tasks/erb_template'
require 'tasks/tables_generator'

require 'rubygems/package_task'

task :default => :test
Bundler::GemHelper.install_tasks

# Runs every test/test_*.rb file through Test::Unit.
task :test do
  require 'test/unit'
  files = Dir.glob("./test/test_*.rb")
  runner = Test::Unit::AutoRunner.new(true)
  runner.process_args(files)
  runner.run
end

# Regenerates the normalization tables from the Unicode data files in data/.
# Only needs to run when updating to a new Unicode version.
task :generate_tables do
  EprunTasks::TablesGenerator.new(
    ROOT_DIR.join("data").to_s,
    ROOT_DIR.join("lib", Eprun.require_path).to_s
  ).generate
end

# Runs the benchmark suite against the example texts in benchmark/.
task :benchmark do
  require 'eprun'
  require ROOT_DIR.join("benchmark/benchmark").to_s

  Eprun.enable_core_extensions!
  # Fix: `.to_s` was previously applied to the "benchmark" string literal
  # (a no-op) instead of the joined Pathname, so Benchmarks.new received a
  # Pathname rather than a String path, unlike every other call site here.
  EprunTasks::Benchmarks.new(ROOT_DIR.join("benchmark").to_s).run
end
@@ -0,0 +1,9 @@
1
# encoding: utf-8

# Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
# available under the same licence as Ruby itself
# (see http://www.ruby-lang.org/en/LICENSE.txt)

# Gem entry point: the helpers pick the implementation directory for the
# running interpreter; the generated tables must be loaded before the
# normalizer that reads them.
require "eprun/helpers"

%w[tables normalize].each do |file|
  Eprun.require_file(file)
end
@@ -0,0 +1,20 @@
1
# encoding: utf-8

# Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
# available under the same licence as Ruby itself
# (see http://www.ruby-lang.org/en/LICENSE.txt)

# Core extension: thin String wrappers around the Eprun normalizer.
class String
  # Returns a copy of this string normalized to +form+ (default :nfc).
  def normalize(form = :nfc)
    Eprun.normalize(self, form)
  end

  # In-place variant of #normalize; returns self with the normalized content.
  def normalize!(form = :nfc)
    replace(normalize(form))
  end

  # True when this string is already normalized in +form+.
  def normalized?(form = :nfc)
    Eprun.normalized?(self, form)
  end
end
@@ -0,0 +1,27 @@
1
# encoding: utf-8

# Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
# available under the same licence as Ruby itself
# (see http://www.ruby-lang.org/en/LICENSE.txt)

# Interpreter-detection helpers: choose between the 1.9+ implementation and
# the codepoint-array fallback for MRI 1.8.
class Eprun
  # Loads the String core extensions unless String#normalize already exists.
  def self.enable_core_extensions!
    require 'eprun/core_ext/string' unless "".respond_to?(:normalize)
  end

  # True when running under Ruby 1.8.x.
  def self.ruby18?
    RUBY_VERSION >= "1.8.0" && RUBY_VERSION < "1.9.0"
  end

  # Directory (under lib/) holding the implementation for this interpreter.
  def self.require_path
    if ruby18?
      "eprun/ruby18"
    else
      "eprun"
    end
  end

  # Requires an implementation file relative to require_path.
  def self.require_file(file)
    require File.join(require_path, file)
  end
end
@@ -0,0 +1,185 @@
1
# encoding: utf-8

# Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
# available under the same licence as Ruby itself
# (see http://www.ruby-lang.org/en/LICENSE.txt)

# Pure-Ruby Unicode normalization (NFC/NFD/NFKC/NFKD), string-based
# implementation for Ruby 1.9+ (uses String#ord, Integer#chr(Encoding),
# gsub with a Hash replacement). Only the substrings matched by the
# generated REGEXP_* patterns are renormalized; results are memoized
# in the NF_HASH_* caches.
class Eprun
  class << self

    # Constant for max hash capacity to avoid DoS attack
    MAX_HASH_LENGTH = 18000 # enough for all test cases, otherwise tests get slow

    ## Regular Expressions and Hash Constants
    # The REGEXP_*_STRING sources are defined in the generated tables file;
    # each matches the minimal substrings that may need work for that form.
    REGEXP_D = Regexp.compile(REGEXP_D_STRING, Regexp::EXTENDED)
    REGEXP_C = Regexp.compile(REGEXP_C_STRING, Regexp::EXTENDED)
    REGEXP_K = Regexp.compile(REGEXP_K_STRING, Regexp::EXTENDED)

    # Memoizing cache: matched substring -> its NFD form. Once the cache
    # exceeds MAX_HASH_LENGTH, the first (oldest-inserted) entry is evicted.
    NF_HASH_D = Hash.new do |hash, key|
      hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
      hash[key] = Eprun.nfd_one(key)
    end

    # Memoizing cache: matched substring -> its NFC form.
    NF_HASH_C = Hash.new do |hash, key|
      hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
      hash[key] = Eprun.nfc_one(key)
    end

    # Memoizing cache: matched substring -> its NFKD form (used as the first
    # gsub pass for both :nfkc and :nfkd in #normalize).
    NF_HASH_K = Hash.new do |hash, key|
      hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
      hash[key] = Eprun.nfkd_one(key)
    end

    # Read accessors for the caches (constants are not directly callable
    # from instance-side helpers/tests).
    def nf_hash_d
      NF_HASH_D
    end

    def nf_hash_c
      NF_HASH_C
    end

    def nf_hash_k
      NF_HASH_K
    end

    ## Constants For Hangul
    # Parameters of the algorithmic Hangul syllable (de)composition
    # (conjoining jamo arithmetic from the Unicode Standard).
    SBASE = 0xAC00
    LBASE = 0x1100
    VBASE = 0x1161
    TBASE = 0x11A7
    LCOUNT = 19
    VCOUNT = 21
    TCOUNT = 28
    NCOUNT = VCOUNT * TCOUNT
    SCOUNT = LCOUNT * NCOUNT


    ## Hangul Algorithm
    # Decomposes a leading precomposed Hangul syllable into its jamo
    # (L, V, and optionally T); returns `target` unchanged when its first
    # character is not a precomposed syllable.
    def hangul_decomp_one(target)
      sIndex = target.ord - SBASE
      return target if sIndex < 0 || sIndex >= SCOUNT
      l = LBASE + sIndex / NCOUNT
      v = VBASE + (sIndex % NCOUNT) / TCOUNT
      t = TBASE + sIndex % TCOUNT
      # t == TBASE means "no trailing consonant": emit only [l, v]
      (t == TBASE ? [l, v] : [l, v, t]).pack('U*') + target[1..-1]
    end

    # Composes leading conjoining jamo (L+V or L+V+T) back into a single
    # precomposed syllable; returns `string` unchanged when it does not
    # start with a composable jamo sequence.
    def hangul_comp_one(string)
      length = string.length
      in_range = length > 1 &&
                 0 <= (lead = string[0].ord - LBASE) &&
                 lead < LCOUNT &&
                 0 <= (vowel = string[1].ord - VBASE) &&
                 vowel < VCOUNT

      if in_range
        lead_vowel = SBASE + (lead * VCOUNT + vowel) * TCOUNT
        if length > 2 && 0 <= (trail = string[2].ord - TBASE) && trail < TCOUNT
          (lead_vowel + trail).chr(Encoding::UTF_8) + string[3..-1]
        else
          lead_vowel.chr(Encoding::UTF_8) + string[2..-1]
        end
      else
        string
      end
    end

    ## Canonical Ordering
    # Sorts combining marks by canonical combining class (CLASS_TABLE) with
    # a stable bubble sort; starters (class 0) block reordering.
    def canonical_ordering_one(string)
      sorting = string.each_char.collect { |c| [c, CLASS_TABLE[c]] }
      (sorting.length - 2).downto(0) do |i| # bubble sort
        (0..i).each do |j|
          later_class = sorting[j + 1].last
          # swap only when the later char is a combining mark (class > 0)
          # with a strictly smaller class — keeps equal classes stable
          if 0 < later_class && later_class < sorting[j].last
            sorting[j], sorting[j + 1] = sorting[j + 1], sorting[j]
          end
        end
      end
      sorting.collect(&:first).join
    end

    ## Normalization Forms for Patterns (not whole Strings)
    # Canonical decomposition of one matched substring: per-character
    # DECOMPOSITION_TABLE lookup, then Hangul decomposition and canonical
    # reordering.
    def nfd_one(string)
      string = string.dup
      (0...string.length).each do |position|
        if decomposition = DECOMPOSITION_TABLE[string[position]]
          string[position] = decomposition
        end
      end
      canonical_ordering_one(hangul_decomp_one(string))
    end

    # Compatibility decomposition of one matched substring. The position is
    # deliberately NOT advanced after a replacement, so the replacement is
    # re-examined (KOMPATIBLE_TABLE entries may decompose further).
    def nfkd_one(string)
      string = string.dup
      position = 0
      while position < string.length
        if decomposition = KOMPATIBLE_TABLE[string[position]]
          string[position] = decomposition
        else
          position += 1
        end
      end
      string
    end

    # Canonical composition of one matched substring: decompose first, then
    # greedily combine the starter with following marks via COMPOSITION_TABLE
    # (respecting combining-class blocking), finally recompose Hangul.
    def nfc_one(string)
      nfd_string = nfd_one string
      start = nfd_string[0]
      last_class = CLASS_TABLE[start] - 1
      accents = ''
      nfd_string[1..-1].each_char do |accent|
        accent_class = CLASS_TABLE[accent]
        # a mark may compose only if not "blocked" by a preceding mark of
        # equal or higher combining class
        if last_class < accent_class && composite = COMPOSITION_TABLE[start+accent]
          start = composite
        else
          accents += accent
          last_class = accent_class
        end
      end
      hangul_comp_one(start + accents)
    end

    # Returns `string` normalized to `form` (:nfc, :nfd, :nfkc, :nfkd).
    # Non-UTF-8 input is transcoded to UTF-8, normalized, and transcoded
    # back to its original encoding. Raises ArgumentError on unknown forms.
    def normalize(string, form = :nfc)
      encoding = string.encoding
      if encoding == Encoding::UTF_8
        case form
        when :nfc then
          string.gsub(REGEXP_C, NF_HASH_C)
        when :nfd then
          string.gsub(REGEXP_D, NF_HASH_D)
        when :nfkc then
          # compatibility-decompose, then canonically compose
          string.gsub(REGEXP_K, NF_HASH_K).gsub(REGEXP_C, NF_HASH_C)
        when :nfkd then
          string.gsub(REGEXP_K, NF_HASH_K).gsub(REGEXP_D, NF_HASH_D)
        else
          raise ArgumentError, "Invalid normalization form #{form}."
        end
      else
        normalize(string.encode(Encoding::UTF_8), form).encode(encoding)
      end
    end

    # True when `string` is already normalized in `form`. Checked match by
    # match (each suspicious substring compared to its own normalization)
    # without building the whole normalized string.
    def normalized?(string, form = :nfc)
      string = string.encode(Encoding::UTF_8) unless string.encoding == Encoding::UTF_8
      case form
      when :nfc then
        string.scan(REGEXP_C) do |match|
          return false if NF_HASH_C[match] != match
        end
        true
      when :nfd then
        string.scan(REGEXP_D) do |match|
          return false if NF_HASH_D[match] != match
        end
        true
      when :nfkc then
        # NFKC-normalized iff NFC-normalized and free of compatibility chars
        normalized?(string, :nfc) && string !~ REGEXP_K
      when :nfkd then
        normalized?(string, :nfd) && string !~ REGEXP_K
      else
        raise ArgumentError, "Invalid normalization form #{form}."
      end
    end

  end
end
@@ -0,0 +1,198 @@
1
# encoding: utf-8

# Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
# available under the same licence as Ruby itself
# (see http://www.ruby-lang.org/en/LICENSE.txt)

# Ruby 1.8 fallback implementation of the Unicode normalizer. Unlike the
# 1.9+ version, the *_one helpers here operate on ARRAYS OF INTEGER
# CODEPOINTS (strings are unpacked with "U*"), and the NF_HASH_* caches
# pack results back into UTF-8 strings for gsub.
class Eprun
  class << self

    ## Constant for max hash capacity to avoid DoS attack
    MAX_HASH_LENGTH = 18000 # enough for all test cases, otherwise tests get slow

    ## Regular Expressions and Hash Constants
    # The REGEXP_*_STRING sources are defined in the generated tables file.
    REGEXP_D = Regexp.compile(REGEXP_D_STRING, Regexp::EXTENDED)
    REGEXP_C = Regexp.compile(REGEXP_C_STRING, Regexp::EXTENDED)
    REGEXP_K = Regexp.compile(REGEXP_K_STRING, Regexp::EXTENDED)

    # Memoizing cache: matched substring -> its NFD form (codepoint array
    # from nfd_one, packed back to a UTF-8 string). One entry is evicted
    # once the cache exceeds MAX_HASH_LENGTH.
    NF_HASH_D = Hash.new do |hash, key|
      hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
      hash[key] = Eprun.nfd_one(key).pack("U*")
    end

    # Memoizing cache: matched substring -> its NFC form.
    NF_HASH_C = Hash.new do |hash, key|
      hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
      hash[key] = Eprun.nfc_one(key).pack("U*")
    end

    # Memoizing cache: matched substring -> its NFKD form (first pass for
    # both :nfkc and :nfkd in #normalize).
    NF_HASH_K = Hash.new do |hash, key|
      hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
      hash[key] = Eprun.nfkd_one(key).pack("U*")
    end

    # Read accessors for the caches.
    def nf_hash_d
      NF_HASH_D
    end

    def nf_hash_c
      NF_HASH_C
    end

    def nf_hash_k
      NF_HASH_K
    end

    ## Constants For Hangul
    # Parameters of the algorithmic Hangul syllable (de)composition
    # (conjoining jamo arithmetic from the Unicode Standard).
    SBASE = 0xAC00
    LBASE = 0x1100
    VBASE = 0x1161
    TBASE = 0x11A7
    LCOUNT = 19
    VCOUNT = 21
    TCOUNT = 28
    NCOUNT = VCOUNT * TCOUNT
    SCOUNT = LCOUNT * NCOUNT

    # Coerces `source` into an array of integer codepoints: arrays pass
    # through unchanged, strings are unpacked as UTF-8 ("U*").
    def get_codepoints(source)
      if source.is_a?(Array)
        source
      elsif source.is_a?(String)
        source.unpack("U*")
      else
        raise ArgumentError, "Source must be a string or an array."
      end
    end

    ## Hangul Algorithm
    # Decomposes a leading precomposed Hangul syllable into its jamo
    # codepoints (L, V, and optionally T); returns `target` unchanged when
    # its first codepoint is not a precomposed syllable.
    def hangul_decomp_one(target)
      cps = get_codepoints(target)
      sIndex = cps.first - SBASE
      return target if sIndex < 0 || sIndex >= SCOUNT
      l = LBASE + sIndex / NCOUNT
      v = VBASE + (sIndex % NCOUNT) / TCOUNT
      t = TBASE + sIndex % TCOUNT
      # t == TBASE means "no trailing consonant": emit only [l, v]
      (t == TBASE ? [l, v] : [l, v, t]) + cps[1..-1]
    end

    # Composes leading conjoining jamo (L+V or L+V+T) back into a single
    # precomposed syllable codepoint; returns `string` unchanged when it
    # does not start with a composable jamo sequence.
    def hangul_comp_one(string)
      cps = get_codepoints(string)
      length = cps.length

      in_range = length > 1 &&
                 0 <= (lead = cps[0] - LBASE) &&
                 lead < LCOUNT &&
                 0 <= (vowel = cps[1] - VBASE) &&
                 vowel < VCOUNT

      if in_range
        lead_vowel = SBASE + (lead * VCOUNT + vowel) * TCOUNT
        if length > 2 && 0 <= (trail = cps[2] - TBASE) && trail < TCOUNT
          [lead_vowel + trail] + cps[3..-1]
        else
          [lead_vowel] + cps[2..-1]
        end
      else
        string
      end
    end

    ## Canonical Ordering
    # Sorts combining marks by canonical combining class (CLASS_TABLE) with
    # a stable bubble sort; starters (class 0) block reordering. Returns a
    # codepoint array.
    def canonical_ordering_one(string)
      cps = get_codepoints(string)
      sorting = cps.collect { |c| [c, CLASS_TABLE[c]] }

      (sorting.length - 2).downto(0) do |i| # bubble sort
        (0..i).each do |j|
          later_class = sorting[j + 1].last
          # swap only when the later codepoint is a combining mark
          # (class > 0) with a strictly smaller class — keeps the sort stable
          if 0 < later_class && later_class < sorting[j].last
            sorting[j], sorting[j + 1] = sorting[j + 1], sorting[j]
          end
        end
      end
      sorting.collect(&:first)
    end

    ## Normalization Forms for Patterns (not whole Strings)
    # Canonical decomposition: per-codepoint DECOMPOSITION_TABLE expansion,
    # then Hangul decomposition and canonical reordering. Returns a
    # codepoint array.
    def nfd_one(string)
      cps = get_codepoints(string)
      cps = cps.inject([]) do |ret, cp|
        if decomposition = DECOMPOSITION_TABLE[cp]
          ret += decomposition
        else
          ret << cp
        end
      end

      canonical_ordering_one(hangul_decomp_one(cps))
    end

    # Compatibility decomposition; recurses into each KOMPATIBLE_TABLE
    # replacement so nested compatibility mappings are fully expanded.
    # Returns a codepoint array.
    def nfkd_one(string)
      cps = get_codepoints(string)
      final_cps = []
      position = 0
      while position < cps.length
        if decomposition = KOMPATIBLE_TABLE[cps[position]]
          final_cps += nfkd_one(decomposition)
        else
          final_cps << cps[position]
        end
        position += 1
      end
      final_cps
    end

    # Canonical composition: decompose first, then greedily combine the
    # starter with following marks via COMPOSITION_TABLE (keyed by the
    # [starter, mark] codepoint pair, respecting combining-class blocking),
    # finally recompose Hangul. Returns a codepoint array.
    def nfc_one(string)
      nfd_cps = nfd_one(string)
      start = nfd_cps[0]
      last_class = CLASS_TABLE[start] - 1
      accents = []
      nfd_cps[1..-1].each do |accent_cp|
        accent_class = CLASS_TABLE[accent_cp]
        # a mark may compose only if not "blocked" by a preceding mark of
        # equal or higher combining class
        if last_class < accent_class && composite = COMPOSITION_TABLE[[start, accent_cp]]
          start = composite
        else
          accents << accent_cp
          last_class = accent_class
        end
      end
      hangul_comp_one([start] + accents)
    end

    # Returns `string` normalized to `form` (:nfc, :nfd, :nfkc, :nfkd).
    # NOTE(review): unlike the 1.9+ version there is no encoding handling
    # here (1.8 strings are byte strings) and :nfkc performs only the K
    # pass followed by the C pass via gsub blocks.
    def normalize(string, form = :nfc)
      case form
      when :nfc then
        string.gsub(REGEXP_C) { |s| NF_HASH_C[s] }
      when :nfd then
        string.gsub(REGEXP_D) { |s| NF_HASH_D[s] }
      when :nfkc then
        string.gsub(REGEXP_K) { |s| NF_HASH_K[s] }.gsub(REGEXP_C) { |s| NF_HASH_C[s] }
      when :nfkd then
        string.gsub(REGEXP_K) { |s| NF_HASH_K[s] }.gsub(REGEXP_D) { |s| NF_HASH_D[s] }
      else
        raise ArgumentError, "Invalid normalization form #{form}."
      end
    end

    # True when `string` is already normalized in `form`. Each suspicious
    # match is compared against its own normalization; the full normalized
    # string is never built.
    def normalized?(string, form = :nfc)
      case form
      when :nfc then
        string.scan REGEXP_C do |match|
          return false if NF_HASH_C[match] != match
        end
        true
      when :nfd then
        string.scan REGEXP_D do |match|
          return false if NF_HASH_D[match] != match
        end
        true
      when :nfkc then
        # NFKC-normalized iff NFC-normalized and free of compatibility chars
        normalized?(string, :nfc) && string !~ REGEXP_K
      when :nfkd then
        normalized?(string, :nfd) && string !~ REGEXP_K
      else
        raise ArgumentError, "Invalid normalization form #{form}."
      end
    end

  end
end # class