RubyGems - character-encodings - Versions diffs - 0.2.0 - Mend

character-encodings 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

data/README +26 -0
data/Rakefile +157 -0
data/ext/encoding/character/unicode/codepoint.c +48 -0
data/ext/encoding/character/utf-8/break.c +38 -0
data/ext/encoding/character/utf-8/data/break.h +22931 -0
data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
data/ext/encoding/character/utf-8/data/compose.h +1607 -0
data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
data/ext/encoding/character/utf-8/decompose.c +476 -0
data/ext/encoding/character/utf-8/depend +64 -0
data/ext/encoding/character/utf-8/extconf.rb +47 -0
data/ext/encoding/character/utf-8/private.h +68 -0
data/ext/encoding/character/utf-8/properties.c +1061 -0
data/ext/encoding/character/utf-8/rb_includes.h +18 -0
data/ext/encoding/character/utf-8/rb_methods.h +49 -0
data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
data/ext/encoding/character/utf-8/unicode.c +319 -0
data/ext/encoding/character/utf-8/unicode.h +208 -0
data/ext/encoding/character/utf-8/utf.c +1332 -0
data/lib/encoding/character/utf-8.rb +201 -0
data/specifications/aref.rb +45 -0
data/specifications/count.rb +29 -0
data/specifications/delete.rb +25 -0
data/specifications/each_char.rb +28 -0
data/specifications/index.rb +35 -0
data/specifications/insert.rb +67 -0
data/specifications/length.rb +45 -0
data/specifications/rindex.rb +52 -0
data/specifications/squeeze.rb +25 -0
data/specifications/to_i.rb +54 -0
data/specifications/tr.rb +39 -0
data/tests/foldcase.rb +28 -0
data/tests/normalize.rb +101 -0
data/tests/unicodedatatestbase.rb +45 -0
metadata +112 -0

data/specifications/rindex.rb ADDED Viewed

@@ -0,0 +1,52 @@
+# contents: Specification of String#rindex.
+#
+# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+require 'encoding/character/utf-8'
+# rindex of the empty needle on an empty string is expected to be 0.
+context "An empty string" do
+  setup do
+    @string = u""
+  end
+  specify "should contain the empty string at index 0" do
+    @string.rindex("").should_equal 0
+  end
+# The spec below is disabled with =begin/=end; it describes rindex with an
+# explicit start position on an empty string.  The reason for disabling it is
+# not recorded here.
+=begin
+  specify "shouldn’t contain any string at an index > 0" do
+    @string.rindex("", 1).should_be nil
+    @string.rindex("", -1).should_be nil
+  end
+=end
+end
+# The expected indices (3 for “lö”, 0 for “hë”) count characters.
+# NOTE(review): presumably a byte-based rindex would give a different answer
+# for “lö”, since ‘ë’ is multi-byte in UTF-8 — confirm the literals are
+# precomposed.
+context "The string “hëllö”" do
+  setup do
+    @string = u"hëllö"
+  end
+  specify "should contain the string “lö” at index 3" do
+    @string.rindex("lö").should_equal 3
+    # An explicit start position equal to the match index still finds it.
+    @string.rindex("lö", 3).should_equal 3
+  end
+  specify "should contain the string “hë” at index 0" do
+    @string.rindex("hë").should_equal 0
+  end
+end
+# “lö” occurs twice here (at indices 3 and 5), so the start position decides
+# which occurrence is reported.
+context "The string “hëllölö”" do
+  setup do
+    @string = u"hëllölö"
+  end
+  specify "should contain the string “lö” at index 5" do
+    @string.rindex("lö").should_equal 5
+    @string.rindex("lö", 5).should_equal 5
+  end
+  specify "should contain the string “lö” at index 3, when starting at index 4" do
+    # Starting at 4 excludes the occurrence at 5, leaving the one at 3.
+    @string.rindex("lö", 4).should_equal 3
+  end
+end

data/specifications/squeeze.rb ADDED Viewed

@@ -0,0 +1,25 @@
+# contents: Specification of String#squeeze.
+#
+# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+require 'encoding/character/utf-8'
+context "An empty string" do
+  setup do
+    @string = u""
+  end
+  specify "should return an empty string after squeezing anything" do
+    @string.delete("whatever").should_be_empty
+  end
+end
+context "The string “hëllö”" do
+  setup do
+    @string = u"hëllö"
+  end
+  specify "should return “hëlö” after squeezing all ‘ö’’s" do
+    @string.squeeze.should_equal "hëlö"
+  end
+end

data/specifications/to_i.rb ADDED Viewed

@@ -0,0 +1,54 @@
+# contents: Specification of String#to_i.
+#
+# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+require 'encoding/character/utf-8'
+context "An empty string" do
+  setup do
+    @string = u""
+  end
+  specify "should raise an ArgumentError when sent #to_i with an illegal base" do
+    [-2, -1, 0, 1, 37, 38].each{ |base| proc{ @string.to_i(1) }.should_raise ArgumentError }
+  end
+  specify "should return 0 when sent #to_i, using any legal base" do
+    @string.to_i.should_equal 0
+    2.upto(36){ |base| @string.to_i(base).should_equal 0 }
+  end
+end
+context "The string “1”" do
+  setup do
+    @string = u"1"
+  end
+  specify "should return 1 when sent #to_i, using any legal base" do
+    @string.to_i.should_equal 1
+    2.upto(36){ |base| @string.to_i(base).should_equal 1 }
+  end
+end
+context "The string “٠”" do
+  setup do
+    @string = u"١"
+  end
+  specify "should return 1 when sent #to_i, using any legal base" do
+    @string.to_i.should_equal 1
+    2.upto(36){ |base| @string.to_i(base).should_equal 1 }
+  end
+end
+=begin
+context "The string “ⅷ”" do
+  setup do
+    @string = u"ⅷ"
+  end
+  specify "should return 8 when sent #to_i, using base 10" do
+    @string.to_i.should_equal 8
+  end
+end
+=end

data/specifications/tr.rb ADDED Viewed

@@ -0,0 +1,39 @@
+# contents: Specification of String#tr.
+#
+# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+require 'encoding/character/utf-8'
+# Translating an empty string yields an empty string whatever the mapping.
+context "An empty string" do
+  setup do
+    @string = u""
+  end
+  specify "should stay the same for any translation" do
+    @string.tr("abc", "def").should_be_empty
+  end
+end
+# Non-ASCII characters can appear in the "from" set, either listed on their
+# own or next to an ASCII range such as “a-z”.
+context "The string “äbcdë”" do
+  setup do
+    @string = u"äbcdë"
+  end
+  specify "should return the string “abcde” when ‘ä’ and ‘ë’ are translated to ‘a’ and ‘e’" do
+    @string.tr("äë", "ae").should_equal "abcde"
+  end
+  specify "should return the string “ëëëëë” when “a-zäë” are translated to ‘ë’" do
+    # A "to" set shorter than the "from" set: every matched character is
+    # expected to become ‘ë’.
+    @string.tr("a-zäë", "ë").should_equal "ëëëëë"
+  end
+end
+# A range in the "to" set.
+# NOTE(review): core Ruby's String#tr maps a one-character "from" set to the
+# FIRST character of the "to" range (which would give “äääää”); confirm that
+# “ëëëëë” is really the intended result here.
+context "The string “aaaaa”" do
+  setup do
+    @string = u"aaaaa"
+  end
+  specify "should return the string “ëëëëë” when “a” is translated to ‘ä-ë’" do
+    @string.tr("a", "ä-ë").should_equal "ëëëëë"
+  end
+end

data/tests/foldcase.rb ADDED Viewed

@@ -0,0 +1,28 @@
+# contents: Tests for String#foldcase.
+#
+# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+require 'tests/unicodedatatestbase'
+require 'encoding/character/utf-8'
+class TC_StringFoldcase < Test::Unit::TestCase
+  include UnicodeDataTestBase
+  Code, Status, Mapping = (0..2).to_a
+  def test_foldcase
+    open_data_file('CaseFolding.txt') do |file|
+      i = 0
+      file.each_line do |line|
+        i += 1
+        next if line =~ /^#/
+        next if line =~ /^\s*$/
+        fields = line.split('; ')
+        raise "#{line}: Wrong number of fields; #{field.size} instead of 4." unless fields.size == 4
+        next if fields[Status] == 'S' || fields[Status] == 'T'
+        numbers = fields[Mapping].split(' ').map{ |s| s.hex }
+        assert_equal(numbers.pack('U*'), u([fields[Code].hex].pack('U')).foldcase)
+      end
+    end
+  end
+end

data/tests/normalize.rb ADDED Viewed

@@ -0,0 +1,101 @@
+# contents: Tests for String#normalize.
+#
+# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+require 'ostruct'
+require 'tests/unicodedatatestbase'
+require 'encoding/character/utf-8'
+class TC_StringUnicodeNormalize < Test::Unit::TestCase
+  include UnicodeDataTestBase
+  Data = OpenStruct.new
+  Data.wanted_line = ENV['line'] ? ENV['line'].to_i : 0
+  Data.line = 0
+  def test_part_0
+    Data.file = open_data_file('NormalizationTest.txt')
+    read_lines_until(/^@Part0/)
+    read_lines_until(/^@Part1/){ |columns| test_columns(columns) }
+  end
+  def test_part_1
+    read_lines_until(/^@Part2/){ |columns| test_columns(columns) }
+  end
+  def test_part_2
+    read_lines_until(/^@Part3/){ |columns| test_columns(columns) }
+  end
+  def test_part_3
+    read_lines_until(:last){ |columns| test_columns(columns) }
+    Data.file.close
+  end
+private
+  def read_lines_until(line = :last, &block)
+    if line == :last
+      until Data.file.eof?
+        deal_with_line(Data.file.gets, &block)
+      end
+    else
+      while (got_line = Data.file.gets) !~ line
+        raise "unexpected end of file while looking for #{line}" unless got_line
+        deal_with_line(got_line, &block)
+      end
+      Data.line += 1
+    end
+  end
+  def deal_with_line(line)
+    Data.line += 1
+    return if line[0] == ?#
+    columns = line.split(';', 6)
+    return if columns.length == 0
+    raise "#{Data.file}:#{Data.line}: Format of line does not conform to standard" unless columns.length == 6
+    return if Data.wanted_line != 0 and Data.line != Data.wanted_line
+    yield columns if block_given?
+  end
+  def encode(string)
+    string.unpack('U*').map{ |c| '%04X' % c }.join(' ')
+  end
+  def test_columns(columns)
+    catch :skip do
+      strings = columns[0..4].map do |c|
+        s = u(c.split(' ').map{ |i| i.to_i(16) }.pack("U*"));
+        throw :skip if s.empty?
+        s
+      end
+      [ [:nfd, false, 2], [:nfd, true, 4],
+        [:nfc, false, 1], [:nfc, true, 3],
+        [:nfkd, true, 4],
+        [:nfkc, true, 3] ].each do |mode, compat, expected|
+        test_normalization(columns, strings, mode, compat, expected)
+      end
+    end
+  end
+  def test_normalization(columns, strings, mode, compat, expected)
+    mode_is_compat = (mode == :nfkc || mode == :nfkd)
+    if mode_is_compat || !compat
+      0.upto(2){ |i| test_one_normalization(columns, strings, mode, compat, expected, i, i + 1) }
+    end
+    if mode_is_compat || compat
+      3.upto(4){ |i| test_one_normalization(columns, strings, mode, compat, expected, i, i) }
+    end
+  end
+  def test_one_normalization(columns, strings, mode, compat, expected, column, file_column)
+    normalized = strings[column].normalize(mode)
+    unless normalized.eql? strings[expected]
+      m = mode.to_s.upcase
+      flunk <<EOM
+#{Data.line}:#{file_column}: #{m} normalization failed for #{columns[5].chomp.sub(/\s+#\s+\([^)]+\) /, "")}.
+      #{m}(#{columns[column]}) = #{columns[expected]}, not #{encode(normalized)} (#{normalized.inspect}).
+EOM
+    end
+  end
+end

data/tests/unicodedatatestbase.rb ADDED Viewed

@@ -0,0 +1,45 @@
+# contents: Auxiliary classes and methods for Test::Unit based tests.
+#
+# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+require 'test/unit'
+module UnicodeDataTestBase
+  UnidataBase = 'http://www.unicode.org/Public/UNIDATA/'
+  def open_data_file(file, &block)
+    dir = File.join(File.dirname(__FILE__), 'data')
+    begin
+      Dir.mkdir(dir)
+    rescue Errno::EEXIST
+    end
+    path = File.join(dir, file)
+    begin
+      File.open(path, &block)
+    rescue Errno::ENOENT
+      url = UnidataBase + file
+      print <<EOM
+#{file} is missing.  However, it can easily be downloaded at
+#{url}.
+EOM
+      require 'readline' rescue exit 1
+      print <<EOM
+If you would like, I can download and install it for you.
+If so, please type “yes”:
+EOM
+      exit 1 unless Readline.readline == 'yes'
+      puts "OK, trying to fetch #{file} for you…"
+      require 'open-uri'
+      length = 0
+      open(url,
+          :content_length_proc => proc{ |size| length = size if size and size > 0 },
+          :progress_proc => proc{ |size| print(length ? "\r%3d%" % (100 * size / length) : "\r#{size}") }) do |remote|
+        File.open(path, 'w') do |local|
+          local.write remote.read(8192) until remote.eof?
+        end
+      end
+      puts "\nAh, finally done.  I’ll try to open #{file} again now."
+      retry
+    end
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,112 @@
+--- !ruby/object:Gem::Specification
+rubygems_version: 0.9.0
+specification_version: 1
+name: character-encodings
+version: !ruby/object:Gem::Version
+  version: 0.2.0
+date: 2006-07-27 00:00:00 +02:00
+summary: A pluggable character-encoding library
+require_paths:
+- lib
+email: now@bitwi.se
+homepage: http://git.bitwi.se/?p=ruby-character-encodings.git;a=summary
+rubyforge_project:
+description: A pluggable character-encoding library
+autorequire:
+default_executable:
+bindir: bin
+has_rdoc: false
+required_ruby_version: !ruby/object:Gem::Version::Requirement
+  requirements:
+  - - ">"
+    - !ruby/object:Gem::Version
+      version: 0.0.0
+  version:
+platform: ruby
+signing_key:
+cert_chain:
+post_install_message:
+authors:
+- Nikolai Weibull
+files:
+- README
+- Rakefile
+- lib/encoding
+- lib/encoding/character
+- lib/encoding/character/utf-8.rb
+- specifications/aref.rb
+- specifications/count.rb
+- specifications/delete.rb
+- specifications/each_char.rb
+- specifications/index.rb
+- specifications/insert.rb
+- specifications/length.rb
+- specifications/rindex.rb
+- specifications/squeeze.rb
+- specifications/to_i.rb
+- specifications/tr.rb
+- ext/encoding/character/unicode/codepoint.c
+- ext/encoding/character/utf-8/break.c
+- ext/encoding/character/utf-8/decompose.c
+- ext/encoding/character/utf-8/properties.c
+- ext/encoding/character/utf-8/rb_utf_aref.c
+- ext/encoding/character/utf-8/rb_utf_aset.c
+- ext/encoding/character/utf-8/rb_utf_casecmp.c
+- ext/encoding/character/utf-8/rb_utf_chomp.c
+- ext/encoding/character/utf-8/rb_utf_chop.c
+- ext/encoding/character/utf-8/rb_utf_collate.c
+- ext/encoding/character/utf-8/rb_utf_count.c
+- ext/encoding/character/utf-8/rb_utf_delete.c
+- ext/encoding/character/utf-8/rb_utf_downcase.c
+- ext/encoding/character/utf-8/rb_utf_each_char.c
+- ext/encoding/character/utf-8/rb_utf_foldcase.c
+- ext/encoding/character/utf-8/rb_utf_hex.c
+- ext/encoding/character/utf-8/rb_utf_index.c
+- ext/encoding/character/utf-8/rb_utf_insert.c
+- ext/encoding/character/utf-8/rb_utf_internal_tr.c
+- ext/encoding/character/utf-8/rb_utf_justify.c
+- ext/encoding/character/utf-8/rb_utf_length.c
+- ext/encoding/character/utf-8/rb_utf_lstrip.c
+- ext/encoding/character/utf-8/rb_utf_normalize.c
+- ext/encoding/character/utf-8/rb_utf_oct.c
+- ext/encoding/character/utf-8/rb_utf_reverse.c
+- ext/encoding/character/utf-8/rb_utf_rindex.c
+- ext/encoding/character/utf-8/rb_utf_rstrip.c
+- ext/encoding/character/utf-8/rb_utf_squeeze.c
+- ext/encoding/character/utf-8/rb_utf_strip.c
+- ext/encoding/character/utf-8/rb_utf_to_i.c
+- ext/encoding/character/utf-8/rb_utf_tr.c
+- ext/encoding/character/utf-8/rb_utf_upcase.c
+- ext/encoding/character/utf-8/unicode.c
+- ext/encoding/character/utf-8/utf.c
+- ext/encoding/character/utf-8/rb_utf_internal_bignum.c
+- ext/encoding/character/utf-8/private.h
+- ext/encoding/character/utf-8/rb_includes.h
+- ext/encoding/character/utf-8/rb_methods.h
+- ext/encoding/character/utf-8/rb_utf_internal_tr.h
+- ext/encoding/character/utf-8/unicode.h
+- ext/encoding/character/utf-8/rb_utf_internal_bignum.h
+- ext/encoding/character/utf-8/data/break.h
+- ext/encoding/character/utf-8/data/character-tables.h
+- ext/encoding/character/utf-8/data/compose.h
+- ext/encoding/character/utf-8/data/decompose.h
+- ext/encoding/character/utf-8/extconf.rb
+- ext/encoding/character/utf-8/data/generate-unicode-data.rb
+- ext/encoding/character/utf-8/depend
+- tests/foldcase.rb
+- tests/normalize.rb
+- tests/unicodedatatestbase.rb
+test_files: []
+rdoc_options: []
+extra_rdoc_files: []
+executables: []
+extensions:
+- ext/encoding/character/utf-8/extconf.rb
+requirements: []
+dependencies: []