character-encodings 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. data/README +26 -0
  2. data/Rakefile +157 -0
  3. data/ext/encoding/character/unicode/codepoint.c +48 -0
  4. data/ext/encoding/character/utf-8/break.c +38 -0
  5. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  6. data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
  7. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  8. data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
  9. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
  10. data/ext/encoding/character/utf-8/decompose.c +476 -0
  11. data/ext/encoding/character/utf-8/depend +64 -0
  12. data/ext/encoding/character/utf-8/extconf.rb +47 -0
  13. data/ext/encoding/character/utf-8/private.h +68 -0
  14. data/ext/encoding/character/utf-8/properties.c +1061 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +18 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  19. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  20. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  22. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  23. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  24. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  25. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  26. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  27. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  28. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  29. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  30. data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
  31. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  35. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  36. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  37. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  38. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  39. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  40. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  41. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  43. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  44. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  45. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  46. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  47. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  48. data/ext/encoding/character/utf-8/unicode.c +319 -0
  49. data/ext/encoding/character/utf-8/unicode.h +208 -0
  50. data/ext/encoding/character/utf-8/utf.c +1332 -0
  51. data/lib/encoding/character/utf-8.rb +201 -0
  52. data/specifications/aref.rb +45 -0
  53. data/specifications/count.rb +29 -0
  54. data/specifications/delete.rb +25 -0
  55. data/specifications/each_char.rb +28 -0
  56. data/specifications/index.rb +35 -0
  57. data/specifications/insert.rb +67 -0
  58. data/specifications/length.rb +45 -0
  59. data/specifications/rindex.rb +52 -0
  60. data/specifications/squeeze.rb +25 -0
  61. data/specifications/to_i.rb +54 -0
  62. data/specifications/tr.rb +39 -0
  63. data/tests/foldcase.rb +28 -0
  64. data/tests/normalize.rb +101 -0
  65. data/tests/unicodedatatestbase.rb +45 -0
  66. metadata +112 -0
@@ -0,0 +1,52 @@
1
+ # contents: Specification of String#rindex.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# String#rindex on a string that holds no characters at all.
context "An empty string" do
  setup do
    @string = u("")
  end

  specify "should contain the empty string at index 0" do
    @string.rindex("").should_equal(0)
  end

  # Disabled specification, kept for reference:
  #
  # specify "shouldn’t contain any string at an index > 0" do
  #   @string.rindex("", 1).should_be nil
  #   @string.rindex("", -1).should_be nil
  # end
end
23
+
24
# String#rindex on a string whose non-ASCII characters make character
# indices differ from byte offsets.
context "The string “hëllö”" do
  setup do
    @string = u("hëllö")
  end

  specify "should contain the string “lö” at index 3" do
    # Searching from the end and searching from index 3 must agree.
    @string.rindex("lö").should_equal(3)
    @string.rindex("lö", 3).should_equal(3)
  end

  specify "should contain the string “hë” at index 0" do
    @string.rindex("hë").should_equal(0)
  end
end
38
+
39
# String#rindex on a string that contains the needle “lö” twice
# (at character indices 3 and 5).
context "The string “hëllölö”" do
  setup do
    @string = u("hëllölö")
  end

  specify "should contain the string “lö” at index 5" do
    @string.rindex("lö").should_equal(5)
    @string.rindex("lö", 5).should_equal(5)
  end

  specify "should contain the string “lö” at index 3, when starting at index 4" do
    # Starting at index 4 rules out the later match at index 5.
    @string.rindex("lö", 4).should_equal(3)
  end
end
@@ -0,0 +1,25 @@
1
+ # contents: Specification of String#squeeze.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# String#squeeze on a string that holds no characters at all.
context "An empty string" do
  setup do
    @string = u""
  end

  specify "should return an empty string after squeezing anything" do
    # This is the specification of #squeeze, so #squeeze (not #delete) is the
    # method that has to be exercised here.
    @string.squeeze("whatever").should_be_empty
  end
end
16
+
17
# String#squeeze without arguments on a string with one run of repeated
# characters (“ll”).
context "The string “hëllö”" do
  setup do
    @string = u"hëllö"
  end

  # #squeeze is called without a character set, so every run of repeated
  # characters is collapsed; in this string that is the doubled ‘l’.
  specify "should return “hëlö” after squeezing all repeated characters" do
    @string.squeeze.should_equal "hëlö"
  end
end
@@ -0,0 +1,54 @@
1
+ # contents: Specification of String#to_i.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# String#to_i on a string that holds no characters at all.
context "An empty string" do
  setup do
    @string = u""
  end

  specify "should raise an ArgumentError when sent #to_i with an illegal base" do
    # Each listed base has to be handed to #to_i itself so that all six of
    # them are exercised, not just base 1.
    # NOTE(review): core String#to_i accepts base 0 (prefix auto-detection);
    # confirm that the UTF-8 implementation is really meant to reject it.
    [-2, -1, 0, 1, 37, 38].each{ |base| proc{ @string.to_i(base) }.should_raise ArgumentError }
  end

  specify "should return 0 when sent #to_i, using any legal base" do
    @string.to_i.should_equal 0
    2.upto(36){ |base| @string.to_i(base).should_equal 0 }
  end
end
21
+
22
# String#to_i on the single ASCII digit “1”, which reads as 1 in every
# base from 2 to 36.
context "The string “1”" do
  setup do
    @string = u("1")
  end

  specify "should return 1 when sent #to_i, using any legal base" do
    @string.to_i.should_equal(1)
    (2..36).each do |base|
      @string.to_i(base).should_equal(1)
    end
  end
end
32
+
33
# String#to_i on a non-ASCII decimal digit: “١” is ARABIC-INDIC DIGIT ONE
# (U+0661), so the expected value is 1.  The context name shows the same
# character as the string under test.
context "The string “١”" do
  setup do
    @string = u"١"
  end

  specify "should return 1 when sent #to_i, using any legal base" do
    @string.to_i.should_equal 1
    2.upto(36){ |base| @string.to_i(base).should_equal 1 }
  end
end
43
+
44
+ =begin
45
+ context "The string “ⅷ”" do
46
+ setup do
47
+ @string = u"ⅷ"
48
+ end
49
+
50
+ specify "should return 8 when sent #to_i, using base 10" do
51
+ @string.to_i.should_equal 8
52
+ end
53
+ end
54
+ =end
@@ -0,0 +1,39 @@
1
+ # contents: Specification of String#tr.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# String#tr on a string that holds no characters at all.
context "An empty string" do
  setup do
    @string = u("")
  end

  specify "should stay the same for any translation" do
    translated = @string.tr("abc", "def")
    translated.should_be_empty
  end
end
16
+
17
# String#tr with non-ASCII characters in the source set, both listed
# one by one and mixed with an ASCII range.
context "The string “äbcdë”" do
  setup do
    @string = u("äbcdë")
  end

  specify "should return the string “abcde” when ‘ä’ and ‘ë’ are translated to ‘a’ and ‘e’" do
    @string.tr("äë", "ae").should_equal("abcde")
  end

  specify "should return the string “ëëëëë” when “a-zäë” are translated to ‘ë’" do
    # The replacement set is a single character, which every character of
    # the source set is expected to map to.
    @string.tr("a-zäë", "ë").should_equal("ëëëëë")
  end
end
30
+
31
# String#tr with a non-ASCII range as the replacement set.
context "The string “aaaaa”" do
  setup do
    @string = u("aaaaa")
  end

  # NOTE(review): core String#tr pairs the first source character with the
  # first replacement character, which would turn ‘a’ into ‘ä’ rather than
  # ‘ë’ — confirm that “ëëëëë” is the result this library intends.
  specify "should return the string “ëëëëë” when “a” is translated to ‘ä-ë’" do
    @string.tr("a", "ä-ë").should_equal("ëëëëë")
  end
end
data/tests/foldcase.rb ADDED
@@ -0,0 +1,28 @@
1
+ # contents: Tests for String#foldcase.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'tests/unicodedatatestbase'
6
+ require 'encoding/character/utf-8'
7
+
8
# Checks String#foldcase against the mappings listed in the Unicode data
# file CaseFolding.txt, which is located (and, if necessary, downloaded) by
# UnicodeDataTestBase#open_data_file.
class TC_StringFoldcase < Test::Unit::TestCase
  include UnicodeDataTestBase

  # Positions of the fields used from a data line once it has been split
  # on '; '.
  Code, Status, Mapping = (0..2).to_a

  # Folds every code point that has an applicable entry in the data file and
  # compares the result with the mapping given there.
  #
  # Raises RuntimeError for a data line that does not consist of exactly four
  # '; '-separated fields.
  def test_foldcase
    open_data_file('CaseFolding.txt') do |file|
      file.each_line do |line|
        # Comment lines and blank lines carry no mapping.
        next if line =~ /^#/
        next if line =~ /^\s*$/
        fields = line.split('; ')
        raise "#{line}: Wrong number of fields; #{fields.size} instead of 4." unless fields.size == 4
        # Entries with status 'S' or 'T' (the simple and the Turkic-specific
        # foldings in CaseFolding.txt's legend) are not checked.
        next if fields[Status] == 'S' || fields[Status] == 'T'
        # The mapping is a space-separated list of hexadecimal code points.
        numbers = fields[Mapping].split(' ').map{ |s| s.hex }
        assert_equal(numbers.pack('U*'), u([fields[Code].hex].pack('U')).foldcase)
      end
    end
  end
end
@@ -0,0 +1,101 @@
1
+ # contents: Tests for String#normalize.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'ostruct'
6
+ require 'tests/unicodedatatestbase'
7
+ require 'encoding/character/utf-8'
8
+
9
# Runs the entries of the Unicode data file NormalizationTest.txt against
# String#normalize.
#
# The data file is read sequentially by test_part_0 through test_part_3,
# which share one open file (Data.file).  test_part_0 opens it and
# test_part_3 closes it, so the later parts only work after test_part_0 has
# run.  Setting the environment variable "line" to a line number restricts
# the checks to that single line of the data file.
class TC_StringUnicodeNormalize < Test::Unit::TestCase
  include UnicodeDataTestBase

  # State shared between the test methods:
  #   wanted_line:: the only line to check, or 0 to check every line
  #   line::        number of the line read most recently
  #   file::        the open data file (set by test_part_0)
  Data = OpenStruct.new
  Data.wanted_line = ENV['line'] ? ENV['line'].to_i : 0
  Data.line = 0

  # Opens the data file, reads past the “@Part0” marker without checking
  # anything and then checks every entry up to the “@Part1” marker.
  def test_part_0
    Data.file = open_data_file('NormalizationTest.txt')
    read_lines_until(/^@Part0/)
    read_lines_until(/^@Part1/){ |columns| test_columns(columns) }
  end

  # Checks every entry up to the “@Part2” marker.
  def test_part_1
    read_lines_until(/^@Part2/){ |columns| test_columns(columns) }
  end

  # Checks every entry up to the “@Part3” marker.
  def test_part_2
    read_lines_until(/^@Part3/){ |columns| test_columns(columns) }
  end

  # Checks the remaining entries and closes the data file.
  def test_part_3
    read_lines_until(:last){ |columns| test_columns(columns) }
    Data.file.close
  end

private

  # Feeds lines from the data file to deal_with_line, either up to the end of
  # the file (+line+ is :last) or up to, but not including, the first line
  # that matches the regular expression +line+.  The matching line itself is
  # consumed and counted.
  #
  # Raises RuntimeError if the file ends before +line+ has been matched.
  def read_lines_until(line = :last, &block)
    if line == :last
      until Data.file.eof?
        deal_with_line(Data.file.gets, &block)
      end
    else
      # At end of file gets returns nil, which does not match +line+ either,
      # so the loop body is entered once more and raises.
      while (got_line = Data.file.gets) !~ line
        raise "unexpected end of file while looking for #{line}" unless got_line
        deal_with_line(got_line, &block)
      end
      Data.line += 1
    end
  end

  # Counts +line+ and, unless it is a comment or is filtered out by
  # Data.wanted_line, yields its six ';'-separated columns to the block.
  #
  # Raises RuntimeError for a non-comment line that does not have six columns.
  def deal_with_line(line)
    Data.line += 1
    return if line[0] == ?#
    columns = line.split(';', 6)
    return if columns.length == 0
    raise "#{Data.file}:#{Data.line}: Format of line does not conform to standard" unless columns.length == 6
    return if Data.wanted_line != 0 and Data.line != Data.wanted_line
    yield columns if block_given?
  end

  # Formats +string+ as space-separated, four-digit (or longer) upper-case
  # hexadecimal code points, the notation used in the data file.
  def encode(string)
    string.unpack('U*').map{ |c| '%04X' % c }.join(' ')
  end

  # Converts the first five columns from hexadecimal code-point lists into
  # strings and runs all normalization checks on them.  An entry with an
  # empty column is skipped entirely.
  def test_columns(columns)
    catch :skip do
      strings = columns[0..4].map do |c|
        s = u(c.split(' ').map{ |i| i.to_i(16) }.pack("U*"));
        throw :skip if s.empty?
        s
      end
      # Each entry is [mode, compat, index of the column that holds the
      # expected result].
      [ [:nfd, false, 2], [:nfd, true, 4],
        [:nfc, false, 1], [:nfc, true, 3],
        [:nfkd, true, 4],
        [:nfkc, true, 3] ].each do |mode, compat, expected|
        test_normalization(columns, strings, mode, compat, expected)
      end
    end
  end

  # Selects the source columns to normalize for one table entry: indices 0–2
  # when +compat+ is false, indices 3–4 when it is true, and all five for the
  # compatibility modes (:nfkc and :nfkd).
  def test_normalization(columns, strings, mode, compat, expected)
    mode_is_compat = (mode == :nfkc || mode == :nfkd)
    if mode_is_compat || !compat
      0.upto(2){ |i| test_one_normalization(columns, strings, mode, compat, expected, i, i + 1) }
    end
    if mode_is_compat || compat
      # file_column is the 1-based column number shown in failure messages,
      # exactly as for the first three columns above.
      3.upto(4){ |i| test_one_normalization(columns, strings, mode, compat, expected, i, i + 1) }
    end
  end

  # Normalizes strings[column] with +mode+ and fails the test, naming the
  # line and +file_column+ of the data file, unless the result is eql? to
  # strings[expected].
  def test_one_normalization(columns, strings, mode, compat, expected, column, file_column)
    normalized = strings[column].normalize(mode)
    unless normalized.eql? strings[expected]
      m = mode.to_s.upcase
      flunk <<EOM
#{Data.line}:#{file_column}: #{m} normalization failed for #{columns[5].chomp.sub(/\s+#\s+\([^)]+\) /, "")}.
#{m}(#{columns[column]}) = #{columns[expected]}, not #{encode(normalized)} (#{normalized.inspect}).
EOM
    end
  end
end
@@ -0,0 +1,45 @@
1
+ # contents: Auxiliary classes and methods for Test::Unit based tests.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'test/unit'
6
+
7
# Mixin for tests that read files of the Unicode Character Database.
module UnicodeDataTestBase
  # Location the data files are downloaded from.
  UnidataBase = 'http://www.unicode.org/Public/UNIDATA/'

  # Opens +file+ from the directory "data" next to this source file (the
  # directory is created if it does not exist yet).  The block, if any, is
  # passed on to File.open; the result of File.open is returned.
  #
  # If the file is missing, the user is asked on the terminal whether it
  # should be downloaded from UnidataBase.  After a successful download the
  # file is opened again; without Readline, or on any answer other than
  # “yes”, the process exits with status 1.
  def open_data_file(file, &block)
    dir = File.join(File.dirname(__FILE__), 'data')
    begin
      Dir.mkdir(dir)
    rescue Errno::EEXIST
      # The directory is already there, which is all that is needed.
    end
    path = File.join(dir, file)
    begin
      File.open(path, &block)
    rescue Errno::ENOENT
      url = UnidataBase + file
      print <<EOM
#{file} is missing. However, it can easily be downloaded at
#{url}.
EOM
      # LoadError is not a StandardError, so a rescue modifier would not
      # catch it; it has to be named explicitly.
      begin
        require 'readline'
      rescue LoadError
        exit 1
      end
      print <<EOM
If you would like, I can download and install it for you.
If so, please type “yes”:
EOM
      exit 1 unless Readline.readline == 'yes'
      puts "OK, trying to fetch #{file} for you…"
      require 'open-uri'
      # Stays 0 when the server does not announce a content length.
      length = 0
      # Opening through the URI object works with every version of open-uri,
      # including those where Kernel#open no longer accepts URLs.
      URI.parse(url).open(
           :content_length_proc => proc{ |size| length = size if size && size > 0 },
           # Show a percentage when the total is known and the number of
           # bytes received otherwise.  0 is truthy in Ruby, so the length
           # has to be compared explicitly to avoid dividing by zero.
           :progress_proc => proc{ |size| print(length > 0 ? "\r%3d%%" % (100 * size / length) : "\r#{size}") }) do |remote|
        File.open(path, 'w') do |local|
          local.write remote.read(8192) until remote.eof?
        end
      end
      puts "\nAh, finally done. I’ll try to open #{file} again now."
      retry
    end
  end
end
metadata ADDED
@@ -0,0 +1,112 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: character-encodings
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.2.0
7
+ date: 2006-07-27 00:00:00 +02:00
8
+ summary: A pluggable character-encoding library
9
+ require_paths:
10
+ - lib
11
+ email: now@bitwi.se
12
+ homepage: http://git.bitwi.se/?p=ruby-character-encodings.git;a=summary
13
+ rubyforge_project:
14
+ description: A pluggable character-encoding library
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: false
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Nikolai Weibull
31
+ files:
32
+ - README
33
+ - Rakefile
34
+ - lib/encoding
35
+ - lib/encoding/character
36
+ - lib/encoding/character/utf-8.rb
37
+ - specifications/aref.rb
38
+ - specifications/count.rb
39
+ - specifications/delete.rb
40
+ - specifications/each_char.rb
41
+ - specifications/index.rb
42
+ - specifications/insert.rb
43
+ - specifications/length.rb
44
+ - specifications/rindex.rb
45
+ - specifications/squeeze.rb
46
+ - specifications/to_i.rb
47
+ - specifications/tr.rb
48
+ - ext/encoding/character/unicode/codepoint.c
49
+ - ext/encoding/character/utf-8/break.c
50
+ - ext/encoding/character/utf-8/decompose.c
51
+ - ext/encoding/character/utf-8/properties.c
52
+ - ext/encoding/character/utf-8/rb_utf_aref.c
53
+ - ext/encoding/character/utf-8/rb_utf_aset.c
54
+ - ext/encoding/character/utf-8/rb_utf_casecmp.c
55
+ - ext/encoding/character/utf-8/rb_utf_chomp.c
56
+ - ext/encoding/character/utf-8/rb_utf_chop.c
57
+ - ext/encoding/character/utf-8/rb_utf_collate.c
58
+ - ext/encoding/character/utf-8/rb_utf_count.c
59
+ - ext/encoding/character/utf-8/rb_utf_delete.c
60
+ - ext/encoding/character/utf-8/rb_utf_downcase.c
61
+ - ext/encoding/character/utf-8/rb_utf_each_char.c
62
+ - ext/encoding/character/utf-8/rb_utf_foldcase.c
63
+ - ext/encoding/character/utf-8/rb_utf_hex.c
64
+ - ext/encoding/character/utf-8/rb_utf_index.c
65
+ - ext/encoding/character/utf-8/rb_utf_insert.c
66
+ - ext/encoding/character/utf-8/rb_utf_internal_tr.c
67
+ - ext/encoding/character/utf-8/rb_utf_justify.c
68
+ - ext/encoding/character/utf-8/rb_utf_length.c
69
+ - ext/encoding/character/utf-8/rb_utf_lstrip.c
70
+ - ext/encoding/character/utf-8/rb_utf_normalize.c
71
+ - ext/encoding/character/utf-8/rb_utf_oct.c
72
+ - ext/encoding/character/utf-8/rb_utf_reverse.c
73
+ - ext/encoding/character/utf-8/rb_utf_rindex.c
74
+ - ext/encoding/character/utf-8/rb_utf_rstrip.c
75
+ - ext/encoding/character/utf-8/rb_utf_squeeze.c
76
+ - ext/encoding/character/utf-8/rb_utf_strip.c
77
+ - ext/encoding/character/utf-8/rb_utf_to_i.c
78
+ - ext/encoding/character/utf-8/rb_utf_tr.c
79
+ - ext/encoding/character/utf-8/rb_utf_upcase.c
80
+ - ext/encoding/character/utf-8/unicode.c
81
+ - ext/encoding/character/utf-8/utf.c
82
+ - ext/encoding/character/utf-8/rb_utf_internal_bignum.c
83
+ - ext/encoding/character/utf-8/private.h
84
+ - ext/encoding/character/utf-8/rb_includes.h
85
+ - ext/encoding/character/utf-8/rb_methods.h
86
+ - ext/encoding/character/utf-8/rb_utf_internal_tr.h
87
+ - ext/encoding/character/utf-8/unicode.h
88
+ - ext/encoding/character/utf-8/rb_utf_internal_bignum.h
89
+ - ext/encoding/character/utf-8/data/break.h
90
+ - ext/encoding/character/utf-8/data/character-tables.h
91
+ - ext/encoding/character/utf-8/data/compose.h
92
+ - ext/encoding/character/utf-8/data/decompose.h
93
+ - ext/encoding/character/utf-8/extconf.rb
94
+ - ext/encoding/character/utf-8/data/generate-unicode-data.rb
95
+ - ext/encoding/character/utf-8/depend
96
+ - tests/foldcase.rb
97
+ - tests/normalize.rb
98
+ - tests/unicodedatatestbase.rb
99
+ test_files: []
100
+
101
+ rdoc_options: []
102
+
103
+ extra_rdoc_files: []
104
+
105
+ executables: []
106
+
107
+ extensions:
108
+ - ext/encoding/character/utf-8/extconf.rb
109
+ requirements: []
110
+
111
+ dependencies: []
112
+