character-encodings 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. data/README +26 -0
  2. data/Rakefile +157 -0
  3. data/ext/encoding/character/unicode/codepoint.c +48 -0
  4. data/ext/encoding/character/utf-8/break.c +38 -0
  5. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  6. data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
  7. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  8. data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
  9. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
  10. data/ext/encoding/character/utf-8/decompose.c +476 -0
  11. data/ext/encoding/character/utf-8/depend +64 -0
  12. data/ext/encoding/character/utf-8/extconf.rb +47 -0
  13. data/ext/encoding/character/utf-8/private.h +68 -0
  14. data/ext/encoding/character/utf-8/properties.c +1061 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +18 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  19. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  20. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  22. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  23. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  24. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  25. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  26. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  27. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  28. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  29. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  30. data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
  31. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  35. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  36. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  37. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  38. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  39. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  40. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  41. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  43. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  44. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  45. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  46. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  47. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  48. data/ext/encoding/character/utf-8/unicode.c +319 -0
  49. data/ext/encoding/character/utf-8/unicode.h +208 -0
  50. data/ext/encoding/character/utf-8/utf.c +1332 -0
  51. data/lib/encoding/character/utf-8.rb +201 -0
  52. data/specifications/aref.rb +45 -0
  53. data/specifications/count.rb +29 -0
  54. data/specifications/delete.rb +25 -0
  55. data/specifications/each_char.rb +28 -0
  56. data/specifications/index.rb +35 -0
  57. data/specifications/insert.rb +67 -0
  58. data/specifications/length.rb +45 -0
  59. data/specifications/rindex.rb +52 -0
  60. data/specifications/squeeze.rb +25 -0
  61. data/specifications/to_i.rb +54 -0
  62. data/specifications/tr.rb +39 -0
  63. data/tests/foldcase.rb +28 -0
  64. data/tests/normalize.rb +101 -0
  65. data/tests/unicodedatatestbase.rb +45 -0
  66. metadata +112 -0
@@ -0,0 +1,52 @@
1
+ # contents: Specification of String#rindex.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
+ context "An empty string" do
8
+ setup do
9
+ @string = u""
10
+ end
11
+
12
+ specify "should contain the empty string at index 0" do
13
+ @string.rindex("").should_equal 0
14
+ end
15
+
16
+ =begin
17
+ specify "shouldn’t contain any string at an index > 0" do
18
+ @string.rindex("", 1).should_be nil
19
+ @string.rindex("", -1).should_be nil
20
+ end
21
+ =end
22
+ end
23
+
24
+ context "The string “hëllö”" do
25
+ setup do
26
+ @string = u"hëllö"
27
+ end
28
+
29
+ specify "should contain the string “lö” at index 3" do
30
+ @string.rindex("lö").should_equal 3
31
+ @string.rindex("lö", 3).should_equal 3
32
+ end
33
+
34
+ specify "should contain the string “hë” at index 0" do
35
+ @string.rindex("hë").should_equal 0
36
+ end
37
+ end
38
+
39
+ context "The string “hëllölö”" do
40
+ setup do
41
+ @string = u"hëllölö"
42
+ end
43
+
44
+ specify "should contain the string “lö” at index 5" do
45
+ @string.rindex("lö").should_equal 5
46
+ @string.rindex("lö", 5).should_equal 5
47
+ end
48
+
49
+ specify "should contain the string “lö” at index 3, when starting at index 4" do
50
+ @string.rindex("lö", 4).should_equal 3
51
+ end
52
+ end
@@ -0,0 +1,25 @@
1
+ # contents: Specification of String#squeeze.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
+ context "An empty string" do
8
+ setup do
9
+ @string = u""
10
+ end
11
+
12
+ specify "should return an empty string after squeezing anything" do
13
+ @string.delete("whatever").should_be_empty
14
+ end
15
+ end
16
+
17
+ context "The string “hëllö”" do
18
+ setup do
19
+ @string = u"hëllö"
20
+ end
21
+
22
+ specify "should return “hëlö” after squeezing all ‘ö’’s" do
23
+ @string.squeeze.should_equal "hëlö"
24
+ end
25
+ end
@@ -0,0 +1,54 @@
1
+ # contents: Specification of String#to_i.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
+ context "An empty string" do
8
+ setup do
9
+ @string = u""
10
+ end
11
+
12
+ specify "should raise an ArgumentError when sent #to_i with an illegal base" do
13
+ [-2, -1, 0, 1, 37, 38].each{ |base| proc{ @string.to_i(1) }.should_raise ArgumentError }
14
+ end
15
+
16
+ specify "should return 0 when sent #to_i, using any legal base" do
17
+ @string.to_i.should_equal 0
18
+ 2.upto(36){ |base| @string.to_i(base).should_equal 0 }
19
+ end
20
+ end
21
+
22
+ context "The string “1”" do
23
+ setup do
24
+ @string = u"1"
25
+ end
26
+
27
+ specify "should return 1 when sent #to_i, using any legal base" do
28
+ @string.to_i.should_equal 1
29
+ 2.upto(36){ |base| @string.to_i(base).should_equal 1 }
30
+ end
31
+ end
32
+
33
+ context "The string “٠”" do
34
+ setup do
35
+ @string = u"١"
36
+ end
37
+
38
+ specify "should return 1 when sent #to_i, using any legal base" do
39
+ @string.to_i.should_equal 1
40
+ 2.upto(36){ |base| @string.to_i(base).should_equal 1 }
41
+ end
42
+ end
43
+
44
+ =begin
45
+ context "The string “ⅷ”" do
46
+ setup do
47
+ @string = u"ⅷ"
48
+ end
49
+
50
+ specify "should return 8 when sent #to_i, using base 10" do
51
+ @string.to_i.should_equal 8
52
+ end
53
+ end
54
+ =end
@@ -0,0 +1,39 @@
1
+ # contents: Specification of String#tr.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
+ context "An empty string" do
8
+ setup do
9
+ @string = u""
10
+ end
11
+
12
+ specify "should stay the same for any translation" do
13
+ @string.tr("abc", "def").should_be_empty
14
+ end
15
+ end
16
+
17
+ context "The string “äbcdë”" do
18
+ setup do
19
+ @string = u"äbcdë"
20
+ end
21
+
22
+ specify "should return the string “abcde” when ‘ä’ and ‘ë’ are translated to ‘a’ and ‘e’" do
23
+ @string.tr("äë", "ae").should_equal "abcde"
24
+ end
25
+
26
+ specify "should return the string “ëëëëë” when “a-zäë” are translated to ‘ë’" do
27
+ @string.tr("a-zäë", "ë").should_equal "ëëëëë"
28
+ end
29
+ end
30
+
31
+ context "The string “aaaaa”" do
32
+ setup do
33
+ @string = u"aaaaa"
34
+ end
35
+
36
+ specify "should return the string “ëëëëë” when “a” is translated to ‘ä-ë’" do
37
+ @string.tr("a", "ä-ë").should_equal "ëëëëë"
38
+ end
39
+ end
data/tests/foldcase.rb ADDED
@@ -0,0 +1,28 @@
1
+ # contents: Tests for String#foldcase.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'tests/unicodedatatestbase'
6
+ require 'encoding/character/utf-8'
7
+
8
+ class TC_StringFoldcase < Test::Unit::TestCase
9
+ include UnicodeDataTestBase
10
+
11
+ Code, Status, Mapping = (0..2).to_a
12
+
13
+ def test_foldcase
14
+ open_data_file('CaseFolding.txt') do |file|
15
+ i = 0
16
+ file.each_line do |line|
17
+ i += 1
18
+ next if line =~ /^#/
19
+ next if line =~ /^\s*$/
20
+ fields = line.split('; ')
21
+ raise "#{line}: Wrong number of fields; #{field.size} instead of 4." unless fields.size == 4
22
+ next if fields[Status] == 'S' || fields[Status] == 'T'
23
+ numbers = fields[Mapping].split(' ').map{ |s| s.hex }
24
+ assert_equal(numbers.pack('U*'), u([fields[Code].hex].pack('U')).foldcase)
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,101 @@
1
+ # contents: Tests for String#normalize.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'ostruct'
6
+ require 'tests/unicodedatatestbase'
7
+ require 'encoding/character/utf-8'
8
+
9
+ class TC_StringUnicodeNormalize < Test::Unit::TestCase
10
+ include UnicodeDataTestBase
11
+
12
+ Data = OpenStruct.new
13
+ Data.wanted_line = ENV['line'] ? ENV['line'].to_i : 0
14
+ Data.line = 0
15
+
16
+ def test_part_0
17
+ Data.file = open_data_file('NormalizationTest.txt')
18
+ read_lines_until(/^@Part0/)
19
+ read_lines_until(/^@Part1/){ |columns| test_columns(columns) }
20
+ end
21
+
22
+ def test_part_1
23
+ read_lines_until(/^@Part2/){ |columns| test_columns(columns) }
24
+ end
25
+
26
+ def test_part_2
27
+ read_lines_until(/^@Part3/){ |columns| test_columns(columns) }
28
+ end
29
+
30
+ def test_part_3
31
+ read_lines_until(:last){ |columns| test_columns(columns) }
32
+ Data.file.close
33
+ end
34
+
35
+ private
36
+
37
+ def read_lines_until(line = :last, &block)
38
+ if line == :last
39
+ until Data.file.eof?
40
+ deal_with_line(Data.file.gets, &block)
41
+ end
42
+ else
43
+ while (got_line = Data.file.gets) !~ line
44
+ raise "unexpected end of file while looking for #{line}" unless got_line
45
+ deal_with_line(got_line, &block)
46
+ end
47
+ Data.line += 1
48
+ end
49
+ end
50
+
51
+ def deal_with_line(line)
52
+ Data.line += 1
53
+ return if line[0] == ?#
54
+ columns = line.split(';', 6)
55
+ return if columns.length == 0
56
+ raise "#{Data.file}:#{Data.line}: Format of line does not conform to standard" unless columns.length == 6
57
+ return if Data.wanted_line != 0 and Data.line != Data.wanted_line
58
+ yield columns if block_given?
59
+ end
60
+
61
+ def encode(string)
62
+ string.unpack('U*').map{ |c| '%04X' % c }.join(' ')
63
+ end
64
+
65
+ def test_columns(columns)
66
+ catch :skip do
67
+ strings = columns[0..4].map do |c|
68
+ s = u(c.split(' ').map{ |i| i.to_i(16) }.pack("U*"));
69
+ throw :skip if s.empty?
70
+ s
71
+ end
72
+ [ [:nfd, false, 2], [:nfd, true, 4],
73
+ [:nfc, false, 1], [:nfc, true, 3],
74
+ [:nfkd, true, 4],
75
+ [:nfkc, true, 3] ].each do |mode, compat, expected|
76
+ test_normalization(columns, strings, mode, compat, expected)
77
+ end
78
+ end
79
+ end
80
+
81
+ def test_normalization(columns, strings, mode, compat, expected)
82
+ mode_is_compat = (mode == :nfkc || mode == :nfkd)
83
+ if mode_is_compat || !compat
84
+ 0.upto(2){ |i| test_one_normalization(columns, strings, mode, compat, expected, i, i + 1) }
85
+ end
86
+ if mode_is_compat || compat
87
+ 3.upto(4){ |i| test_one_normalization(columns, strings, mode, compat, expected, i, i) }
88
+ end
89
+ end
90
+
91
+ def test_one_normalization(columns, strings, mode, compat, expected, column, file_column)
92
+ normalized = strings[column].normalize(mode)
93
+ unless normalized.eql? strings[expected]
94
+ m = mode.to_s.upcase
95
+ flunk <<EOM
96
+ #{Data.line}:#{file_column}: #{m} normalization failed for #{columns[5].chomp.sub(/\s+#\s+\([^)]+\) /, "")}.
97
+ #{m}(#{columns[column]}) = #{columns[expected]}, not #{encode(normalized)} (#{normalized.inspect}).
98
+ EOM
99
+ end
100
+ end
101
+ end
@@ -0,0 +1,45 @@
1
+ # contents: Auxiliary classes and methods for Test::Unit based tests.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'test/unit'
6
+
7
+ module UnicodeDataTestBase
8
+ UnidataBase = 'http://www.unicode.org/Public/UNIDATA/'
9
+
10
+ def open_data_file(file, &block)
11
+ dir = File.join(File.dirname(__FILE__), 'data')
12
+ begin
13
+ Dir.mkdir(dir)
14
+ rescue Errno::EEXIST
15
+ end
16
+ path = File.join(dir, file)
17
+ begin
18
+ File.open(path, &block)
19
+ rescue Errno::ENOENT
20
+ url = UnidataBase + file
21
+ print <<EOM
22
+ #{file} is missing. However, it can easily be downloaded at
23
+ #{url}.
24
+ EOM
25
+ require 'readline' rescue exit 1
26
+ print <<EOM
27
+ If you would like, I can download and install it for you.
28
+ If so, please type “yes”:
29
+ EOM
30
+ exit 1 unless Readline.readline == 'yes'
31
+ puts "OK, trying to fetch #{file} for you…"
32
+ require 'open-uri'
33
+ length = 0
34
+ open(url,
35
+ :content_length_proc => proc{ |size| length = size if size and size > 0 },
36
+ :progress_proc => proc{ |size| print(length ? "\r%3d%" % (100 * size / length) : "\r#{size}") }) do |remote|
37
+ File.open(path, 'w') do |local|
38
+ local.write remote.read(8192) until remote.eof?
39
+ end
40
+ end
41
+ puts "\nAh, finally done. I’ll try to open #{file} again now."
42
+ retry
43
+ end
44
+ end
45
+ end
metadata ADDED
@@ -0,0 +1,112 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: character-encodings
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.2.0
7
+ date: 2006-07-27 00:00:00 +02:00
8
+ summary: A pluggable character-encoding library
9
+ require_paths:
10
+ - lib
11
+ email: now@bitwi.se
12
+ homepage: http://git.bitwi.se/?p=ruby-character-encodings.git;a=summary
13
+ rubyforge_project:
14
+ description: A pluggable character-encoding library
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: false
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Nikolai Weibull
31
+ files:
32
+ - README
33
+ - Rakefile
34
+ - lib/encoding
35
+ - lib/encoding/character
36
+ - lib/encoding/character/utf-8.rb
37
+ - specifications/aref.rb
38
+ - specifications/count.rb
39
+ - specifications/delete.rb
40
+ - specifications/each_char.rb
41
+ - specifications/index.rb
42
+ - specifications/insert.rb
43
+ - specifications/length.rb
44
+ - specifications/rindex.rb
45
+ - specifications/squeeze.rb
46
+ - specifications/to_i.rb
47
+ - specifications/tr.rb
48
+ - ext/encoding/character/unicode/codepoint.c
49
+ - ext/encoding/character/utf-8/break.c
50
+ - ext/encoding/character/utf-8/decompose.c
51
+ - ext/encoding/character/utf-8/properties.c
52
+ - ext/encoding/character/utf-8/rb_utf_aref.c
53
+ - ext/encoding/character/utf-8/rb_utf_aset.c
54
+ - ext/encoding/character/utf-8/rb_utf_casecmp.c
55
+ - ext/encoding/character/utf-8/rb_utf_chomp.c
56
+ - ext/encoding/character/utf-8/rb_utf_chop.c
57
+ - ext/encoding/character/utf-8/rb_utf_collate.c
58
+ - ext/encoding/character/utf-8/rb_utf_count.c
59
+ - ext/encoding/character/utf-8/rb_utf_delete.c
60
+ - ext/encoding/character/utf-8/rb_utf_downcase.c
61
+ - ext/encoding/character/utf-8/rb_utf_each_char.c
62
+ - ext/encoding/character/utf-8/rb_utf_foldcase.c
63
+ - ext/encoding/character/utf-8/rb_utf_hex.c
64
+ - ext/encoding/character/utf-8/rb_utf_index.c
65
+ - ext/encoding/character/utf-8/rb_utf_insert.c
66
+ - ext/encoding/character/utf-8/rb_utf_internal_tr.c
67
+ - ext/encoding/character/utf-8/rb_utf_justify.c
68
+ - ext/encoding/character/utf-8/rb_utf_length.c
69
+ - ext/encoding/character/utf-8/rb_utf_lstrip.c
70
+ - ext/encoding/character/utf-8/rb_utf_normalize.c
71
+ - ext/encoding/character/utf-8/rb_utf_oct.c
72
+ - ext/encoding/character/utf-8/rb_utf_reverse.c
73
+ - ext/encoding/character/utf-8/rb_utf_rindex.c
74
+ - ext/encoding/character/utf-8/rb_utf_rstrip.c
75
+ - ext/encoding/character/utf-8/rb_utf_squeeze.c
76
+ - ext/encoding/character/utf-8/rb_utf_strip.c
77
+ - ext/encoding/character/utf-8/rb_utf_to_i.c
78
+ - ext/encoding/character/utf-8/rb_utf_tr.c
79
+ - ext/encoding/character/utf-8/rb_utf_upcase.c
80
+ - ext/encoding/character/utf-8/unicode.c
81
+ - ext/encoding/character/utf-8/utf.c
82
+ - ext/encoding/character/utf-8/rb_utf_internal_bignum.c
83
+ - ext/encoding/character/utf-8/private.h
84
+ - ext/encoding/character/utf-8/rb_includes.h
85
+ - ext/encoding/character/utf-8/rb_methods.h
86
+ - ext/encoding/character/utf-8/rb_utf_internal_tr.h
87
+ - ext/encoding/character/utf-8/unicode.h
88
+ - ext/encoding/character/utf-8/rb_utf_internal_bignum.h
89
+ - ext/encoding/character/utf-8/data/break.h
90
+ - ext/encoding/character/utf-8/data/character-tables.h
91
+ - ext/encoding/character/utf-8/data/compose.h
92
+ - ext/encoding/character/utf-8/data/decompose.h
93
+ - ext/encoding/character/utf-8/extconf.rb
94
+ - ext/encoding/character/utf-8/data/generate-unicode-data.rb
95
+ - ext/encoding/character/utf-8/depend
96
+ - tests/foldcase.rb
97
+ - tests/normalize.rb
98
+ - tests/unicodedatatestbase.rb
99
+ test_files: []
100
+
101
+ rdoc_options: []
102
+
103
+ extra_rdoc_files: []
104
+
105
+ executables: []
106
+
107
+ extensions:
108
+ - ext/encoding/character/utf-8/extconf.rb
109
+ requirements: []
110
+
111
+ dependencies: []
112
+