character-encodings 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. data/README +26 -0
  2. data/Rakefile +157 -0
  3. data/ext/encoding/character/unicode/codepoint.c +48 -0
  4. data/ext/encoding/character/utf-8/break.c +38 -0
  5. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  6. data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
  7. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  8. data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
  9. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
  10. data/ext/encoding/character/utf-8/decompose.c +476 -0
  11. data/ext/encoding/character/utf-8/depend +64 -0
  12. data/ext/encoding/character/utf-8/extconf.rb +47 -0
  13. data/ext/encoding/character/utf-8/private.h +68 -0
  14. data/ext/encoding/character/utf-8/properties.c +1061 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +18 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  19. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  20. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  22. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  23. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  24. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  25. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  26. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  27. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  28. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  29. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  30. data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
  31. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  35. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  36. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  37. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  38. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  39. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  40. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  41. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  43. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  44. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  45. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  46. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  47. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  48. data/ext/encoding/character/utf-8/unicode.c +319 -0
  49. data/ext/encoding/character/utf-8/unicode.h +208 -0
  50. data/ext/encoding/character/utf-8/utf.c +1332 -0
  51. data/lib/encoding/character/utf-8.rb +201 -0
  52. data/specifications/aref.rb +45 -0
  53. data/specifications/count.rb +29 -0
  54. data/specifications/delete.rb +25 -0
  55. data/specifications/each_char.rb +28 -0
  56. data/specifications/index.rb +35 -0
  57. data/specifications/insert.rb +67 -0
  58. data/specifications/length.rb +45 -0
  59. data/specifications/rindex.rb +52 -0
  60. data/specifications/squeeze.rb +25 -0
  61. data/specifications/to_i.rb +54 -0
  62. data/specifications/tr.rb +39 -0
  63. data/tests/foldcase.rb +28 -0
  64. data/tests/normalize.rb +101 -0
  65. data/tests/unicodedatatestbase.rb +45 -0
  66. metadata +112 -0
@@ -0,0 +1,201 @@
1
+ # contents: UTF-8 String methods.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8/utf8'
6
+
7
+ # TODO: Rework this to use a dispatch object instead, so that the encoding can
8
+ # be changed on the fly.
9
+ # TODO: Add String#encoding.
10
# String methods that work on UTF-8 characters.
#
# Each public method shadows the String method of the same name and forwards
# to a function on Encoding::Character::UTF8, passing the receiver as the
# first argument followed by the caller's arguments.  The module is attached
# to individual strings with String#+@ or Kernel#u.
module Encoding::Character::UTF8::Methods
  # Defines "<method>!" as: compute <method>, then replace the receiver's
  # contents with the result.  The generated method returns whatever
  # String#replace returns.
  def self.def_thunk_replacing_variant(method)
    define_method(:"#{method}!") do
      replace(send(method))
    end
  end

  def <=>(other)
    Encoding::Character::UTF8.collate(self, other)
  end

  def [](*args)
    Encoding::Character::UTF8.aref(self, *args)
  end

  # Same operation as #[].
  def slice(*args)
    Encoding::Character::UTF8.aref(self, *args)
  end

  def []=(*args)
    Encoding::Character::UTF8.aset(self, *args)
  end

  def casecmp(other)
    Encoding::Character::UTF8.casecmp(self, other)
  end

  def center(*args)
    Encoding::Character::UTF8.center(self, *args)
  end

  def chomp(*args)
    Encoding::Character::UTF8.chomp(self, *args)
  end

  def chomp!(*args)
    Encoding::Character::UTF8.chomp!(self, *args)
  end

  def chop
    Encoding::Character::UTF8.chop(self)
  end

  def chop!
    Encoding::Character::UTF8.chop!(self)
  end

  def count(*args)
    Encoding::Character::UTF8.count(self, *args)
  end

  def delete(*args)
    Encoding::Character::UTF8.delete(self, *args)
  end

  def delete!(*args)
    Encoding::Character::UTF8.delete!(self, *args)
  end

  def downcase
    Encoding::Character::UTF8.downcase(self)
  end
  def_thunk_replacing_variant :downcase

  def each_char(&block)
    Encoding::Character::UTF8.each_char(self, &block)
  end

  def index(*args)
    Encoding::Character::UTF8.index(self, *args)
  end

  def insert(index, other)
    Encoding::Character::UTF8.insert(self, index, other)
  end

  def length
    Encoding::Character::UTF8.length(self)
  end

  def lstrip
    Encoding::Character::UTF8.lstrip(self)
  end

  def lstrip!
    Encoding::Character::UTF8.lstrip!(self)
  end

  def normalize(*args)
    Encoding::Character::UTF8.normalize(self, *args)
  end

  def rindex(*args)
    Encoding::Character::UTF8.rindex(self, *args)
  end

  def rstrip
    Encoding::Character::UTF8.rstrip(self)
  end

  def rstrip!
    Encoding::Character::UTF8.rstrip!(self)
  end

  def reverse
    Encoding::Character::UTF8.reverse(self)
  end
  def_thunk_replacing_variant :reverse

  def squeeze
    Encoding::Character::UTF8.squeeze(self)
  end

  def squeeze!
    Encoding::Character::UTF8.squeeze!(self)
  end

  def strip
    Encoding::Character::UTF8.strip(self)
  end

  def strip!
    Encoding::Character::UTF8.strip!(self)
  end

  def to_i(*args)
    Encoding::Character::UTF8.to_i(self, *args)
  end

  def tr(from, to)
    Encoding::Character::UTF8.tr(self, from, to)
  end

  # Replaces the receiver's contents with the result of #tr.
  def tr!(from, to)
    replace(tr(from, to))
  end

  def tr_s(from, to)
    Encoding::Character::UTF8.tr_s(self, from, to)
  end

  # Replaces the receiver's contents with the result of #tr_s.
  def tr_s!(from, to)
    replace(tr_s(from, to))
  end

  # The built-in String#inspect output, prefixed with "u".
  def inspect
    "u#{_inspect}"
  end

  def ljust(*args)
    Encoding::Character::UTF8.ljust(self, *args)
  end

  def rjust(*args)
    Encoding::Character::UTF8.rjust(self, *args)
  end

  def upcase
    Encoding::Character::UTF8.upcase(self)
  end
  def_thunk_replacing_variant :upcase

  # Returns a new string with the first character upcased and the remaining
  # characters downcased.
  #
  # An empty receiver has no first character to slice off, so it is answered
  # with an empty copy instead of calling a case method on a nil slice.  Both
  # pieces are cased through Encoding::Character::UTF8 directly, so the
  # result does not depend on whether #[] returns strings that carry these
  # methods.
  def capitalize
    return dup if empty?

    first = self[0, 1]
    # NOTE(review): assumes a range past the last character yields "" or nil,
    # as the built-in String#[] does; nil is treated as "nothing left".
    remainder = self[1..-1] || ""
    Encoding::Character::UTF8.upcase(first) +
      Encoding::Character::UTF8.downcase(remainder)
  end
  def_thunk_replacing_variant :capitalize

  def foldcase
    Encoding::Character::UTF8.foldcase(self)
  end
  def_thunk_replacing_variant :foldcase

private

  # The original String#inspect, captured so #inspect above can still reach
  # it after being shadowed by this module.
  Inspect = String.instance_method(:inspect)

  def _inspect
    Inspect.bind(self).call
  end
end
190
+
191
class String
  # Unary plus: mixes the UTF-8 character methods into this string and
  # returns the result of the extend call.
  def +@
    extend(Encoding::Character::UTF8::Methods)
  end
end
196
+
197
module Kernel
  # Mixes the UTF-8 character methods into +str+ and returns the result of
  # the extend call, so that u"text" reads like a string prefix.
  def u(str)
    utf8_methods = Encoding::Character::UTF8::Methods
    str.extend(utf8_methods)
  end
end
@@ -0,0 +1,45 @@
1
+ # contents: Specification of String#[].
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# Slicing an empty string with an (index, length) pair.
context("An empty string") do
  setup { @string = u("") }

  specify("should return nil when sent #\\[\\], given an index of 0 and a negative length") do
    [-10, -2, -1].each { |negative| @string[0, negative].should_be(nil) }
  end

  specify("should contain the empty string at index 0, given any non-negative length") do
    [0, 1, 2, 10].each { |count| @string[0, count].should_equal("") }
  end

  specify("should return nil when sent #\\[\\], given any non-zero index and any length") do
    lengths = [-10, -2, -1, 0, 1, 2, 10]
    [-10, -2, -1, 1, 2, 10].each do |start|
      lengths.each { |count| @string[start, count].should_be(nil) }
    end
  end
end

# Slicing a string that contains multi-byte characters.
context("The string “hëllö”") do
  setup { @string = u("hëllö") }

  specify("should contain the string “lö” at index 3") do
    @string[3, 2].should_equal("lö")
  end

  specify("should contain the string “hë” at index 0") do
    @string[0, 2].should_equal("hë")
  end
end
@@ -0,0 +1,29 @@
1
+ # contents: Tests for String#count method.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# Counting characters in an empty string.
context("An empty string") do
  setup { @string = u("") }

  specify("should return a count of zero") do
    @string.count("whatever").should_be(0)
  end
end
16
+
17
# Counting in a string that contains exactly one ‘l’.
context "A string containing one ‘l’" do
  setup do
    @string = u"helo"
  end

  specify "should return a count of one given “l”" do
    @string.count("l").should_be 1
  end

  # The two arguments are character sets; ‘l’ is the only character that
  # appears in both “helo” and “wrld”.
  specify "should return a count of one given the sets “helo” and “wrld”, which share only ‘l’" do
    @string.count("helo", "wrld").should_be 1
  end
end
@@ -0,0 +1,25 @@
1
+ # contents: Specification for String#delete.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# Deleting from an empty string.
context("An empty string") do
  setup { @string = u("") }

  specify("should return an empty string after deleting anything") do
    @string.delete("whatever").should_be_empty
  end
end

# Deleting a multi-byte character.
context("The string “hëllö”") do
  setup { @string = u("hëllö") }

  specify("should return “hëll” after deleting all ‘ö’’s") do
    remaining = @string.delete("ö")
    remaining.should_equal("hëll")
  end
end
@@ -0,0 +1,28 @@
1
+ # contents: Specification for String#each_char.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# Iterating over the characters of an empty string.
context("An empty string") do
  setup { @string = u("") }

  specify("shouldn’t yield any characters") do
    yields = 0
    @string.each_char { |_char| yields += 1 }
    yields.should_be(0)
  end
end
18
+
19
# Iterating over the characters of a string with multi-byte characters.
context "The string “hëllö”" do
  setup do
    @string = u"hëllö"
  end

  # Collect first and compare afterwards: asserting inside the block would
  # pass without checking anything if each_char yielded too few characters
  # or none at all.
  specify "should yield five characters" do
    yielded = []
    @string.each_char{ |c| yielded << c }
    yielded.should_equal ['h', 'ë', 'l', 'l', 'ö']
  end
end
@@ -0,0 +1,35 @@
1
+ # contents: Specification of String#index.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# Searching an empty string.
context "An empty string" do
  setup do
    @string = u""
  end

  specify "should contain the empty string at index 0" do
    @string.index("").should_equal 0
  end

  specify "shouldn’t contain any string at an index > 0" do
    @string.index("", 1).should_be nil
  end

  # Kept apart from the spec above: -1 is not "> 0", so it gets its own
  # accurately named example.
  specify "shouldn’t contain any string at a negative index" do
    @string.index("", -1).should_be nil
  end
end
21
+
22
# Searching a string that contains multi-byte characters; the expected
# positions are the ones the original examples assert.
context("The string “hëllö”") do
  setup { @string = u("hëllö") }

  specify("should contain the string “lö” at index 3") do
    needle = "lö"
    @string.index(needle).should_equal(3)
    @string.index(needle, 3).should_equal(3)
  end

  specify("should contain the string “hë” at index 0") do
    @string.index("hë").should_equal(0)
  end
end
@@ -0,0 +1,67 @@
1
+ # contents: Specification of String#insert.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# Inserting into an empty string.
context("An empty string") do
  setup { @string = u("") }

  specify("should be empty after insertion of an empty string at index 0") do
    @string.insert(0, "")
    @string.should_be_empty
  end

  specify("should raise an IndexError if inserting anything beyond index 0") do
    out_of_range = proc { @string.insert(1, "") }
    out_of_range.should_raise(IndexError)
  end

  specify("should be non-empty after insertion of any non-empty string") do
    @string.insert(0, "a")
    @string.should_not_be_empty
  end

  specify("should be equal to the string inserted") do
    inserted = "äbc"
    @string.insert(0, inserted)
    @string.should_equal(inserted)
  end
end
32
+
33
# Inserting into the three-character string “hëö” at positive and negative
# indexes.  Each positive index is paired with the negative index that is
# expected to give the same result.
context "The string “hëö”" do
  setup do
    @string = u"hëö"
  end

  specify "should equal the string “hëllö” after inserting “ll” at index 2" do
    @string.insert(2, "ll")
    @string.should_equal "hëllö"
  end

  specify "should equal the string “hëllö” after inserting “ll” at index -2" do
    @string.insert(-2, "ll")
    @string.should_equal "hëllö"
  end

  specify "should equal the string “hëöll” after inserting “ll” at index 3" do
    @string.insert(3, "ll")
    @string.should_equal "hëöll"
  end

  specify "should equal the string “hëöll” after inserting “ll” at index -1" do
    @string.insert(-1, "ll")
    @string.should_equal "hëöll"
  end

  specify "should equal the string “llhëö” after inserting “ll” at index 0" do
    @string.insert(0, "ll")
    @string.should_equal "llhëö"
  end

  # Index -4 is one position before the first of the three characters, which
  # is expected to prepend, as with the built-in String#insert.
  specify "should equal the string “llhëö” after inserting “ll” at index -4" do
    @string.insert(-4, "ll")
    @string.should_equal "llhëö"
  end
end
@@ -0,0 +1,45 @@
1
+ # contents: String#length specification.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# Length of an empty string.
context("An empty string") do
  setup { @string = u("") }

  specify("should return 0 when sent #length") do
    @string.length.should_equal(0)
  end
end

# Length is expected in characters: “hëllö” counts as five.
context("The string “hëllö”") do
  setup { @string = u("hëllö") }

  specify("should return 5 when sent #length") do
    @string.length.should_equal(5)
  end
end

# The expected length of 11 includes the NUL and the characters after it.
context("The string “hëllö\0agäin” with an embedded NUL-byte") do
  setup { @string = u("hëllö\0agäin") }

  specify("should return 11 when sent #length") do
    @string.length.should_equal(11)
  end
end

# The trailing \303 byte is expected not to add to the length.
context("The string “hëllö\0agäin” with a partial character at the end") do
  setup { @string = u("hëllö\0agäin\303") }

  specify("should return 11 when sent #length") do
    @string.length.should_equal(11)
  end
end