character-encodings 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. data/README +26 -0
  2. data/Rakefile +157 -0
  3. data/ext/encoding/character/unicode/codepoint.c +48 -0
  4. data/ext/encoding/character/utf-8/break.c +38 -0
  5. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  6. data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
  7. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  8. data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
  9. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
  10. data/ext/encoding/character/utf-8/decompose.c +476 -0
  11. data/ext/encoding/character/utf-8/depend +64 -0
  12. data/ext/encoding/character/utf-8/extconf.rb +47 -0
  13. data/ext/encoding/character/utf-8/private.h +68 -0
  14. data/ext/encoding/character/utf-8/properties.c +1061 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +18 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  19. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  20. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  22. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  23. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  24. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  25. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  26. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  27. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  28. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  29. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  30. data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
  31. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  35. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  36. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  37. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  38. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  39. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  40. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  41. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  43. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  44. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  45. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  46. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  47. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  48. data/ext/encoding/character/utf-8/unicode.c +319 -0
  49. data/ext/encoding/character/utf-8/unicode.h +208 -0
  50. data/ext/encoding/character/utf-8/utf.c +1332 -0
  51. data/lib/encoding/character/utf-8.rb +201 -0
  52. data/specifications/aref.rb +45 -0
  53. data/specifications/count.rb +29 -0
  54. data/specifications/delete.rb +25 -0
  55. data/specifications/each_char.rb +28 -0
  56. data/specifications/index.rb +35 -0
  57. data/specifications/insert.rb +67 -0
  58. data/specifications/length.rb +45 -0
  59. data/specifications/rindex.rb +52 -0
  60. data/specifications/squeeze.rb +25 -0
  61. data/specifications/to_i.rb +54 -0
  62. data/specifications/tr.rb +39 -0
  63. data/tests/foldcase.rb +28 -0
  64. data/tests/normalize.rb +101 -0
  65. data/tests/unicodedatatestbase.rb +45 -0
  66. metadata +112 -0
@@ -0,0 +1,201 @@
1
+ # contents: UTF-8 String methods.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8/utf8'
6
+
7
+ # TODO: Rework this to use a dispatch object instead, so that the encoding can
8
+ # be changed on the fly.
9
+ # TODO: Add String#encoding.
10
# UTF-8 variants of the core String methods.
#
# Nearly every method below is a thin wrapper that forwards to the module
# function of the same name on Encoding::Character::UTF8, passing the receiver
# as the first argument followed by the caller's arguments.  The module is
# mixed into individual strings with #extend (see String#+@ and Kernel#u).
module Encoding::Character::UTF8::Methods
  # Defines <tt>method!</tt> as the in-place counterpart of +method+: it calls
  # +method+ and replaces the receiver's contents with the result.  The return
  # value is whatever #replace returns.
  def self.def_thunk_replacing_variant(method)
    define_method(:"#{method}!") do
      replace(send(method))
    end
  end

  # Compares the receiver with +other+ via UTF8.collate.
  def <=>(other)
    Encoding::Character::UTF8.collate(self, other)
  end

  # Element reference; forwards all arguments to UTF8.aref.
  def [](*args)
    Encoding::Character::UTF8.aref(self, *args)
  end

  # Same as #[].
  def slice(*args)
    Encoding::Character::UTF8.aref(self, *args)
  end

  # Element assignment; forwards all arguments to UTF8.aset.
  def []=(*args)
    Encoding::Character::UTF8.aset(self, *args)
  end

  # Forwards to UTF8.casecmp.
  def casecmp(other)
    Encoding::Character::UTF8.casecmp(self, other)
  end

  # Forwards to UTF8.center.
  def center(*args)
    Encoding::Character::UTF8.center(self, *args)
  end

  # Forwards to UTF8.chomp.
  def chomp(*args)
    Encoding::Character::UTF8.chomp(self, *args)
  end

  # Forwards to UTF8.chomp!.
  def chomp!(*args)
    Encoding::Character::UTF8.chomp!(self, *args)
  end

  # Forwards to UTF8.chop.
  def chop
    Encoding::Character::UTF8.chop(self)
  end

  # Forwards to UTF8.chop!.
  def chop!
    Encoding::Character::UTF8.chop!(self)
  end

  # Forwards to UTF8.count.
  def count(*args)
    Encoding::Character::UTF8.count(self, *args)
  end

  # Forwards to UTF8.delete.
  def delete(*args)
    Encoding::Character::UTF8.delete(self, *args)
  end

  # Forwards to UTF8.delete!.
  def delete!(*args)
    Encoding::Character::UTF8.delete!(self, *args)
  end

  # Forwards to UTF8.downcase.  #downcase! is generated below.
  def downcase
    Encoding::Character::UTF8.downcase(self)
  end
  def_thunk_replacing_variant :downcase

  # Forwards the block to UTF8.each_char.
  def each_char(&block)
    Encoding::Character::UTF8.each_char(self, &block)
  end

  # Forwards to UTF8.index.
  def index(*args)
    Encoding::Character::UTF8.index(self, *args)
  end

  # Forwards to UTF8.insert.
  def insert(index, other)
    Encoding::Character::UTF8.insert(self, index, other)
  end

  # Forwards to UTF8.length.
  def length
    Encoding::Character::UTF8.length(self)
  end

  # Forwards to UTF8.lstrip.
  def lstrip
    Encoding::Character::UTF8.lstrip(self)
  end

  # Forwards to UTF8.lstrip!.
  def lstrip!
    Encoding::Character::UTF8.lstrip!(self)
  end

  # Forwards to UTF8.normalize.
  def normalize(*args)
    Encoding::Character::UTF8.normalize(self, *args)
  end

  # Forwards to UTF8.rindex.
  def rindex(*args)
    Encoding::Character::UTF8.rindex(self, *args)
  end

  # Forwards to UTF8.rstrip.
  def rstrip
    Encoding::Character::UTF8.rstrip(self)
  end

  # Forwards to UTF8.rstrip!.
  def rstrip!
    Encoding::Character::UTF8.rstrip!(self)
  end

  # Forwards to UTF8.reverse.  #reverse! is generated below.
  def reverse
    Encoding::Character::UTF8.reverse(self)
  end
  def_thunk_replacing_variant :reverse

  # Forwards to UTF8.squeeze.
  def squeeze
    Encoding::Character::UTF8.squeeze(self)
  end

  # Forwards to UTF8.squeeze!.
  def squeeze!
    Encoding::Character::UTF8.squeeze!(self)
  end

  # Forwards to UTF8.strip.
  def strip
    Encoding::Character::UTF8.strip(self)
  end

  # Forwards to UTF8.strip!.
  def strip!
    Encoding::Character::UTF8.strip!(self)
  end

  # Forwards to UTF8.to_i.
  def to_i(*args)
    Encoding::Character::UTF8.to_i(self, *args)
  end

  # Forwards to UTF8.tr.
  def tr(from, to)
    Encoding::Character::UTF8.tr(self, from, to)
  end

  # In-place #tr: replaces the receiver's contents with the translated string.
  def tr!(from, to)
    replace(tr(from, to))
  end

  # Forwards to UTF8.tr_s.
  def tr_s(from, to)
    Encoding::Character::UTF8.tr_s(self, from, to)
  end

  # In-place #tr_s: replaces the receiver's contents with the result.
  def tr_s!(from, to)
    replace(tr_s(from, to))
  end

  # The core String#inspect output prefixed with "u".
  def inspect
    "u#{_inspect}"
  end

  # Forwards to UTF8.ljust.
  def ljust(*args)
    Encoding::Character::UTF8.ljust(self, *args)
  end

  # Forwards to UTF8.rjust.
  def rjust(*args)
    Encoding::Character::UTF8.rjust(self, *args)
  end

  # Forwards to UTF8.upcase.  #upcase! is generated below.
  def upcase
    Encoding::Character::UTF8.upcase(self)
  end
  def_thunk_replacing_variant :upcase

  # Returns a new string made of the first character upcased followed by the
  # remaining characters downcased.  #capitalize! is generated below.
  def capitalize
    # An empty string has no first character to split off.  Under core
    # String#[] semantics both self[0] and self[1..-1] are nil in that case
    # (UTF8.aref presumably mirrors this), which would make the expression
    # below raise NoMethodError.  Return an empty copy instead.
    return dup if empty?

    self[0].upcase + self[1..-1].downcase
  end
  def_thunk_replacing_variant :capitalize

  # Forwards to UTF8.foldcase.  #foldcase! is generated below.
  def foldcase
    Encoding::Character::UTF8.foldcase(self)
  end
  def_thunk_replacing_variant :foldcase

  private

  # The core String#inspect, captured unbound so that #inspect above can reach
  # it even though this module overrides #inspect on extended strings.
  Inspect = String.instance_method(:inspect)

  # Invokes the captured core String#inspect on the receiver.
  def _inspect
    Inspect.bind(self).call
  end
end
190
+
191
class String
  # Unary plus: mixes the UTF-8 method set into this very string (not a copy)
  # and returns whatever #extend returns.
  def +@
    extend(Encoding::Character::UTF8::Methods)
  end
end
196
+
197
module Kernel
  # Mixes the UTF-8 method set into +str+ itself and returns the result of
  # that #extend call.  Lets callers write <tt>u"text"</tt>.
  def u(str)
    utf8_methods = Encoding::Character::UTF8::Methods
    str.extend(utf8_methods)
  end
end
@@ -0,0 +1,45 @@
1
+ # contents: Specification of String#[].
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# Slicing an empty string with the two-argument (index, length) form of #[].
context "An empty string" do
  setup { @string = u("") }

  specify "should return nil when sent #\\[\\], given an index of 0 and a negative length" do
    [-10, -2, -1].each { |negative_length| @string[0, negative_length].should_be(nil) }
  end

  specify "should contain the empty string at index 0, given any non-negative length" do
    [0, 1, 2, 10].each { |len| @string[0, len].should_equal("") }
  end

  specify "should return nil when sent #\\[\\], given any non-zero index and any length" do
    lengths = [-10, -2, -1, 0, 1, 2, 10]
    [-10, -2, -1, 1, 2, 10].each do |index|
      lengths.each { |len| @string[index, len].should_be(nil) }
    end
  end
end
32
+
33
# Slicing a string that contains multi-byte characters; indices and lengths
# in the expectations are counted in characters.
context "The string “hëllö”" do
  setup { @string = u("hëllö") }

  specify "should contain the string “lö” at index 3" do
    tail = @string[3, 2]
    tail.should_equal("lö")
  end

  specify "should contain the string “hë” at index 0" do
    head = @string[0, 2]
    head.should_equal("hë")
  end
end
@@ -0,0 +1,29 @@
1
+ # contents: Tests for String#count method.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# Counting characters in an empty string.
context "An empty string" do
  setup { @string = u("") }

  specify "should return a count of zero" do
    occurrences = @string.count("whatever")
    occurrences.should_be(0)
  end
end
16
+
17
# Counting in a string with exactly one ‘l’, with one and with two character
# sets passed to #count.
context "A string containing one ‘l’" do
  setup { @string = u("helo") }

  specify "should return a count of one “l”’s given an “l”" do
    occurrences = @string.count("l")
    occurrences.should_be(1)
  end

  specify "should return a count of one “l”’s given any input" do
    occurrences = @string.count("helo", "wrld")
    occurrences.should_be(1)
  end
end
@@ -0,0 +1,25 @@
1
+ # contents: Specification for String#delete.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# Deleting from an empty string.
context "An empty string" do
  setup { @string = u("") }

  specify "should return an empty string after deleting anything" do
    remainder = @string.delete("whatever")
    remainder.should_be_empty
  end
end
16
+
17
# Deleting a multi-byte character from a string.
context "The string “hëllö”" do
  setup { @string = u("hëllö") }

  specify "should return “hëll” after deleting all ‘ö’’s" do
    remainder = @string.delete("ö")
    remainder.should_equal("hëll")
  end
end
@@ -0,0 +1,28 @@
1
+ # contents: Specification for String#each_char.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# Iterating over the characters of an empty string.
context "An empty string" do
  setup { @string = u("") }

  specify "shouldn’t yield any characters" do
    yielded = 0
    @string.each_char { |_char| yielded += 1 }
    yielded.should_be(0)
  end
end
18
+
19
# Iterating over the characters of a string with multi-byte characters.
context "The string “hëllö”" do
  setup do
    @string = u"hëllö"
  end

  specify "should yield five characters" do
    characters = ['h', 'ë', 'l', 'l', 'ö']
    # Each yielded character must match the next expected one, in order.
    @string.each_char{ |c| c.should_equal characters.shift }
    # Without this check the example would pass vacuously if #each_char
    # yielded fewer than five characters (or none at all).
    characters.should_be_empty
  end
end
@@ -0,0 +1,35 @@
1
+ # contents: Specification of String#index.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# Searching an empty string for the empty string.
context "An empty string" do
  setup do
    @string = u""
  end

  specify "should contain the empty string at index 0" do
    @string.index("").should_equal 0
  end

  # The example covers a start offset on either side of 0 (1 and -1), so the
  # description says "non-zero" rather than "> 0".
  specify "shouldn’t contain any string at any non-zero index" do
    @string.index("", 1).should_be nil
    @string.index("", -1).should_be nil
  end
end
21
+
22
# Searching a string with multi-byte characters; expected positions are
# counted in characters.
context "The string “hëllö”" do
  setup { @string = u("hëllö") }

  specify "should contain the string “lö” at index 3" do
    @string.index("lö").should_equal(3)
    # Same search, starting at the match position itself.
    @string.index("lö", 3).should_equal(3)
  end

  specify "should contain the string “hë” at index 0" do
    @string.index("hë").should_equal(0)
  end
end
@@ -0,0 +1,67 @@
1
+ # contents: Specification of String#insert.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# Inserting into an empty string.
context "An empty string" do
  setup { @string = u("") }

  specify "should be empty after insertion of an empty string at index 0" do
    @string.insert(0, "")
    @string.should_be_empty
  end

  specify "should raise an IndexError if inserting anything beyond index 0" do
    proc { @string.insert(1, "") }.should_raise(IndexError)
  end

  specify "should be non-empty after insertion of any non-empty string" do
    @string.insert(0, "a")
    @string.should_not_be_empty
  end

  specify "should be equal to the string inserted" do
    inserted = "äbc"
    @string.insert(0, inserted)
    @string.should_equal(inserted)
  end
end
32
+
33
# Inserting “ll” into a three-character string at every position, using both
# non-negative and negative indices.  As with core String#insert, a negative
# index is expected to insert *after* the character it designates, so -1
# appends and -(length + 1) prepends.
context "The string “hëö”" do
  setup do
    @string = u"hëö"
  end

  specify "should equal the string “hëllö” after inserting “ll” at index 2" do
    @string.insert(2, "ll")
    @string.should_equal "hëllö"
  end

  # Description corrected: the expected result at index -2 is “hëllö”.
  specify "should equal the string “hëllö” after inserting “ll” at index -2" do
    @string.insert(-2, "ll")
    @string.should_equal "hëllö"
  end

  specify "should equal the string “hëöll” after inserting “ll” at index 3" do
    @string.insert(3, "ll")
    @string.should_equal "hëöll"
  end

  specify "should equal the string “hëöll” after inserting “ll” at index -1" do
    @string.insert(-1, "ll")
    @string.should_equal "hëöll"
  end

  specify "should equal the string “llhëö” after inserting “ll” at index 0" do
    @string.insert(0, "ll")
    @string.should_equal "llhëö"
  end

  # Previously this example called insert(0, ...), duplicating the one above
  # and leaving the most negative valid index untested.
  specify "should equal the string “llhëö” after inserting “ll” at index -4" do
    @string.insert(-4, "ll")
    @string.should_equal "llhëö"
  end
end
@@ -0,0 +1,45 @@
1
+ # contents: String#length specification.
2
+ #
3
+ # Copyright © 2006 Nikolai Weibull <now@bitwi.se>
4
+
5
+ require 'encoding/character/utf-8'
6
+
7
# Length of an empty string.
context "An empty string" do
  setup { @string = u("") }

  specify "should return 0 when sent #length" do
    size = @string.length
    size.should_equal(0)
  end
end
16
+
17
# Length of a string with multi-byte characters, expected in characters.
context "The string “hëllö”" do
  setup { @string = u("hëllö") }

  specify "should return 5 when sent #length" do
    size = @string.length
    size.should_equal(5)
  end
end
26
+
27
# An embedded NUL byte is expected to count as one character rather than end
# the string.
context "The string “hëllö\0agäin” with an embedded NUL-byte" do
  setup { @string = u("hëllö\0agäin") }

  specify "should return 11 when sent #length" do
    size = @string.length
    size.should_equal(11)
  end
end
36
+
37
# The trailing "\303" byte is an incomplete UTF-8 sequence; the expected
# length of 11 shows it is not meant to be counted as a character.
context "The string “hëllö\0agäin” with a partial character at the end" do
  setup { @string = u("hëllö\0agäin\303") }

  specify "should return 11 when sent #length" do
    size = @string.length
    size.should_equal(11)
  end
end