character-encodings 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +26 -0
- data/Rakefile +157 -0
- data/ext/encoding/character/unicode/codepoint.c +48 -0
- data/ext/encoding/character/utf-8/break.c +38 -0
- data/ext/encoding/character/utf-8/data/break.h +22931 -0
- data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
- data/ext/encoding/character/utf-8/data/compose.h +1607 -0
- data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
- data/ext/encoding/character/utf-8/decompose.c +476 -0
- data/ext/encoding/character/utf-8/depend +64 -0
- data/ext/encoding/character/utf-8/extconf.rb +47 -0
- data/ext/encoding/character/utf-8/private.h +68 -0
- data/ext/encoding/character/utf-8/properties.c +1061 -0
- data/ext/encoding/character/utf-8/rb_includes.h +18 -0
- data/ext/encoding/character/utf-8/rb_methods.h +49 -0
- data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
- data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
- data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
- data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
- data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
- data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
- data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
- data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
- data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
- data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
- data/ext/encoding/character/utf-8/unicode.c +319 -0
- data/ext/encoding/character/utf-8/unicode.h +208 -0
- data/ext/encoding/character/utf-8/utf.c +1332 -0
- data/lib/encoding/character/utf-8.rb +201 -0
- data/specifications/aref.rb +45 -0
- data/specifications/count.rb +29 -0
- data/specifications/delete.rb +25 -0
- data/specifications/each_char.rb +28 -0
- data/specifications/index.rb +35 -0
- data/specifications/insert.rb +67 -0
- data/specifications/length.rb +45 -0
- data/specifications/rindex.rb +52 -0
- data/specifications/squeeze.rb +25 -0
- data/specifications/to_i.rb +54 -0
- data/specifications/tr.rb +39 -0
- data/tests/foldcase.rb +28 -0
- data/tests/normalize.rb +101 -0
- data/tests/unicodedatatestbase.rb +45 -0
- metadata +112 -0
@@ -0,0 +1,201 @@
|
|
1
|
+
# contents: UTF-8 String methods.
|
2
|
+
#
|
3
|
+
# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
4
|
+
|
5
|
+
require 'encoding/character/utf-8/utf8'
|
6
|
+
|
7
|
+
# TODO: Rework this to use a dispatch object instead, so that the encoding can
|
8
|
+
# be changed on the fly.
|
9
|
+
# TODO: Add String#encoding.
|
10
|
+
# Mixin that replaces a String's character-level methods with thin wrappers
# around the module functions of Encoding::Character::UTF8.  A string gains
# these methods by being extended with this module.
#
# Unless noted otherwise each method simply forwards the receiver (plus any
# arguments) to the Encoding::Character::UTF8 function of the same name and
# returns whatever that function returns.
module Encoding::Character::UTF8::Methods
  # Defines <tt>method!</tt> as "compute +method+, then replace the
  # receiver's contents with the result".  The generated method takes no
  # arguments and returns the value of String#replace.
  def self.def_thunk_replacing_variant(method)
    define_method(:"#{method}!") do
      replace(send(method))
    end
  end

  # Compares via Encoding::Character::UTF8.collate.
  def <=>(other)
    Encoding::Character::UTF8.collate(self, other)
  end

  # Element reference; forwards to Encoding::Character::UTF8.aref.
  def [](*args)
    Encoding::Character::UTF8.aref(self, *args)
  end

  # Same as #[]; forwards to Encoding::Character::UTF8.aref.
  def slice(*args)
    Encoding::Character::UTF8.aref(self, *args)
  end

  # Element assignment; forwards to Encoding::Character::UTF8.aset.
  def []=(*args)
    Encoding::Character::UTF8.aset(self, *args)
  end

  # Forwards to Encoding::Character::UTF8.casecmp.
  def casecmp(other)
    Encoding::Character::UTF8.casecmp(self, other)
  end

  # Forwards to Encoding::Character::UTF8.center.
  def center(*args)
    Encoding::Character::UTF8.center(self, *args)
  end

  # Forwards to Encoding::Character::UTF8.chomp.
  def chomp(*args)
    Encoding::Character::UTF8.chomp(self, *args)
  end

  # Forwards to Encoding::Character::UTF8.chomp!.
  def chomp!(*args)
    Encoding::Character::UTF8.chomp!(self, *args)
  end

  # Forwards to Encoding::Character::UTF8.chop.
  def chop
    Encoding::Character::UTF8.chop(self)
  end

  # Forwards to Encoding::Character::UTF8.chop!.
  def chop!
    Encoding::Character::UTF8.chop!(self)
  end

  # Forwards to Encoding::Character::UTF8.count.
  def count(*args)
    Encoding::Character::UTF8.count(self, *args)
  end

  # Forwards to Encoding::Character::UTF8.delete.
  def delete(*args)
    Encoding::Character::UTF8.delete(self, *args)
  end

  # Forwards to Encoding::Character::UTF8.delete!.
  def delete!(*args)
    Encoding::Character::UTF8.delete!(self, *args)
  end

  # Forwards to Encoding::Character::UTF8.downcase.
  def downcase
    Encoding::Character::UTF8.downcase(self)
  end
  def_thunk_replacing_variant :downcase

  # Forwards the receiver and the given block to
  # Encoding::Character::UTF8.each_char.
  def each_char(&block)
    Encoding::Character::UTF8.each_char(self, &block)
  end

  # Forwards to Encoding::Character::UTF8.index.
  def index(*args)
    Encoding::Character::UTF8.index(self, *args)
  end

  # Forwards to Encoding::Character::UTF8.insert.
  def insert(index, other)
    Encoding::Character::UTF8.insert(self, index, other)
  end

  # Forwards to Encoding::Character::UTF8.length.
  def length
    Encoding::Character::UTF8.length(self)
  end

  # Forwards to Encoding::Character::UTF8.lstrip.
  def lstrip
    Encoding::Character::UTF8.lstrip(self)
  end

  # Forwards to Encoding::Character::UTF8.lstrip!.
  def lstrip!
    Encoding::Character::UTF8.lstrip!(self)
  end

  # Forwards to Encoding::Character::UTF8.normalize.
  def normalize(*args)
    Encoding::Character::UTF8.normalize(self, *args)
  end

  # Forwards to Encoding::Character::UTF8.rindex.
  def rindex(*args)
    Encoding::Character::UTF8.rindex(self, *args)
  end

  # Forwards to Encoding::Character::UTF8.rstrip.
  def rstrip
    Encoding::Character::UTF8.rstrip(self)
  end

  # Forwards to Encoding::Character::UTF8.rstrip!.
  def rstrip!
    Encoding::Character::UTF8.rstrip!(self)
  end

  # Forwards to Encoding::Character::UTF8.reverse.
  def reverse
    Encoding::Character::UTF8.reverse(self)
  end
  def_thunk_replacing_variant :reverse

  # Forwards to Encoding::Character::UTF8.squeeze.
  def squeeze
    Encoding::Character::UTF8.squeeze(self)
  end

  # Forwards to Encoding::Character::UTF8.squeeze!.
  def squeeze!
    Encoding::Character::UTF8.squeeze!(self)
  end

  # Forwards to Encoding::Character::UTF8.strip.
  def strip
    Encoding::Character::UTF8.strip(self)
  end

  # Forwards to Encoding::Character::UTF8.strip!.
  def strip!
    Encoding::Character::UTF8.strip!(self)
  end

  # Forwards to Encoding::Character::UTF8.to_i.
  def to_i(*args)
    Encoding::Character::UTF8.to_i(self, *args)
  end

  # Forwards to Encoding::Character::UTF8.tr.
  def tr(from, to)
    Encoding::Character::UTF8.tr(self, from, to)
  end

  # Replaces the receiver's contents with the result of #tr.
  def tr!(from, to)
    replace(tr(from, to))
  end

  # Forwards to Encoding::Character::UTF8.tr_s.
  def tr_s(from, to)
    Encoding::Character::UTF8.tr_s(self, from, to)
  end

  # Replaces the receiver's contents with the result of #tr_s.
  def tr_s!(from, to)
    replace(tr_s(from, to))
  end

  # String's own #inspect output, prefixed with the letter "u".
  def inspect
    "u#{_inspect}"
  end

  # Forwards to Encoding::Character::UTF8.ljust.
  def ljust(*args)
    Encoding::Character::UTF8.ljust(self, *args)
  end

  # Forwards to Encoding::Character::UTF8.rjust.
  def rjust(*args)
    Encoding::Character::UTF8.rjust(self, *args)
  end

  # Forwards to Encoding::Character::UTF8.upcase.
  def upcase
    Encoding::Character::UTF8.upcase(self)
  end
  def_thunk_replacing_variant :upcase

  # Upcases the first character and downcases the remainder.
  #
  # When there is no first character (<tt>self[0]</tt> is nil, presumably
  # only for an empty receiver -- TODO confirm against the aref
  # implementation) there is nothing to upcase, so the downcased receiver is
  # returned instead of calling +upcase+ on nil and raising NoMethodError.
  def capitalize
    first = self[0]
    return downcase if first.nil?

    first.upcase + self[1..-1].downcase
  end
  def_thunk_replacing_variant :capitalize

  # Forwards to Encoding::Character::UTF8.foldcase.
  def foldcase
    Encoding::Character::UTF8.foldcase(self)
  end
  def_thunk_replacing_variant :foldcase

private

  # String's original #inspect, captured as an UnboundMethod so that #inspect
  # above can still reach it after being overridden by this module.
  Inspect = String.instance_method(:inspect)

  # Invokes String's original #inspect on the receiver.
  def _inspect
    Inspect.bind(self).call
  end
end
|
190
|
+
|
191
|
+
class String
  # Unary plus: extends the receiver (in place) with
  # Encoding::Character::UTF8::Methods and returns it, so that +str can be
  # used to obtain a string with the UTF-8 method set.
  #
  # NOTE(review): Rubies that ship their own String#+@ will have it replaced
  # by this definition, and extending a frozen receiver raises -- confirm
  # that this is acceptable for the supported Ruby versions.
  def +@
    extend Encoding::Character::UTF8::Methods
  end
end
|
196
|
+
|
197
|
+
module Kernel
  # Extends +str+ (in place) with Encoding::Character::UTF8::Methods and
  # returns it.  The specifications call it without parentheses, as in
  # u"hëllö".
  def u(str)
    str.extend Encoding::Character::UTF8::Methods
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# contents: Specification of String#[].
|
2
|
+
#
|
3
|
+
# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
4
|
+
|
5
|
+
require 'encoding/character/utf-8'
|
6
|
+
|
7
|
+
# Two-argument #[] on an empty UTF-8 string.
context "An empty string" do
  setup { @string = u"" }

  specify "should return nil when sent #\\[\\], given an index of 0 and a negative length" do
    [-10, -2, -1].each { |len| @string[0, len].should_be nil }
  end

  specify "should contain the empty string at index 0, given any non-negative length" do
    [0, 1, 2, 10].each { |len| @string[0, len].should_equal "" }
  end

  specify "should return nil when sent #\\[\\], given any non-zero index and any length" do
    # Every combination of a non-zero index with any length.
    [-10, -2, -1, 1, 2, 10].each do |idx|
      [-10, -2, -1, 0, 1, 2, 10].each { |len| @string[idx, len].should_be nil }
    end
  end
end
|
32
|
+
|
33
|
+
# Two-argument #[] on a string that mixes ASCII and non-ASCII characters.
context "The string “hëllö”" do
  setup { @string = u"hëllö" }

  specify "should contain the string “lö” at index 3" do
    @string[3, 2].should_equal "lö"
  end

  specify "should contain the string “hë” at index 0" do
    @string[0, 2].should_equal "hë"
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# contents: Tests for String#count method.
|
2
|
+
#
|
3
|
+
# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
4
|
+
|
5
|
+
require 'encoding/character/utf-8'
|
6
|
+
|
7
|
+
# #count on an empty UTF-8 string.
context "An empty string" do
  setup { @string = u"" }

  specify "should return a count of zero" do
    @string.count("whatever").should_be 0
  end
end
|
16
|
+
|
17
|
+
# #count with one and with two character-set arguments.
context "A string containing one ‘l’" do
  setup { @string = u"helo" }

  specify "should return a count of one “l”’s given an “l”" do
    @string.count("l").should_be 1
  end

  specify "should return a count of one “l”’s given any input" do
    # "l" is the only character the two sets have in common.
    @string.count("helo", "wrld").should_be 1
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# contents: Specification for String#delete.
|
2
|
+
#
|
3
|
+
# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
4
|
+
|
5
|
+
require 'encoding/character/utf-8'
|
6
|
+
|
7
|
+
# #delete on an empty UTF-8 string.
context "An empty string" do
  setup { @string = u"" }

  specify "should return an empty string after deleting anything" do
    @string.delete("whatever").should_be_empty
  end
end
|
16
|
+
|
17
|
+
# #delete of a non-ASCII character.
context "The string “hëllö”" do
  setup { @string = u"hëllö" }

  specify "should return “hëll” after deleting all ‘ö’’s" do
    @string.delete("ö").should_equal "hëll"
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# contents: Specification for String#each_char.
|
2
|
+
#
|
3
|
+
# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
4
|
+
|
5
|
+
require 'encoding/character/utf-8'
|
6
|
+
|
7
|
+
# #each_char on an empty UTF-8 string.
context "An empty string" do
  setup { @string = u"" }

  specify "shouldn’t yield any characters" do
    # Count how many times the block is invoked.
    yielded = 0
    @string.each_char { |_char| yielded += 1 }
    yielded.should_be 0
  end
end
|
18
|
+
|
19
|
+
# #each_char on a string that mixes ASCII and non-ASCII characters.
context "The string “hëllö”" do
  setup do
    @string = u"hëllö"
  end

  specify "should yield five characters" do
    expected = ['h', 'ë', 'l', 'l', 'ö']
    # Each yielded character must match the next expected one, in order.
    @string.each_char{ |c| c.should_equal expected.shift }
    # Comparing yielded characters alone would still pass if fewer than five
    # were yielded; make sure every expected character was consumed.
    expected.should_be_empty
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# contents: Specification of String#index.
|
2
|
+
#
|
3
|
+
# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
4
|
+
|
5
|
+
require 'encoding/character/utf-8'
|
6
|
+
|
7
|
+
# #index on an empty UTF-8 string.
context "An empty string" do
  setup { @string = u"" }

  specify "should contain the empty string at index 0" do
    @string.index("").should_equal 0
  end

  specify "shouldn’t contain any string at an index > 0" do
    # Start offsets other than 0, one positive and one negative.
    [1, -1].each { |start| @string.index("", start).should_be nil }
  end
end
|
21
|
+
|
22
|
+
# #index on a string that mixes ASCII and non-ASCII characters; the expected
# results are the character offsets 3 and 0.
context "The string “hëllö”" do
  setup { @string = u"hëllö" }

  specify "should contain the string “lö” at index 3" do
    # Without and with an explicit start offset.
    @string.index("lö").should_equal 3
    @string.index("lö", 3).should_equal 3
  end

  specify "should contain the string “hë” at index 0" do
    @string.index("hë").should_equal 0
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# contents: Specification of String#insert.
|
2
|
+
#
|
3
|
+
# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
4
|
+
|
5
|
+
require 'encoding/character/utf-8'
|
6
|
+
|
7
|
+
# #insert on an empty UTF-8 string.
context "An empty string" do
  setup { @string = u"" }

  specify "should be empty after insertion of an empty string at index 0" do
    @string.insert(0, "")
    @string.should_be_empty
  end

  specify "should raise an IndexError if inserting anything beyond index 0" do
    proc { @string.insert(1, "") }.should_raise IndexError
  end

  specify "should be non-empty after insertion of any non-empty string" do
    @string.insert(0, "a")
    @string.should_not_be_empty
  end

  specify "should be equal to the string inserted" do
    inserted = "äbc"
    @string.insert(0, inserted)
    @string.should_equal inserted
  end
end
|
32
|
+
|
33
|
+
# #insert at positive and negative character indices of a three-character
# string.  Each positive index is paired with the negative index that is
# expected to give the same result.
context "The string “hëö”" do
  setup do
    @string = u"hëö"
  end

  specify "should equal the string “hëllö” after inserting “ll” at index 2" do
    @string.insert(2, "ll")
    @string.should_equal "hëllö"
  end

  # The description previously named “hëöll”, contradicting the assertion.
  specify "should equal the string “hëllö” after inserting “ll” at index -2" do
    @string.insert(-2, "ll")
    @string.should_equal "hëllö"
  end

  specify "should equal the string “hëöll” after inserting “ll” at index 3" do
    @string.insert(3, "ll")
    @string.should_equal "hëöll"
  end

  specify "should equal the string “hëöll” after inserting “ll” at index -1" do
    @string.insert(-1, "ll")
    @string.should_equal "hëöll"
  end

  specify "should equal the string “llhëö” after inserting “ll” at index 0" do
    @string.insert(0, "ll")
    @string.should_equal "llhëö"
  end

  # This example previously inserted at index 0, duplicating the example
  # above and leaving index -4 untested.
  specify "should equal the string “llhëö” after inserting “ll” at index -4" do
    @string.insert(-4, "ll")
    @string.should_equal "llhëö"
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# contents: String#length specification.
|
2
|
+
#
|
3
|
+
# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
4
|
+
|
5
|
+
require 'encoding/character/utf-8'
|
6
|
+
|
7
|
+
# #length of an empty UTF-8 string.
context "An empty string" do
  setup { @string = u"" }

  specify "should return 0 when sent #length" do
    @string.length.should_equal 0
  end
end
|
16
|
+
|
17
|
+
# #length of a string that mixes ASCII and non-ASCII characters.
context "The string “hëllö”" do
  setup { @string = u"hëllö" }

  specify "should return 5 when sent #length" do
    @string.length.should_equal 5
  end
end
|
26
|
+
|
27
|
+
# #length of a string with a NUL byte in the middle; the expected length
# includes the characters after the NUL.
context "The string “hëllö\0agäin” with an embedded NUL-byte" do
  setup { @string = u"hëllö\0agäin" }

  specify "should return 11 when sent #length" do
    @string.length.should_equal 11
  end
end
|
36
|
+
|
37
|
+
# #length of a string ending in the single byte \303, which on its own is
# an incomplete character; the expected length is the same 11 as without it.
context "The string “hëllö\0agäin” with a partial character at the end" do
  setup { @string = u"hëllö\0agäin\303" }

  specify "should return 11 when sent #length" do
    @string.length.should_equal 11
  end
end
|