utf8 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/utf8/ext.c +1 -1
- data/ext/utf8/string_scanner_utf8.c +3 -0
- data/ext/utf8/string_utf8.c +44 -4
- data/ext/utf8/utf8.c +6 -3
- data/ext/utf8/utf8.h +1 -1
- data/lib/utf8/string.rb +1 -1
- data/spec/string_scanner_spec.rb +9 -0
- data/spec/string_spec.rb +17 -0
- data/utf8.gemspec +1 -1
- metadata +3 -3
data/ext/utf8/ext.c
CHANGED
@@ -45,6 +45,9 @@ static VALUE rb_cStringScanner_UTF8_getch(VALUE self) {
|
|
45
45
|
|
46
46
|
if (len > 0 && len > scanner->curr) {
|
47
47
|
lastCharLen = utf8CharLen(str, len);
|
48
|
+
if (lastCharLen < 0) {
|
49
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
50
|
+
}
|
48
51
|
utf8Str = rb_str_new((char *)str+scanner->curr, lastCharLen);
|
49
52
|
scanner->curr += lastCharLen;
|
50
53
|
AS_UTF8(utf8Str);
|
data/ext/utf8/string_utf8.c
CHANGED
@@ -15,9 +15,12 @@ extern VALUE intern_as_utf8;
|
|
15
15
|
static VALUE rb_cString_UTF8_length(VALUE self) {
|
16
16
|
unsigned char *str = (unsigned char *)RSTRING_PTR(self);
|
17
17
|
size_t len = RSTRING_LEN(self);
|
18
|
-
|
18
|
+
int64_t utf8_len = 0;
|
19
19
|
|
20
20
|
utf8_len = utf8CharCount(str, len);
|
21
|
+
if (utf8_len < 0) {
|
22
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
23
|
+
}
|
21
24
|
|
22
25
|
return INT2FIX(utf8_len);
|
23
26
|
}
|
@@ -39,6 +42,9 @@ static VALUE rb_cString_UTF8_each_char(VALUE self) {
|
|
39
42
|
|
40
43
|
for(; i<len; i+=lastCharLen) {
|
41
44
|
lastCharLen = utf8CharLen(str, len);
|
45
|
+
if (lastCharLen < 0) {
|
46
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
47
|
+
}
|
42
48
|
utf8Str = rb_str_new((char *)str+i, lastCharLen);
|
43
49
|
AS_UTF8(utf8Str);
|
44
50
|
rb_yield(utf8Str);
|
@@ -79,7 +85,10 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
79
85
|
}
|
80
86
|
|
81
87
|
if (wantPos < 0) {
|
82
|
-
|
88
|
+
int64_t char_cnt = utf8CharCount(str, len);
|
89
|
+
if (char_cnt < 0) {
|
90
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
91
|
+
}
|
83
92
|
if ((wantPos * -1) > char_cnt) {
|
84
93
|
return Qnil;
|
85
94
|
}
|
@@ -88,6 +97,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
88
97
|
|
89
98
|
// scan until starting position
|
90
99
|
curCharLen = utf8CharLen(str, len);
|
100
|
+
if (curCharLen < 0) {
|
101
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
102
|
+
}
|
91
103
|
while (curPos < wantPos) {
|
92
104
|
// if we're about to step out of bounds, return nil
|
93
105
|
if ((size_t)(str-start) >= len) {
|
@@ -96,6 +108,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
96
108
|
|
97
109
|
str += curCharLen;
|
98
110
|
curCharLen = utf8CharLen(str, len);
|
111
|
+
if (curCharLen < 0) {
|
112
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
113
|
+
}
|
99
114
|
curPos++;
|
100
115
|
}
|
101
116
|
|
@@ -104,6 +119,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
104
119
|
offset = str;
|
105
120
|
str += curCharLen;
|
106
121
|
curCharLen = utf8CharLen(str, len);
|
122
|
+
if (curCharLen < 0) {
|
123
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
124
|
+
}
|
107
125
|
while (curPos < wantLen) {
|
108
126
|
// if we're about to step out of bounds, stop
|
109
127
|
if ((size_t)(str-start) >= len) {
|
@@ -112,6 +130,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
112
130
|
|
113
131
|
str += curCharLen;
|
114
132
|
curCharLen = utf8CharLen(str, len);
|
133
|
+
if (curCharLen < 0) {
|
134
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
135
|
+
}
|
115
136
|
curPos++;
|
116
137
|
}
|
117
138
|
|
@@ -130,7 +151,7 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
130
151
|
int8_t curCharLen = 0;
|
131
152
|
|
132
153
|
if (wantPos < 0) {
|
133
|
-
|
154
|
+
int64_t char_cnt = utf8CharCount(str, len);
|
134
155
|
if ((wantPos * -1) > char_cnt) {
|
135
156
|
return Qnil;
|
136
157
|
}
|
@@ -138,6 +159,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
138
159
|
}
|
139
160
|
|
140
161
|
curCharLen = utf8CharLen(str, len);
|
162
|
+
if (curCharLen < 0) {
|
163
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
164
|
+
}
|
141
165
|
while (curPos < wantPos) {
|
142
166
|
// if we're about to step out of bounds, return nil
|
143
167
|
if ((size_t)(str-start) >= len) {
|
@@ -146,6 +170,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
146
170
|
|
147
171
|
str += curCharLen;
|
148
172
|
curCharLen = utf8CharLen(str, len);
|
173
|
+
if (curCharLen < 0) {
|
174
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
175
|
+
}
|
149
176
|
curPos++;
|
150
177
|
}
|
151
178
|
|
@@ -158,7 +185,8 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
158
185
|
}
|
159
186
|
|
160
187
|
// [Range] syntax
|
161
|
-
long wantPos, curPos = 0, wantLen
|
188
|
+
long wantPos, curPos = 0, wantLen;
|
189
|
+
int64_t char_cnt = 0;
|
162
190
|
int8_t curCharLen = 0;
|
163
191
|
unsigned char *offset = str;
|
164
192
|
VALUE ret;
|
@@ -180,6 +208,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
180
208
|
|
181
209
|
// scan until starting position
|
182
210
|
curCharLen = utf8CharLen(str, len);
|
211
|
+
if (curCharLen < 0) {
|
212
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
213
|
+
}
|
183
214
|
while (curPos < wantPos) {
|
184
215
|
// if we're about to step out of bounds, return ""
|
185
216
|
if ((size_t)(str-start) >= len) {
|
@@ -190,6 +221,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
190
221
|
|
191
222
|
str += curCharLen;
|
192
223
|
curCharLen = utf8CharLen(str, len);
|
224
|
+
if (curCharLen < 0) {
|
225
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
226
|
+
}
|
193
227
|
curPos++;
|
194
228
|
}
|
195
229
|
|
@@ -198,6 +232,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
198
232
|
offset = str;
|
199
233
|
str += curCharLen;
|
200
234
|
curCharLen = utf8CharLen(str, len);
|
235
|
+
if (curCharLen < 0) {
|
236
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
237
|
+
}
|
201
238
|
while (curPos < wantLen) {
|
202
239
|
// if we're about to step out of bounds, stop
|
203
240
|
if ((size_t)(str-start) >= len) {
|
@@ -206,6 +243,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
206
243
|
|
207
244
|
str += curCharLen;
|
208
245
|
curCharLen = utf8CharLen(str, len);
|
246
|
+
if (curCharLen < 0) {
|
247
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
248
|
+
}
|
209
249
|
curPos++;
|
210
250
|
}
|
211
251
|
|
data/ext/utf8/utf8.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#include <stdio.h>
|
2
2
|
#include <stdint.h>
|
3
3
|
|
4
|
-
#define CHECK_LEN if ((size_t)(in-start) >= in_len) return
|
4
|
+
#define CHECK_LEN if ((size_t)(in-start) >= in_len) return -1;
|
5
5
|
|
6
6
|
/*
|
7
7
|
* Scans the current position of the buffer
|
@@ -62,14 +62,17 @@ inline int8_t utf8CharLen(unsigned char *in, size_t in_len) {
|
|
62
62
|
* Scans the current position of the buffer
|
63
63
|
* returning the total number of UTF8 characters found
|
64
64
|
*/
|
65
|
-
|
66
|
-
|
65
|
+
int64_t utf8CharCount(unsigned char *in, size_t in_len) {
|
66
|
+
int64_t total = 0, leftOver = in_len;
|
67
67
|
int8_t len = 0;
|
68
68
|
unsigned char *start = in;
|
69
69
|
|
70
70
|
if (in_len > 0) {
|
71
71
|
while (leftOver) {
|
72
72
|
len = utf8CharLen(start, leftOver);
|
73
|
+
if (len < 0) {
|
74
|
+
return -1;
|
75
|
+
}
|
73
76
|
leftOver -= len;
|
74
77
|
start += len;
|
75
78
|
total++;
|
data/ext/utf8/utf8.h
CHANGED
data/lib/utf8/string.rb
CHANGED
data/spec/string_scanner_spec.rb
CHANGED
@@ -8,6 +8,15 @@ describe StringScanner::UTF8 do
|
|
8
8
|
@utf8_scanner = @scanner.as_utf8
|
9
9
|
end
|
10
10
|
|
11
|
+
it "should blow up on invalid utf8 chars" do
|
12
|
+
# lets cut right into the middle of a sequence so we know it's bad
|
13
|
+
scanner = StringScanner.new(@char_array.join[0..1]).as_utf8
|
14
|
+
|
15
|
+
lambda {
|
16
|
+
scanner.getch
|
17
|
+
}.should raise_error(ArgumentError)
|
18
|
+
end
|
19
|
+
|
11
20
|
it "should extend StringScanner, adding an as_utf8 method that returns a StringScanner::UTF8 instance" do
|
12
21
|
@scanner.should respond_to(:as_utf8)
|
13
22
|
@scanner.as_utf8.class.should eql(StringScanner::UTF8)
|
data/spec/string_spec.rb
CHANGED
@@ -9,6 +9,23 @@ describe String::UTF8 do
|
|
9
9
|
@utf8_len = @char_array.size
|
10
10
|
end
|
11
11
|
|
12
|
+
it "should blow up on invalid utf8 chars" do
|
13
|
+
# lets cut right into the middle of a sequence so we know it's bad
|
14
|
+
utf8 = @str[0..1].as_utf8
|
15
|
+
|
16
|
+
lambda {
|
17
|
+
utf8.length
|
18
|
+
}.should raise_error(ArgumentError)
|
19
|
+
|
20
|
+
lambda {
|
21
|
+
utf8[0, 10]
|
22
|
+
}.should raise_error(ArgumentError)
|
23
|
+
|
24
|
+
lambda {
|
25
|
+
utf8.chars.to_a
|
26
|
+
}.should raise_error(ArgumentError)
|
27
|
+
end
|
28
|
+
|
12
29
|
it "should extend String, adding an as_utf8 method that returns a String::UTF8 instance" do
|
13
30
|
"".should respond_to(:as_utf8)
|
14
31
|
"".as_utf8.class.should eql(String::UTF8)
|
data/utf8.gemspec
CHANGED
metadata
CHANGED