utf8 0.1.0 → 0.1.1
This diff shows the content changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
- data/ext/utf8/ext.c +1 -1
- data/ext/utf8/string_scanner_utf8.c +3 -0
- data/ext/utf8/string_utf8.c +44 -4
- data/ext/utf8/utf8.c +6 -3
- data/ext/utf8/utf8.h +1 -1
- data/lib/utf8/string.rb +1 -1
- data/spec/string_scanner_spec.rb +9 -0
- data/spec/string_spec.rb +17 -0
- data/utf8.gemspec +1 -1
- metadata +3 -3
data/ext/utf8/ext.c
CHANGED
@@ -45,6 +45,9 @@ static VALUE rb_cStringScanner_UTF8_getch(VALUE self) {
|
|
45
45
|
|
46
46
|
if (len > 0 && len > scanner->curr) {
|
47
47
|
lastCharLen = utf8CharLen(str, len);
|
48
|
+
if (lastCharLen < 0) {
|
49
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
50
|
+
}
|
48
51
|
utf8Str = rb_str_new((char *)str+scanner->curr, lastCharLen);
|
49
52
|
scanner->curr += lastCharLen;
|
50
53
|
AS_UTF8(utf8Str);
|
data/ext/utf8/string_utf8.c
CHANGED
@@ -15,9 +15,12 @@ extern VALUE intern_as_utf8;
|
|
15
15
|
static VALUE rb_cString_UTF8_length(VALUE self) {
|
16
16
|
unsigned char *str = (unsigned char *)RSTRING_PTR(self);
|
17
17
|
size_t len = RSTRING_LEN(self);
|
18
|
-
|
18
|
+
int64_t utf8_len = 0;
|
19
19
|
|
20
20
|
utf8_len = utf8CharCount(str, len);
|
21
|
+
if (utf8_len < 0) {
|
22
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
23
|
+
}
|
21
24
|
|
22
25
|
return INT2FIX(utf8_len);
|
23
26
|
}
|
@@ -39,6 +42,9 @@ static VALUE rb_cString_UTF8_each_char(VALUE self) {
|
|
39
42
|
|
40
43
|
for(; i<len; i+=lastCharLen) {
|
41
44
|
lastCharLen = utf8CharLen(str, len);
|
45
|
+
if (lastCharLen < 0) {
|
46
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
47
|
+
}
|
42
48
|
utf8Str = rb_str_new((char *)str+i, lastCharLen);
|
43
49
|
AS_UTF8(utf8Str);
|
44
50
|
rb_yield(utf8Str);
|
@@ -79,7 +85,10 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
79
85
|
}
|
80
86
|
|
81
87
|
if (wantPos < 0) {
|
82
|
-
|
88
|
+
int64_t char_cnt = utf8CharCount(str, len);
|
89
|
+
if (char_cnt < 0) {
|
90
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
91
|
+
}
|
83
92
|
if ((wantPos * -1) > char_cnt) {
|
84
93
|
return Qnil;
|
85
94
|
}
|
@@ -88,6 +97,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
88
97
|
|
89
98
|
// scan until starting position
|
90
99
|
curCharLen = utf8CharLen(str, len);
|
100
|
+
if (curCharLen < 0) {
|
101
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
102
|
+
}
|
91
103
|
while (curPos < wantPos) {
|
92
104
|
// if we're about to step out of bounds, return nil
|
93
105
|
if ((size_t)(str-start) >= len) {
|
@@ -96,6 +108,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
96
108
|
|
97
109
|
str += curCharLen;
|
98
110
|
curCharLen = utf8CharLen(str, len);
|
111
|
+
if (curCharLen < 0) {
|
112
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
113
|
+
}
|
99
114
|
curPos++;
|
100
115
|
}
|
101
116
|
|
@@ -104,6 +119,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
104
119
|
offset = str;
|
105
120
|
str += curCharLen;
|
106
121
|
curCharLen = utf8CharLen(str, len);
|
122
|
+
if (curCharLen < 0) {
|
123
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
124
|
+
}
|
107
125
|
while (curPos < wantLen) {
|
108
126
|
// if we're about to step out of bounds, stop
|
109
127
|
if ((size_t)(str-start) >= len) {
|
@@ -112,6 +130,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
112
130
|
|
113
131
|
str += curCharLen;
|
114
132
|
curCharLen = utf8CharLen(str, len);
|
133
|
+
if (curCharLen < 0) {
|
134
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
135
|
+
}
|
115
136
|
curPos++;
|
116
137
|
}
|
117
138
|
|
@@ -130,7 +151,7 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
130
151
|
int8_t curCharLen = 0;
|
131
152
|
|
132
153
|
if (wantPos < 0) {
|
133
|
-
|
154
|
+
int64_t char_cnt = utf8CharCount(str, len);
|
134
155
|
if ((wantPos * -1) > char_cnt) {
|
135
156
|
return Qnil;
|
136
157
|
}
|
@@ -138,6 +159,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
138
159
|
}
|
139
160
|
|
140
161
|
curCharLen = utf8CharLen(str, len);
|
162
|
+
if (curCharLen < 0) {
|
163
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
164
|
+
}
|
141
165
|
while (curPos < wantPos) {
|
142
166
|
// if we're about to step out of bounds, return nil
|
143
167
|
if ((size_t)(str-start) >= len) {
|
@@ -146,6 +170,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
146
170
|
|
147
171
|
str += curCharLen;
|
148
172
|
curCharLen = utf8CharLen(str, len);
|
173
|
+
if (curCharLen < 0) {
|
174
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
175
|
+
}
|
149
176
|
curPos++;
|
150
177
|
}
|
151
178
|
|
@@ -158,7 +185,8 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
158
185
|
}
|
159
186
|
|
160
187
|
// [Range] syntax
|
161
|
-
long wantPos, curPos = 0, wantLen
|
188
|
+
long wantPos, curPos = 0, wantLen;
|
189
|
+
int64_t char_cnt = 0;
|
162
190
|
int8_t curCharLen = 0;
|
163
191
|
unsigned char *offset = str;
|
164
192
|
VALUE ret;
|
@@ -180,6 +208,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
180
208
|
|
181
209
|
// scan until starting position
|
182
210
|
curCharLen = utf8CharLen(str, len);
|
211
|
+
if (curCharLen < 0) {
|
212
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
213
|
+
}
|
183
214
|
while (curPos < wantPos) {
|
184
215
|
// if we're about to step out of bounds, return ""
|
185
216
|
if ((size_t)(str-start) >= len) {
|
@@ -190,6 +221,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
190
221
|
|
191
222
|
str += curCharLen;
|
192
223
|
curCharLen = utf8CharLen(str, len);
|
224
|
+
if (curCharLen < 0) {
|
225
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
226
|
+
}
|
193
227
|
curPos++;
|
194
228
|
}
|
195
229
|
|
@@ -198,6 +232,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
198
232
|
offset = str;
|
199
233
|
str += curCharLen;
|
200
234
|
curCharLen = utf8CharLen(str, len);
|
235
|
+
if (curCharLen < 0) {
|
236
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
237
|
+
}
|
201
238
|
while (curPos < wantLen) {
|
202
239
|
// if we're about to step out of bounds, stop
|
203
240
|
if ((size_t)(str-start) >= len) {
|
@@ -206,6 +243,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
206
243
|
|
207
244
|
str += curCharLen;
|
208
245
|
curCharLen = utf8CharLen(str, len);
|
246
|
+
if (curCharLen < 0) {
|
247
|
+
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
248
|
+
}
|
209
249
|
curPos++;
|
210
250
|
}
|
211
251
|
|
data/ext/utf8/utf8.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#include <stdio.h>
|
2
2
|
#include <stdint.h>
|
3
3
|
|
4
|
-
#define CHECK_LEN if ((size_t)(in-start) >= in_len) return
|
4
|
+
#define CHECK_LEN if ((size_t)(in-start) >= in_len) return -1;
|
5
5
|
|
6
6
|
/*
|
7
7
|
* Scans the current position of the buffer
|
@@ -62,14 +62,17 @@ inline int8_t utf8CharLen(unsigned char *in, size_t in_len) {
|
|
62
62
|
* Scans the current position of the buffer
|
63
63
|
* returning the total number of UTF8 characters found
|
64
64
|
*/
|
65
|
-
|
66
|
-
|
65
|
+
int64_t utf8CharCount(unsigned char *in, size_t in_len) {
|
66
|
+
int64_t total = 0, leftOver = in_len;
|
67
67
|
int8_t len = 0;
|
68
68
|
unsigned char *start = in;
|
69
69
|
|
70
70
|
if (in_len > 0) {
|
71
71
|
while (leftOver) {
|
72
72
|
len = utf8CharLen(start, leftOver);
|
73
|
+
if (len < 0) {
|
74
|
+
return -1;
|
75
|
+
}
|
73
76
|
leftOver -= len;
|
74
77
|
start += len;
|
75
78
|
total++;
|
data/ext/utf8/utf8.h
CHANGED
data/lib/utf8/string.rb
CHANGED
data/spec/string_scanner_spec.rb
CHANGED
@@ -8,6 +8,15 @@ describe StringScanner::UTF8 do
|
|
8
8
|
@utf8_scanner = @scanner.as_utf8
|
9
9
|
end
|
10
10
|
|
11
|
+
it "should blow up on invalid utf8 chars" do
|
12
|
+
# lets cut right into the middle of a sequence so we know it's bad
|
13
|
+
scanner = StringScanner.new(@char_array.join[0..1]).as_utf8
|
14
|
+
|
15
|
+
lambda {
|
16
|
+
scanner.getch
|
17
|
+
}.should raise_error(ArgumentError)
|
18
|
+
end
|
19
|
+
|
11
20
|
it "should extend StringScanner, adding an as_utf8 method that returns a StringScanner::UTF8 instance" do
|
12
21
|
@scanner.should respond_to(:as_utf8)
|
13
22
|
@scanner.as_utf8.class.should eql(StringScanner::UTF8)
|
data/spec/string_spec.rb
CHANGED
@@ -9,6 +9,23 @@ describe String::UTF8 do
|
|
9
9
|
@utf8_len = @char_array.size
|
10
10
|
end
|
11
11
|
|
12
|
+
it "should blow up on invalid utf8 chars" do
|
13
|
+
# lets cut right into the middle of a sequence so we know it's bad
|
14
|
+
utf8 = @str[0..1].as_utf8
|
15
|
+
|
16
|
+
lambda {
|
17
|
+
utf8.length
|
18
|
+
}.should raise_error(ArgumentError)
|
19
|
+
|
20
|
+
lambda {
|
21
|
+
utf8[0, 10]
|
22
|
+
}.should raise_error(ArgumentError)
|
23
|
+
|
24
|
+
lambda {
|
25
|
+
utf8.chars.to_a
|
26
|
+
}.should raise_error(ArgumentError)
|
27
|
+
end
|
28
|
+
|
12
29
|
it "should extend String, adding an as_utf8 method that returns a String::UTF8 instance" do
|
13
30
|
"".should respond_to(:as_utf8)
|
14
31
|
"".as_utf8.class.should eql(String::UTF8)
|
data/utf8.gemspec
CHANGED
metadata
CHANGED