utf8 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/ext/utf8/ext.c CHANGED
@@ -3,7 +3,7 @@
3
3
  #include "string_utf8.h"
4
4
  #include "string_scanner_utf8.h"
5
5
 
6
- VALUE intern_as_utf8;
6
+ ID intern_as_utf8;
7
7
 
8
8
  #ifdef HAVE_RUBY_ENCODING_H
9
9
  rb_encoding *utf8Encoding;
@@ -45,6 +45,9 @@ static VALUE rb_cStringScanner_UTF8_getch(VALUE self) {
45
45
 
46
46
  if (len > 0 && len > scanner->curr) {
47
47
  lastCharLen = utf8CharLen(str, len);
48
+ if (lastCharLen < 0) {
49
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
50
+ }
48
51
  utf8Str = rb_str_new((char *)str+scanner->curr, lastCharLen);
49
52
  scanner->curr += lastCharLen;
50
53
  AS_UTF8(utf8Str);
@@ -15,9 +15,12 @@ extern VALUE intern_as_utf8;
15
15
  static VALUE rb_cString_UTF8_length(VALUE self) {
16
16
  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
17
17
  size_t len = RSTRING_LEN(self);
18
- size_t utf8_len = 0;
18
+ int64_t utf8_len = 0;
19
19
 
20
20
  utf8_len = utf8CharCount(str, len);
21
+ if (utf8_len < 0) {
22
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
23
+ }
21
24
 
22
25
  return INT2FIX(utf8_len);
23
26
  }
@@ -39,6 +42,9 @@ static VALUE rb_cString_UTF8_each_char(VALUE self) {
39
42
 
40
43
  for(; i<len; i+=lastCharLen) {
41
44
  lastCharLen = utf8CharLen(str, len);
45
+ if (lastCharLen < 0) {
46
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
47
+ }
42
48
  utf8Str = rb_str_new((char *)str+i, lastCharLen);
43
49
  AS_UTF8(utf8Str);
44
50
  rb_yield(utf8Str);
@@ -79,7 +85,10 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
79
85
  }
80
86
 
81
87
  if (wantPos < 0) {
82
- long char_cnt = utf8CharCount(str, len);
88
+ int64_t char_cnt = utf8CharCount(str, len);
89
+ if (char_cnt < 0) {
90
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
91
+ }
83
92
  if ((wantPos * -1) > char_cnt) {
84
93
  return Qnil;
85
94
  }
@@ -88,6 +97,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
88
97
 
89
98
  // scan until starting position
90
99
  curCharLen = utf8CharLen(str, len);
100
+ if (curCharLen < 0) {
101
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
102
+ }
91
103
  while (curPos < wantPos) {
92
104
  // if we're about to step out of bounds, return nil
93
105
  if ((size_t)(str-start) >= len) {
@@ -96,6 +108,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
96
108
 
97
109
  str += curCharLen;
98
110
  curCharLen = utf8CharLen(str, len);
111
+ if (curCharLen < 0) {
112
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
113
+ }
99
114
  curPos++;
100
115
  }
101
116
 
@@ -104,6 +119,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
104
119
  offset = str;
105
120
  str += curCharLen;
106
121
  curCharLen = utf8CharLen(str, len);
122
+ if (curCharLen < 0) {
123
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
124
+ }
107
125
  while (curPos < wantLen) {
108
126
  // if we're about to step out of bounds, stop
109
127
  if ((size_t)(str-start) >= len) {
@@ -112,6 +130,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
112
130
 
113
131
  str += curCharLen;
114
132
  curCharLen = utf8CharLen(str, len);
133
+ if (curCharLen < 0) {
134
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
135
+ }
115
136
  curPos++;
116
137
  }
117
138
 
@@ -130,7 +151,7 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
130
151
  int8_t curCharLen = 0;
131
152
 
132
153
  if (wantPos < 0) {
133
- long char_cnt = utf8CharCount(str, len);
154
+ int64_t char_cnt = utf8CharCount(str, len);
134
155
  if ((wantPos * -1) > char_cnt) {
135
156
  return Qnil;
136
157
  }
@@ -138,6 +159,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
138
159
  }
139
160
 
140
161
  curCharLen = utf8CharLen(str, len);
162
+ if (curCharLen < 0) {
163
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
164
+ }
141
165
  while (curPos < wantPos) {
142
166
  // if we're about to step out of bounds, return nil
143
167
  if ((size_t)(str-start) >= len) {
@@ -146,6 +170,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
146
170
 
147
171
  str += curCharLen;
148
172
  curCharLen = utf8CharLen(str, len);
173
+ if (curCharLen < 0) {
174
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
175
+ }
149
176
  curPos++;
150
177
  }
151
178
 
@@ -158,7 +185,8 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
158
185
  }
159
186
 
160
187
  // [Range] syntax
161
- long wantPos, curPos = 0, wantLen, char_cnt = 0;
188
+ long wantPos, curPos = 0, wantLen;
189
+ int64_t char_cnt = 0;
162
190
  int8_t curCharLen = 0;
163
191
  unsigned char *offset = str;
164
192
  VALUE ret;
@@ -180,6 +208,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
180
208
 
181
209
  // scan until starting position
182
210
  curCharLen = utf8CharLen(str, len);
211
+ if (curCharLen < 0) {
212
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
213
+ }
183
214
  while (curPos < wantPos) {
184
215
  // if we're about to step out of bounds, return ""
185
216
  if ((size_t)(str-start) >= len) {
@@ -190,6 +221,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
190
221
 
191
222
  str += curCharLen;
192
223
  curCharLen = utf8CharLen(str, len);
224
+ if (curCharLen < 0) {
225
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
226
+ }
193
227
  curPos++;
194
228
  }
195
229
 
@@ -198,6 +232,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
198
232
  offset = str;
199
233
  str += curCharLen;
200
234
  curCharLen = utf8CharLen(str, len);
235
+ if (curCharLen < 0) {
236
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
237
+ }
201
238
  while (curPos < wantLen) {
202
239
  // if we're about to step out of bounds, stop
203
240
  if ((size_t)(str-start) >= len) {
@@ -206,6 +243,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
206
243
 
207
244
  str += curCharLen;
208
245
  curCharLen = utf8CharLen(str, len);
246
+ if (curCharLen < 0) {
247
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
248
+ }
209
249
  curPos++;
210
250
  }
211
251
 
data/ext/utf8/utf8.c CHANGED
@@ -1,7 +1,7 @@
1
1
  #include <stdio.h>
2
2
  #include <stdint.h>
3
3
 
4
- #define CHECK_LEN if ((size_t)(in-start) >= in_len) return 0;
4
+ #define CHECK_LEN if ((size_t)(in-start) >= in_len) return -1;
5
5
 
6
6
  /*
7
7
  * Scans the current position of the buffer
@@ -62,14 +62,17 @@ inline int8_t utf8CharLen(unsigned char *in, size_t in_len) {
62
62
  * Scans the current position of the buffer
63
63
  * returning the total number of UTF8 characters found
64
64
  */
65
- size_t utf8CharCount(unsigned char *in, size_t in_len) {
66
- size_t total = 0, leftOver = in_len;
65
+ int64_t utf8CharCount(unsigned char *in, size_t in_len) {
66
+ int64_t total = 0, leftOver = in_len;
67
67
  int8_t len = 0;
68
68
  unsigned char *start = in;
69
69
 
70
70
  if (in_len > 0) {
71
71
  while (leftOver) {
72
72
  len = utf8CharLen(start, leftOver);
73
+ if (len < 0) {
74
+ return -1;
75
+ }
73
76
  leftOver -= len;
74
77
  start += len;
75
78
  total++;
data/ext/utf8/utf8.h CHANGED
@@ -2,6 +2,6 @@
2
2
  #define UTF8_UTF8_H
3
3
 
4
4
  inline int8_t utf8CharLen(unsigned char *in, size_t in_len);
5
- size_t utf8CharCount(unsigned char *in, size_t in_len);
5
+ int64_t utf8CharCount(unsigned char *in, size_t in_len);
6
6
 
7
7
  #endif
data/lib/utf8/string.rb CHANGED
@@ -5,7 +5,7 @@ class String
5
5
  end
6
6
 
7
7
  class UTF8
8
- VERSION = "0.1.0"
8
+ VERSION = "0.1.1"
9
9
 
10
10
  # Gives you access to the raw non-UTF8-aware version of the string
11
11
  def as_raw
@@ -8,6 +8,15 @@ describe StringScanner::UTF8 do
8
8
  @utf8_scanner = @scanner.as_utf8
9
9
  end
10
10
 
11
+ it "should blow up on invalid utf8 chars" do
12
+ # let's cut right into the middle of a sequence so we know it's bad
13
+ scanner = StringScanner.new(@char_array.join[0..1]).as_utf8
14
+
15
+ lambda {
16
+ scanner.getch
17
+ }.should raise_error(ArgumentError)
18
+ end
19
+
11
20
  it "should extend StringScanner, adding an as_utf8 method that returns a StringScanner::UTF8 instance" do
12
21
  @scanner.should respond_to(:as_utf8)
13
22
  @scanner.as_utf8.class.should eql(StringScanner::UTF8)
data/spec/string_spec.rb CHANGED
@@ -9,6 +9,23 @@ describe String::UTF8 do
9
9
  @utf8_len = @char_array.size
10
10
  end
11
11
 
12
+ it "should blow up on invalid utf8 chars" do
13
+ # let's cut right into the middle of a sequence so we know it's bad
14
+ utf8 = @str[0..1].as_utf8
15
+
16
+ lambda {
17
+ utf8.length
18
+ }.should raise_error(ArgumentError)
19
+
20
+ lambda {
21
+ utf8[0, 10]
22
+ }.should raise_error(ArgumentError)
23
+
24
+ lambda {
25
+ utf8.chars.to_a
26
+ }.should raise_error(ArgumentError)
27
+ end
28
+
12
29
  it "should extend String, adding an as_utf8 method that returns a String::UTF8 instance" do
13
30
  "".should respond_to(:as_utf8)
14
31
  "".as_utf8.class.should eql(String::UTF8)
data/utf8.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{utf8}
5
- s.version = "0.1.0"
5
+ s.version = "0.1.1"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Brian Lopez"]
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: utf8
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 25
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 0
10
- version: 0.1.0
9
+ - 1
10
+ version: 0.1.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Brian Lopez