utf8 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/utf8/ext.c CHANGED
@@ -3,7 +3,7 @@
3
3
  #include "string_utf8.h"
4
4
  #include "string_scanner_utf8.h"
5
5
 
6
- VALUE intern_as_utf8;
6
+ ID intern_as_utf8;
7
7
 
8
8
  #ifdef HAVE_RUBY_ENCODING_H
9
9
  rb_encoding *utf8Encoding;
@@ -45,6 +45,9 @@ static VALUE rb_cStringScanner_UTF8_getch(VALUE self) {
45
45
 
46
46
  if (len > 0 && len > scanner->curr) {
47
47
  lastCharLen = utf8CharLen(str, len);
48
+ if (lastCharLen < 0) {
49
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
50
+ }
48
51
  utf8Str = rb_str_new((char *)str+scanner->curr, lastCharLen);
49
52
  scanner->curr += lastCharLen;
50
53
  AS_UTF8(utf8Str);
@@ -15,9 +15,12 @@ extern VALUE intern_as_utf8;
15
15
  static VALUE rb_cString_UTF8_length(VALUE self) {
16
16
  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
17
17
  size_t len = RSTRING_LEN(self);
18
- size_t utf8_len = 0;
18
+ int64_t utf8_len = 0;
19
19
 
20
20
  utf8_len = utf8CharCount(str, len);
21
+ if (utf8_len < 0) {
22
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
23
+ }
21
24
 
22
25
  return INT2FIX(utf8_len);
23
26
  }
@@ -39,6 +42,9 @@ static VALUE rb_cString_UTF8_each_char(VALUE self) {
39
42
 
40
43
  for(; i<len; i+=lastCharLen) {
41
44
  lastCharLen = utf8CharLen(str, len);
45
+ if (lastCharLen < 0) {
46
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
47
+ }
42
48
  utf8Str = rb_str_new((char *)str+i, lastCharLen);
43
49
  AS_UTF8(utf8Str);
44
50
  rb_yield(utf8Str);
@@ -79,7 +85,10 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
79
85
  }
80
86
 
81
87
  if (wantPos < 0) {
82
- long char_cnt = utf8CharCount(str, len);
88
+ int64_t char_cnt = utf8CharCount(str, len);
89
+ if (char_cnt < 0) {
90
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
91
+ }
83
92
  if ((wantPos * -1) > char_cnt) {
84
93
  return Qnil;
85
94
  }
@@ -88,6 +97,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
88
97
 
89
98
  // scan until starting position
90
99
  curCharLen = utf8CharLen(str, len);
100
+ if (curCharLen < 0) {
101
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
102
+ }
91
103
  while (curPos < wantPos) {
92
104
  // if we're about to step out of bounds, return nil
93
105
  if ((size_t)(str-start) >= len) {
@@ -96,6 +108,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
96
108
 
97
109
  str += curCharLen;
98
110
  curCharLen = utf8CharLen(str, len);
111
+ if (curCharLen < 0) {
112
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
113
+ }
99
114
  curPos++;
100
115
  }
101
116
 
@@ -104,6 +119,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
104
119
  offset = str;
105
120
  str += curCharLen;
106
121
  curCharLen = utf8CharLen(str, len);
122
+ if (curCharLen < 0) {
123
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
124
+ }
107
125
  while (curPos < wantLen) {
108
126
  // if we're about to step out of bounds, stop
109
127
  if ((size_t)(str-start) >= len) {
@@ -112,6 +130,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
112
130
 
113
131
  str += curCharLen;
114
132
  curCharLen = utf8CharLen(str, len);
133
+ if (curCharLen < 0) {
134
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
135
+ }
115
136
  curPos++;
116
137
  }
117
138
 
@@ -130,7 +151,7 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
130
151
  int8_t curCharLen = 0;
131
152
 
132
153
  if (wantPos < 0) {
133
- long char_cnt = utf8CharCount(str, len);
154
+ int64_t char_cnt = utf8CharCount(str, len);
134
155
  if ((wantPos * -1) > char_cnt) {
135
156
  return Qnil;
136
157
  }
@@ -138,6 +159,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
138
159
  }
139
160
 
140
161
  curCharLen = utf8CharLen(str, len);
162
+ if (curCharLen < 0) {
163
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
164
+ }
141
165
  while (curPos < wantPos) {
142
166
  // if we're about to step out of bounds, return nil
143
167
  if ((size_t)(str-start) >= len) {
@@ -146,6 +170,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
146
170
 
147
171
  str += curCharLen;
148
172
  curCharLen = utf8CharLen(str, len);
173
+ if (curCharLen < 0) {
174
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
175
+ }
149
176
  curPos++;
150
177
  }
151
178
 
@@ -158,7 +185,8 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
158
185
  }
159
186
 
160
187
  // [Range] syntax
161
- long wantPos, curPos = 0, wantLen, char_cnt = 0;
188
+ long wantPos, curPos = 0, wantLen;
189
+ int64_t char_cnt = 0;
162
190
  int8_t curCharLen = 0;
163
191
  unsigned char *offset = str;
164
192
  VALUE ret;
@@ -180,6 +208,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
180
208
 
181
209
  // scan until starting position
182
210
  curCharLen = utf8CharLen(str, len);
211
+ if (curCharLen < 0) {
212
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
213
+ }
183
214
  while (curPos < wantPos) {
184
215
  // if we're about to step out of bounds, return ""
185
216
  if ((size_t)(str-start) >= len) {
@@ -190,6 +221,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
190
221
 
191
222
  str += curCharLen;
192
223
  curCharLen = utf8CharLen(str, len);
224
+ if (curCharLen < 0) {
225
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
226
+ }
193
227
  curPos++;
194
228
  }
195
229
 
@@ -198,6 +232,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
198
232
  offset = str;
199
233
  str += curCharLen;
200
234
  curCharLen = utf8CharLen(str, len);
235
+ if (curCharLen < 0) {
236
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
237
+ }
201
238
  while (curPos < wantLen) {
202
239
  // if we're about to step out of bounds, stop
203
240
  if ((size_t)(str-start) >= len) {
@@ -206,6 +243,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
206
243
 
207
244
  str += curCharLen;
208
245
  curCharLen = utf8CharLen(str, len);
246
+ if (curCharLen < 0) {
247
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
248
+ }
209
249
  curPos++;
210
250
  }
211
251
 
data/ext/utf8/utf8.c CHANGED
@@ -1,7 +1,7 @@
1
1
  #include <stdio.h>
2
2
  #include <stdint.h>
3
3
 
4
- #define CHECK_LEN if ((size_t)(in-start) >= in_len) return 0;
4
+ #define CHECK_LEN if ((size_t)(in-start) >= in_len) return -1;
5
5
 
6
6
  /*
7
7
  * Scans the current position of the buffer
@@ -62,14 +62,17 @@ inline int8_t utf8CharLen(unsigned char *in, size_t in_len) {
62
62
  * Scans the current position of the buffer
63
63
  * returning the total number of UTF8 characters found
64
64
  */
65
- size_t utf8CharCount(unsigned char *in, size_t in_len) {
66
- size_t total = 0, leftOver = in_len;
65
+ int64_t utf8CharCount(unsigned char *in, size_t in_len) {
66
+ int64_t total = 0, leftOver = in_len;
67
67
  int8_t len = 0;
68
68
  unsigned char *start = in;
69
69
 
70
70
  if (in_len > 0) {
71
71
  while (leftOver) {
72
72
  len = utf8CharLen(start, leftOver);
73
+ if (len < 0) {
74
+ return -1;
75
+ }
73
76
  leftOver -= len;
74
77
  start += len;
75
78
  total++;
data/ext/utf8/utf8.h CHANGED
@@ -2,6 +2,6 @@
2
2
  #define UTF8_UTF8_H
3
3
 
4
4
  inline int8_t utf8CharLen(unsigned char *in, size_t in_len);
5
- size_t utf8CharCount(unsigned char *in, size_t in_len);
5
+ int64_t utf8CharCount(unsigned char *in, size_t in_len);
6
6
 
7
7
  #endif
data/lib/utf8/string.rb CHANGED
@@ -5,7 +5,7 @@ class String
5
5
  end
6
6
 
7
7
  class UTF8
8
- VERSION = "0.1.0"
8
+ VERSION = "0.1.1"
9
9
 
10
10
  # Gives you access to the raw non-UTF8-aware version of the string
11
11
  def as_raw
@@ -8,6 +8,15 @@ describe StringScanner::UTF8 do
8
8
  @utf8_scanner = @scanner.as_utf8
9
9
  end
10
10
 
11
+ it "should blow up on invalid utf8 chars" do
12
+ # let's cut right into the middle of a sequence so we know it's bad
13
+ scanner = StringScanner.new(@char_array.join[0..1]).as_utf8
14
+
15
+ lambda {
16
+ scanner.getch
17
+ }.should raise_error(ArgumentError)
18
+ end
19
+
11
20
  it "should extend StringScanner, adding an as_utf8 method that returns a StringScanner::UTF8 instance" do
12
21
  @scanner.should respond_to(:as_utf8)
13
22
  @scanner.as_utf8.class.should eql(StringScanner::UTF8)
data/spec/string_spec.rb CHANGED
@@ -9,6 +9,23 @@ describe String::UTF8 do
9
9
  @utf8_len = @char_array.size
10
10
  end
11
11
 
12
+ it "should blow up on invalid utf8 chars" do
13
+ # let's cut right into the middle of a sequence so we know it's bad
14
+ utf8 = @str[0..1].as_utf8
15
+
16
+ lambda {
17
+ utf8.length
18
+ }.should raise_error(ArgumentError)
19
+
20
+ lambda {
21
+ utf8[0, 10]
22
+ }.should raise_error(ArgumentError)
23
+
24
+ lambda {
25
+ utf8.chars.to_a
26
+ }.should raise_error(ArgumentError)
27
+ end
28
+
12
29
  it "should extend String, adding an as_utf8 method that returns a String::UTF8 instance" do
13
30
  "".should respond_to(:as_utf8)
14
31
  "".as_utf8.class.should eql(String::UTF8)
data/utf8.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{utf8}
5
- s.version = "0.1.0"
5
+ s.version = "0.1.1"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Brian Lopez"]
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: utf8
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 25
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 0
10
- version: 0.1.0
9
+ - 1
10
+ version: 0.1.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Brian Lopez