RubyGems - utf8 - Versions diffs - 0.1.0 → 0.1.1 - Mend

utf8 0.1.0 → 0.1.1

Files changed (10)

data/ext/utf8/ext.c +1 -1
data/ext/utf8/string_scanner_utf8.c +3 -0
data/ext/utf8/string_utf8.c +44 -4
data/ext/utf8/utf8.c +6 -3
data/ext/utf8/utf8.h +1 -1
data/lib/utf8/string.rb +1 -1
data/spec/string_scanner_spec.rb +9 -0
data/spec/string_spec.rb +17 -0
data/utf8.gemspec +1 -1
metadata +3 -3

data/ext/utf8/ext.c CHANGED Viewed

@@ -3,7 +3,7 @@
 #include "string_utf8.h"
 #include "string_scanner_utf8.h"
-VALUE intern_as_utf8;
+ID intern_as_utf8;
 #ifdef HAVE_RUBY_ENCODING_H
 rb_encoding *utf8Encoding;

data/ext/utf8/string_scanner_utf8.c CHANGED Viewed

@@ -45,6 +45,9 @@ static VALUE rb_cStringScanner_UTF8_getch(VALUE self) {
   if (len > 0 && len > scanner->curr) {
     lastCharLen = utf8CharLen(str, len);
+    if (lastCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     utf8Str = rb_str_new((char *)str+scanner->curr, lastCharLen);
     scanner->curr += lastCharLen;
     AS_UTF8(utf8Str);

data/ext/utf8/string_utf8.c CHANGED Viewed

@@ -15,9 +15,12 @@ extern VALUE intern_as_utf8;
 static VALUE rb_cString_UTF8_length(VALUE self) {
   unsigned char *str = (unsigned char *)RSTRING_PTR(self);
   size_t len = RSTRING_LEN(self);
-  size_t utf8_len = 0;
+  int64_t utf8_len = 0;
   utf8_len = utf8CharCount(str, len);
+  if (utf8_len < 0) {
+    rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+  }
   return INT2FIX(utf8_len);
 }
@@ -39,6 +42,9 @@ static VALUE rb_cString_UTF8_each_char(VALUE self) {
   for(; i<len; i+=lastCharLen) {
     lastCharLen = utf8CharLen(str, len);
+    if (lastCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     utf8Str = rb_str_new((char *)str+i, lastCharLen);
     AS_UTF8(utf8Str);
     rb_yield(utf8Str);
@@ -79,7 +85,10 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     }
     if (wantPos < 0) {
-      long char_cnt = utf8CharCount(str, len);
+      int64_t char_cnt = utf8CharCount(str, len);
+      if (char_cnt < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       if ((wantPos * -1) > char_cnt) {
         return Qnil;
       }
@@ -88,6 +97,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     // scan until starting position
     curCharLen = utf8CharLen(str, len);
+    if (curCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     while (curPos < wantPos) {
       // if we're about to step out of bounds, return nil
       if ((size_t)(str-start) >= len) {
@@ -96,6 +108,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
       str += curCharLen;
       curCharLen = utf8CharLen(str, len);
+      if (curCharLen < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       curPos++;
     }
@@ -104,6 +119,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     offset = str;
     str += curCharLen;
     curCharLen = utf8CharLen(str, len);
+    if (curCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     while (curPos < wantLen) {
       // if we're about to step out of bounds, stop
       if ((size_t)(str-start) >= len) {
@@ -112,6 +130,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
       str += curCharLen;
       curCharLen = utf8CharLen(str, len);
+      if (curCharLen < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       curPos++;
     }
@@ -130,7 +151,7 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     int8_t curCharLen = 0;
     if (wantPos < 0) {
-      long char_cnt = utf8CharCount(str, len);
+      int64_t char_cnt = utf8CharCount(str, len);
       if ((wantPos * -1) > char_cnt) {
         return Qnil;
       }
@@ -138,6 +159,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     }
     curCharLen = utf8CharLen(str, len);
+    if (curCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     while (curPos < wantPos) {
       // if we're about to step out of bounds, return nil
       if ((size_t)(str-start) >= len) {
@@ -146,6 +170,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
       str += curCharLen;
       curCharLen = utf8CharLen(str, len);
+      if (curCharLen < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       curPos++;
     }
@@ -158,7 +185,8 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     }
     // [Range] syntax
-    long wantPos, curPos = 0, wantLen, char_cnt = 0;
+    long wantPos, curPos = 0, wantLen;
+    int64_t char_cnt = 0;
     int8_t curCharLen = 0;
     unsigned char *offset = str;
     VALUE ret;
@@ -180,6 +208,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     // scan until starting position
     curCharLen = utf8CharLen(str, len);
+    if (curCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     while (curPos < wantPos) {
       // if we're about to step out of bounds, return ""
       if ((size_t)(str-start) >= len) {
@@ -190,6 +221,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
       str += curCharLen;
       curCharLen = utf8CharLen(str, len);
+      if (curCharLen < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       curPos++;
     }
@@ -198,6 +232,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     offset = str;
     str += curCharLen;
     curCharLen = utf8CharLen(str, len);
+    if (curCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     while (curPos < wantLen) {
       // if we're about to step out of bounds, stop
       if ((size_t)(str-start) >= len) {
@@ -206,6 +243,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
       str += curCharLen;
       curCharLen = utf8CharLen(str, len);
+      if (curCharLen < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       curPos++;
     }

data/ext/utf8/utf8.c CHANGED Viewed

@@ -1,7 +1,7 @@
 #include <stdio.h>
 #include <stdint.h>
-#define CHECK_LEN if ((size_t)(in-start) >= in_len) return 0;
+#define CHECK_LEN if ((size_t)(in-start) >= in_len) return -1;
 /*
  * Scans the current position of the buffer
@@ -62,14 +62,17 @@ inline int8_t utf8CharLen(unsigned char *in, size_t in_len) {
  * Scans the current position of the buffer
  * returning the total number of UTF8 characters found
  */
-size_t utf8CharCount(unsigned char *in, size_t in_len) {
-  size_t total = 0, leftOver = in_len;
+int64_t utf8CharCount(unsigned char *in, size_t in_len) {
+  int64_t total = 0, leftOver = in_len;
   int8_t len = 0;
   unsigned char *start = in;
   if (in_len > 0) {
     while (leftOver) {
       len = utf8CharLen(start, leftOver);
+      if (len < 0) {
+        return -1;
+      }
       leftOver -= len;
       start += len;
       total++;

data/ext/utf8/utf8.h CHANGED Viewed

@@ -2,6 +2,6 @@
 #define UTF8_UTF8_H
 inline int8_t utf8CharLen(unsigned char *in, size_t in_len);
-size_t utf8CharCount(unsigned char *in, size_t in_len);
+int64_t utf8CharCount(unsigned char *in, size_t in_len);
 #endif

data/lib/utf8/string.rb CHANGED Viewed

@@ -5,7 +5,7 @@ class String
   end
   class UTF8
-    VERSION = "0.1.0"
+    VERSION = "0.1.1"
     # Gives you access to the raw non-UTF8-aware version of the string
     def as_raw

data/spec/string_scanner_spec.rb CHANGED Viewed

@@ -8,6 +8,15 @@ describe StringScanner::UTF8 do
     @utf8_scanner = @scanner.as_utf8
   end
+  it "should blow up on invalid utf8 chars" do
+    # lets cut right into the middle of a sequence so we know it's bad
+    scanner = StringScanner.new(@char_array.join[0..1]).as_utf8
+    lambda {
+      scanner.getch
+    }.should raise_error(ArgumentError)
+  end
   it "should extend StringScanner, adding an as_utf8 method that returns a StringScanner::UTF8 instance" do
     @scanner.should respond_to(:as_utf8)
     @scanner.as_utf8.class.should eql(StringScanner::UTF8)

data/spec/string_spec.rb CHANGED Viewed

@@ -9,6 +9,23 @@ describe String::UTF8 do
     @utf8_len = @char_array.size
   end
+  it "should blow up on invalid utf8 chars" do
+    # lets cut right into the middle of a sequence so we know it's bad
+    utf8 = @str[0..1].as_utf8
+    lambda {
+      utf8.length
+    }.should raise_error(ArgumentError)
+    lambda {
+      utf8[0, 10]
+    }.should raise_error(ArgumentError)
+    lambda {
+      utf8.chars.to_a
+    }.should raise_error(ArgumentError)
+  end
   it "should extend String, adding an as_utf8 method that returns a String::UTF8 instance" do
     "".should respond_to(:as_utf8)
     "".as_utf8.class.should eql(String::UTF8)

data/utf8.gemspec CHANGED Viewed

@@ -2,7 +2,7 @@
 Gem::Specification.new do |s|
   s.name = %q{utf8}
-  s.version = "0.1.0"
+  s.version = "0.1.1"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Brian Lopez"]

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: utf8
 version: !ruby/object:Gem::Version
-  hash: 27
+  hash: 25
   prerelease:
   segments:
   - 0
   - 1
-  - 0
-  version: 0.1.0
+  - 1
+  version: 0.1.1
 platform: ruby
 authors:
 - Brian Lopez