RubyGems - utf8 - Versions diffs - 0.1.0 → 0.1.1 - Mend

utf8 0.1.0 → 0.1.1

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the contents of each version as they appear in the respective public registry.

Files changed (10)

data/ext/utf8/ext.c +1 -1
data/ext/utf8/string_scanner_utf8.c +3 -0
data/ext/utf8/string_utf8.c +44 -4
data/ext/utf8/utf8.c +6 -3
data/ext/utf8/utf8.h +1 -1
data/lib/utf8/string.rb +1 -1
data/spec/string_scanner_spec.rb +9 -0
data/spec/string_spec.rb +17 -0
data/utf8.gemspec +1 -1
metadata +3 -3

data/ext/utf8/ext.c CHANGED Viewed

@@ -3,7 +3,7 @@
 #include "string_utf8.h"
 #include "string_scanner_utf8.h"
-VALUE intern_as_utf8;
+ID intern_as_utf8;
 #ifdef HAVE_RUBY_ENCODING_H
 rb_encoding *utf8Encoding;

data/ext/utf8/string_scanner_utf8.c CHANGED Viewed

@@ -45,6 +45,9 @@ static VALUE rb_cStringScanner_UTF8_getch(VALUE self) {
   if (len > 0 && len > scanner->curr) {
     lastCharLen = utf8CharLen(str, len);
+    if (lastCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     utf8Str = rb_str_new((char *)str+scanner->curr, lastCharLen);
     scanner->curr += lastCharLen;
     AS_UTF8(utf8Str);

data/ext/utf8/string_utf8.c CHANGED Viewed

@@ -15,9 +15,12 @@ extern VALUE intern_as_utf8;
 static VALUE rb_cString_UTF8_length(VALUE self) {
   unsigned char *str = (unsigned char *)RSTRING_PTR(self);
   size_t len = RSTRING_LEN(self);
-  size_t utf8_len = 0;
+  int64_t utf8_len = 0;
   utf8_len = utf8CharCount(str, len);
+  if (utf8_len < 0) {
+    rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+  }
   return INT2FIX(utf8_len);
 }
@@ -39,6 +42,9 @@ static VALUE rb_cString_UTF8_each_char(VALUE self) {
   for(; i<len; i+=lastCharLen) {
     lastCharLen = utf8CharLen(str, len);
+    if (lastCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     utf8Str = rb_str_new((char *)str+i, lastCharLen);
     AS_UTF8(utf8Str);
     rb_yield(utf8Str);
@@ -79,7 +85,10 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     }
     if (wantPos < 0) {
-      long char_cnt = utf8CharCount(str, len);
+      int64_t char_cnt = utf8CharCount(str, len);
+      if (char_cnt < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       if ((wantPos * -1) > char_cnt) {
         return Qnil;
       }
@@ -88,6 +97,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     // scan until starting position
     curCharLen = utf8CharLen(str, len);
+    if (curCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     while (curPos < wantPos) {
       // if we're about to step out of bounds, return nil
       if ((size_t)(str-start) >= len) {
@@ -96,6 +108,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
       str += curCharLen;
       curCharLen = utf8CharLen(str, len);
+      if (curCharLen < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       curPos++;
     }
@@ -104,6 +119,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     offset = str;
     str += curCharLen;
     curCharLen = utf8CharLen(str, len);
+    if (curCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     while (curPos < wantLen) {
       // if we're about to step out of bounds, stop
       if ((size_t)(str-start) >= len) {
@@ -112,6 +130,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
       str += curCharLen;
       curCharLen = utf8CharLen(str, len);
+      if (curCharLen < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       curPos++;
     }
@@ -130,7 +151,7 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     int8_t curCharLen = 0;
     if (wantPos < 0) {
-      long char_cnt = utf8CharCount(str, len);
+      int64_t char_cnt = utf8CharCount(str, len);
       if ((wantPos * -1) > char_cnt) {
         return Qnil;
       }
@@ -138,6 +159,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     }
     curCharLen = utf8CharLen(str, len);
+    if (curCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     while (curPos < wantPos) {
       // if we're about to step out of bounds, return nil
       if ((size_t)(str-start) >= len) {
@@ -146,6 +170,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
       str += curCharLen;
       curCharLen = utf8CharLen(str, len);
+      if (curCharLen < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       curPos++;
     }
@@ -158,7 +185,8 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     }
     // [Range] syntax
-    long wantPos, curPos = 0, wantLen, char_cnt = 0;
+    long wantPos, curPos = 0, wantLen;
+    int64_t char_cnt = 0;
     int8_t curCharLen = 0;
     unsigned char *offset = str;
     VALUE ret;
@@ -180,6 +208,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     // scan until starting position
     curCharLen = utf8CharLen(str, len);
+    if (curCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     while (curPos < wantPos) {
       // if we're about to step out of bounds, return ""
       if ((size_t)(str-start) >= len) {
@@ -190,6 +221,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
       str += curCharLen;
       curCharLen = utf8CharLen(str, len);
+      if (curCharLen < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       curPos++;
     }
@@ -198,6 +232,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
     offset = str;
     str += curCharLen;
     curCharLen = utf8CharLen(str, len);
+    if (curCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     while (curPos < wantLen) {
       // if we're about to step out of bounds, stop
       if ((size_t)(str-start) >= len) {
@@ -206,6 +243,9 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
       str += curCharLen;
       curCharLen = utf8CharLen(str, len);
+      if (curCharLen < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       curPos++;
     }

data/ext/utf8/utf8.c CHANGED Viewed

@@ -1,7 +1,7 @@
 #include <stdio.h>
 #include <stdint.h>
-#define CHECK_LEN if ((size_t)(in-start) >= in_len) return 0;
+#define CHECK_LEN if ((size_t)(in-start) >= in_len) return -1;
 /*
  * Scans the current position of the buffer
@@ -62,14 +62,17 @@ inline int8_t utf8CharLen(unsigned char *in, size_t in_len) {
  * Scans the current position of the buffer
  * returning the total number of UTF8 characters found
  */
-size_t utf8CharCount(unsigned char *in, size_t in_len) {
-  size_t total = 0, leftOver = in_len;
+int64_t utf8CharCount(unsigned char *in, size_t in_len) {
+  int64_t total = 0, leftOver = in_len;
   int8_t len = 0;
   unsigned char *start = in;
   if (in_len > 0) {
     while (leftOver) {
       len = utf8CharLen(start, leftOver);
+      if (len < 0) {
+        return -1;
+      }
       leftOver -= len;
       start += len;
       total++;

data/ext/utf8/utf8.h CHANGED Viewed

@@ -2,6 +2,6 @@
 #define UTF8_UTF8_H
 inline int8_t utf8CharLen(unsigned char *in, size_t in_len);
-size_t utf8CharCount(unsigned char *in, size_t in_len);
+int64_t utf8CharCount(unsigned char *in, size_t in_len);
 #endif

data/lib/utf8/string.rb CHANGED Viewed

@@ -5,7 +5,7 @@ class String
   end
   class UTF8
-    VERSION = "0.1.0"
+    VERSION = "0.1.1"
     # Gives you access to the raw non-UTF8-aware version of the string
     def as_raw

data/spec/string_scanner_spec.rb CHANGED Viewed

@@ -8,6 +8,15 @@ describe StringScanner::UTF8 do
     @utf8_scanner = @scanner.as_utf8
   end
+  it "should blow up on invalid utf8 chars" do
+    # lets cut right into the middle of a sequence so we know it's bad
+    scanner = StringScanner.new(@char_array.join[0..1]).as_utf8
+    lambda {
+      scanner.getch
+    }.should raise_error(ArgumentError)
+  end
   it "should extend StringScanner, adding an as_utf8 method that returns a StringScanner::UTF8 instance" do
     @scanner.should respond_to(:as_utf8)
     @scanner.as_utf8.class.should eql(StringScanner::UTF8)

data/spec/string_spec.rb CHANGED Viewed

@@ -9,6 +9,23 @@ describe String::UTF8 do
     @utf8_len = @char_array.size
   end
+  it "should blow up on invalid utf8 chars" do
+    # lets cut right into the middle of a sequence so we know it's bad
+    utf8 = @str[0..1].as_utf8
+    lambda {
+      utf8.length
+    }.should raise_error(ArgumentError)
+    lambda {
+      utf8[0, 10]
+    }.should raise_error(ArgumentError)
+    lambda {
+      utf8.chars.to_a
+    }.should raise_error(ArgumentError)
+  end
   it "should extend String, adding an as_utf8 method that returns a String::UTF8 instance" do
     "".should respond_to(:as_utf8)
     "".as_utf8.class.should eql(String::UTF8)

data/utf8.gemspec CHANGED Viewed

@@ -2,7 +2,7 @@
 Gem::Specification.new do |s|
   s.name = %q{utf8}
-  s.version = "0.1.0"
+  s.version = "0.1.1"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Brian Lopez"]

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: utf8
 version: !ruby/object:Gem::Version
-  hash: 27
+  hash: 25
   prerelease:
   segments:
   - 0
   - 1
-  - 0
-  version: 0.1.0
+  - 1
+  version: 0.1.1
 platform: ruby
 authors:
 - Brian Lopez