RubyGems - utf8 - Versions diffs - 0.1.0 - Mend

utf8 0.1.0

Files changed (23) hide show

data/.gitignore +4 -0
data/MIT-LICENSE +20 -0
data/README.rdoc +46 -0
data/Rakefile +11 -0
data/benchmark/active_support.rb +61 -0
data/benchmark/test.txt +693 -0
data/ext/utf8/ext.c +21 -0
data/ext/utf8/ext.h +17 -0
data/ext/utf8/extconf.rb +7 -0
data/ext/utf8/string_scanner_utf8.c +68 -0
data/ext/utf8/string_scanner_utf8.h +6 -0
data/ext/utf8/string_utf8.c +224 -0
data/ext/utf8/string_utf8.h +6 -0
data/ext/utf8/utf8.c +80 -0
data/ext/utf8/utf8.h +7 -0
data/lib/utf8.rb +5 -0
data/lib/utf8/string.rb +19 -0
data/lib/utf8/string_scanner.rb +21 -0
data/spec/spec_helper.rb +5 -0
data/spec/string_scanner_spec.rb +48 -0
data/spec/string_spec.rb +151 -0
data/utf8.gemspec +37 -0
metadata +120 -0

data/ext/utf8/ext.c ADDED Viewed

@@ -0,0 +1,21 @@
+#include "ext.h"
+#include "string_utf8.h"
+#include "string_scanner_utf8.h"
+/* ID of the "as_utf8" method; rb_intern returns an ID, so it is stored as one */
+ID intern_as_utf8;
+#ifdef HAVE_RUBY_ENCODING_H
+/* UTF-8 encoding handle, looked up once in Init_utf8 and used by AS_UTF8 */
+rb_encoding *utf8Encoding;
+#endif
+/*
+ * Extension entry point.
+ *
+ * The globals read by the AS_UTF8 macro are filled in first so they are
+ * already valid by the time the String::UTF8 and StringScanner::UTF8
+ * methods get registered.
+ */
+void Init_utf8(void) {
+  intern_as_utf8 = rb_intern("as_utf8");
+#ifdef HAVE_RUBY_ENCODING_H
+  utf8Encoding = rb_utf8_encoding();
+#endif
+  init_String_UTF8();
+  init_StringScanner_UTF8();
+}

data/ext/utf8/ext.h ADDED Viewed

@@ -0,0 +1,17 @@
+#ifndef UTF8_EXT_H
+#define UTF8_EXT_H
+#include <ruby.h>
+#ifdef HAVE_RUBY_ENCODING_H
+#include <ruby/encoding.h>
+extern rb_encoding *utf8Encoding;
+#define AS_UTF8(_str)                         \
+  _str = rb_funcall(_str, intern_as_utf8, 0); \
+  rb_enc_associate(_str, utf8Encoding);
+#else
+#define AS_UTF8(_str) _str = rb_funcall(_str, intern_as_utf8, 0)
+#endif
+#endif

data/ext/utf8/extconf.rb ADDED Viewed

@@ -0,0 +1,7 @@
+require 'mkmf'
+require 'rbconfig'
+$CFLAGS << ' -Wall -funroll-loops'
+$CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']
+create_makefile("utf8")

data/ext/utf8/string_scanner_utf8.c ADDED Viewed

@@ -0,0 +1,68 @@
+#include "ext.h"
+#include "ruby/regex.h"
+#include "utf8.h"
+extern ID intern_as_utf8;
+struct strscanner { /* NOTE(review): presumably mirrors the private struct of Ruby's strscan extension; field order and types must match the Ruby being built against - confirm */
+    /* multi-purpose flags */
+    unsigned long flags;
+    /* the string to scan */
+    VALUE str;
+    /* scan pointers, used as byte offsets into str (getch reads at str+curr and adds the character's byte length to curr) */
+    long prev; /* legal only when MATCHED_P(s) */
+    long curr; /* always legal */
+    /* the regexp register; legal only when MATCHED_P(s) */
+    struct re_registers regs;
+};
+#define GET_SCANNER(obj, var)                                                          \
+    Data_Get_Struct(obj, struct strscanner, var);                                      \
+    if (NIL_P(var->str)) rb_raise(rb_eArgError, "uninitialized StringScanner object");
+/*
+ * Document-class: StringScanner::UTF8
+ */
+/*
+ * call-seq: getch
+ *
+ * Works like StringScanner#getch but is UTF8-aware
+ *
+ * Returns the character found at the current scan position and moves the
+ * scan position past it, or nil when nothing is left to scan.
+ * Raises ArgumentError when the bytes at the scan position are not a
+ * complete, well-formed UTF8 sequence.
+ */
+static VALUE rb_cStringScanner_UTF8_getch(VALUE self) {
+  unsigned char *str;
+  long len;
+  struct strscanner *scanner;
+  VALUE utf8Str;
+  int8_t charLen = 0;
+  GET_SCANNER(self, scanner);
+  str = (unsigned char *)RSTRING_PTR(scanner->str);
+  len = RSTRING_LEN(scanner->str);
+  // nothing (left) to read; this also covers the empty string
+  if (scanner->curr < 0 || scanner->curr >= len) {
+    return Qnil;
+  }
+  // measure the character AT the scan position, looking only at the bytes
+  // that remain after it
+  charLen = utf8CharLen(str + scanner->curr, (size_t)(len - scanner->curr));
+  if (charLen <= 0) {
+    // 0 = sequence cut off by the end of the string, -1 = malformed
+    rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+  }
+  utf8Str = rb_str_new((char *)str + scanner->curr, charLen);
+  scanner->curr += charLen;
+  AS_UTF8(utf8Str);
+  return utf8Str;
+}
+/*
+ * Creates StringScanner::UTF8 as a subclass of StringScanner and installs
+ * the UTF8-aware getch on it. The "strscan" library is required first when
+ * the StringScanner constant is not defined yet.
+ */
+void init_StringScanner_UTF8() {
+  VALUE scannerClass, utf8ScannerClass;
+  ID scannerName = rb_intern("StringScanner");
+  if (!rb_const_defined(rb_cObject, scannerName)) {
+    rb_require("strscan");
+  }
+  scannerClass = rb_const_get(rb_cObject, scannerName);
+  utf8ScannerClass = rb_define_class_under(scannerClass, "UTF8", scannerClass);
+  rb_define_method(utf8ScannerClass, "getch", rb_cStringScanner_UTF8_getch, 0);
+}

data/ext/utf8/string_scanner_utf8.h ADDED Viewed

@@ -0,0 +1,6 @@
+#ifndef UTF8_STRING_SCANNER_H
+#define UTF8_STRING_SCANNER_H
+/* Defines StringScanner::UTF8 and its methods; called from Init_utf8 */
+void init_StringScanner_UTF8(void);
+#endif

data/ext/utf8/string_utf8.c ADDED Viewed

@@ -0,0 +1,224 @@
+#include "ext.h"
+#include "utf8.h"
+extern VALUE intern_as_utf8;
+/*
+ * Document-class: String::UTF8
+ */
+/*
+ * call-seq: length
+ *
+ * Returns how many UTF8 characters (as opposed to bytes) this string holds
+ */
+static VALUE rb_cString_UTF8_length(VALUE self) {
+  size_t byteLen = RSTRING_LEN(self);
+  unsigned char *bytes = (unsigned char *)RSTRING_PTR(self);
+  size_t charCount = utf8CharCount(bytes, byteLen);
+  return INT2FIX(charCount);
+}
+/*
+ * call-seq: each_char {|utf8_char| ...}
+ *
+ * Iterates over the string, yielding one UTF8 character at a time
+ *
+ * Returns self, or an Enumerator when no block is given.
+ * Raises ArgumentError when it reaches bytes that are not a complete,
+ * well-formed UTF8 sequence.
+ */
+static VALUE rb_cString_UTF8_each_char(VALUE self) {
+  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
+  size_t len = RSTRING_LEN(self), i = 0;
+  int8_t charLen = 0;
+  VALUE utf8Str;
+  // this will return an Enumerator wrapping this string, yielding this method
+  // when Enumerator#each is called
+  RETURN_ENUMERATOR(self, 0, 0);
+  while (i < len) {
+    // measure the character starting at byte i, not the first one in the
+    // string, and only look at the bytes that remain
+    charLen = utf8CharLen(str + i, len - i);
+    if (charLen <= 0) {
+      // 0 (truncated) would never advance and -1 (malformed) would step
+      // backwards, so neither can be used as a step
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
+    utf8Str = rb_str_new((char *)str + i, charLen);
+    AS_UTF8(utf8Str);
+    rb_yield(utf8Str);
+    i += (size_t)charLen;
+  }
+  return self;
+}
+/*
+ * Builds a new String from len bytes at ptr and passes it through AS_UTF8
+ */
+static VALUE utf8_slice_new_str(const char *ptr, long len) {
+  VALUE utf8Str = rb_str_new(ptr, len);
+  AS_UTF8(utf8Str);
+  return utf8Str;
+}
+/*
+ * Moves *pos forward over at most count characters without going past end.
+ * Returns the number of characters actually stepped over, which is smaller
+ * than count when the end of the buffer was reached first.
+ * Raises ArgumentError on a truncated or malformed sequence.
+ */
+static long utf8_slice_advance(unsigned char **pos, unsigned char *end, long count) {
+  long stepped = 0;
+  while (stepped < count && *pos < end) {
+    int8_t charLen = utf8CharLen(*pos, (size_t)(end - *pos));
+    if (charLen <= 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
+    *pos += charLen;
+    stepped++;
+  }
+  return stepped;
+}
+/*
+ * Converts a negative character index into the equivalent index counted
+ * from the front. Returns 0 when the index lies before the first character
+ * and 1 otherwise; non-negative indexes are left untouched.
+ */
+static int utf8_slice_normalize_pos(unsigned char *str, size_t len, long *wantPos) {
+  if (*wantPos < 0) {
+    long char_cnt = (long)utf8CharCount(str, len);
+    // compared this way round so that negating LONG_MIN is never needed
+    if (*wantPos < -char_cnt) {
+      return 0;
+    }
+    *wantPos += char_cnt;
+  }
+  return 1;
+}
+/*
+ * Returns up to wantLen characters starting at character index wantPos.
+ * Gives nil when wantPos lies beyond the end of the string and "" when it
+ * sits exactly at the end.
+ */
+static VALUE utf8_slice_substr(unsigned char *str, size_t len, long wantPos, long wantLen) {
+  unsigned char *end = str + len, *from = str, *to;
+  if (utf8_slice_advance(&from, end, wantPos) < wantPos) {
+    return Qnil;
+  }
+  to = from;
+  utf8_slice_advance(&to, end, wantLen);
+  return utf8_slice_new_str((char *)from, (long)(to - from));
+}
+/*
+ * Works like String#[] but taking into account UTF8 character boundaries
+ *
+ * Supported forms are [index], [offset, length] and [range].
+ *
+ * This method doesn't currently (and may never) support Regexp parameters
+ * It also doesn't support a String parameter (yet)
+ *
+ * An empty receiver always yields nil. Raises ArgumentError for a Regexp
+ * argument, a wrong argument count or invalid UTF8 data, and TypeError for
+ * any other index that is neither a Fixnum nor a Range.
+ */
+static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
+  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
+  size_t len = RSTRING_LEN(self);
+  long wantPos = 0, wantLen = 0;
+  VALUE ret;
+  if (len == 0) return Qnil;
+  if (argc == 2) {
+    if (TYPE(argv[0]) == T_REGEXP) {
+      rb_raise(rb_eArgError, "Regular Expressions aren't supported yet");
+    }
+    // [offset, length] syntax
+    wantPos = NUM2LONG(argv[0]);
+    wantLen = NUM2LONG(argv[1]);
+    if (wantLen < 0) {
+      return Qnil;
+    } else if (wantLen == 0) {
+      return utf8_slice_new_str("", 0);
+    }
+    if (!utf8_slice_normalize_pos(str, len, &wantPos)) {
+      return Qnil;
+    }
+    return utf8_slice_substr(str, len, wantPos, wantLen);
+  }
+  if (argc != 1) {
+    rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
+  }
+  // [Fixnum] syntax
+  if (TYPE(argv[0]) == T_FIXNUM) {
+    unsigned char *end = str + len, *cur = str;
+    int8_t charLen = 0;
+    wantPos = NUM2LONG(argv[0]);
+    if (!utf8_slice_normalize_pos(str, len, &wantPos)) {
+      return Qnil;
+    }
+    // an index at or past the end has no character to return
+    if (utf8_slice_advance(&cur, end, wantPos) < wantPos || cur >= end) {
+      return Qnil;
+    }
+    charLen = utf8CharLen(cur, (size_t)(end - cur));
+    if (charLen <= 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
+    return utf8_slice_new_str((char *)cur, charLen);
+  }
+  if (TYPE(argv[0]) == T_REGEXP) {
+    rb_raise(rb_eArgError, "Regular Expressions aren't supported yet");
+  }
+  // [Range] syntax
+  ret = rb_range_beg_len(argv[0], &wantPos, &wantLen, (long)utf8CharCount(str, len), 0);
+  if (ret == Qnil) {
+    return Qnil;
+  } else if (ret == Qfalse) {
+    // not a Range at all: wantPos/wantLen were never filled in
+    rb_raise(rb_eTypeError, "can't slice a String::UTF8 with a %s", rb_obj_classname(argv[0]));
+  }
+  if (wantLen == 0) {
+    return utf8_slice_new_str("", 0);
+  }
+  return utf8_slice_substr(str, len, wantPos, wantLen);
+}
+/*
+ * Creates String::UTF8 as a subclass of String and installs the
+ * UTF8-aware length, each_char and [] methods on it
+ */
+void init_String_UTF8() {
+  VALUE utf8StringClass;
+  utf8StringClass = rb_define_class_under(rb_cString, "UTF8", rb_cString);
+  rb_define_method(utf8StringClass, "length", rb_cString_UTF8_length, 0);
+  rb_define_method(utf8StringClass, "each_char", rb_cString_UTF8_each_char, 0);
+  rb_define_method(utf8StringClass, "[]", rb_cString_UTF8_slice, -1);
+}

data/ext/utf8/string_utf8.h ADDED Viewed

@@ -0,0 +1,6 @@
+#ifndef UTF8_STRING_H
+#define UTF8_STRING_H
+/* Defines String::UTF8 and its methods; called from Init_utf8 */
+void init_String_UTF8(void);
+#endif

data/ext/utf8/utf8.c ADDED Viewed

@@ -0,0 +1,80 @@
+#include <stdio.h>
+#include <stdint.h>
+/*
+ * Scans the current position of the buffer
+ * returning the length of this UTF8 character
+ *
+ * in     - first byte of the character to measure
+ * in_len - number of readable bytes starting at in
+ *
+ * Returns 1-4 for a well-formed sequence, 0 when the buffer ends before the
+ * sequence is complete, and -1 when in_len is 0, the lead byte is not a
+ * valid lead byte or a continuation byte is malformed.
+ *
+ * Deliberately not declared `inline`: a plain `inline` definition does not
+ * provide an external definition in C99/C11, and this function is called
+ * from other translation units.
+ */
+int8_t utf8CharLen(unsigned char *in, size_t in_len) {
+  int8_t seqLen;
+  size_t i;
+  if (in_len == 0) {
+    return -1;
+  }
+  if (in[0] <= 0x7f) {
+    /* single byte */
+    return 1;
+  } else if ((in[0] >> 5) == 0x06) {
+    /* two byte */
+    seqLen = 2;
+  } else if ((in[0] >> 4) == 0x0e) {
+    /* three byte */
+    seqLen = 3;
+  } else if ((in[0] >> 3) == 0x1e) {
+    /* four byte */
+    seqLen = 4;
+  } else {
+    /* error case */
+    return -1;
+  }
+  /* every byte after the lead byte must look like 10xxxxxx */
+  for (i = 1; i < (size_t)seqLen; i++) {
+    if (i >= in_len) {
+      return 0;
+    }
+    if ((in[i] >> 6) != 0x02) {
+      return -1;
+    }
+  }
+  return seqLen;
+}
+/*
+ * Scans the buffer from its current position
+ * returning the total number of UTF8 characters found
+ *
+ * in     - first byte of the buffer
+ * in_len - number of bytes in the buffer
+ *
+ * A byte that does not begin a complete, well-formed sequence (utf8CharLen
+ * reports 0 or -1 for it) is counted as one character and skipped on its
+ * own. Using those results as a step would either never advance or move
+ * backwards out of the buffer.
+ */
+size_t utf8CharCount(unsigned char *in, size_t in_len) {
+  size_t total = 0, leftOver = in_len;
+  unsigned char *cur = in;
+  while (leftOver > 0) {
+    int8_t len = utf8CharLen(cur, leftOver);
+    if (len <= 0) {
+      len = 1;
+    }
+    leftOver -= (size_t)len;
+    cur += len;
+    total++;
+  }
+  return total;
+}

data/ext/utf8/utf8.h ADDED Viewed

@@ -0,0 +1,7 @@
+#ifndef UTF8_UTF8_H
+#define UTF8_UTF8_H
+#include <stddef.h>
+#include <stdint.h>
+/*
+ * Length in bytes of the UTF8 character starting at in, looking at no more
+ * than in_len bytes. Not declared `inline`: the definition lives in utf8.c,
+ * and an inline declaration needs a definition in the same translation unit.
+ */
+int8_t utf8CharLen(unsigned char *in, size_t in_len);
+/* Number of UTF8 characters found in the in_len bytes starting at in */
+size_t utf8CharCount(unsigned char *in, size_t in_len);
+#endif