RubyGems - utf8 - Versions diffs - 0.1.2 → 0.1.3 - Mend

utf8 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

data/.rspec +3 -0
data/Gemfile +3 -0
data/Rakefile +22 -1
data/benchmark/active_support.rb +14 -24
data/ext/utf8/ext.h +5 -0
data/ext/utf8/string_utf8.c +76 -4
data/ext/utf8/utf8.c +24 -2
data/ext/utf8/utf8.h +1 -0
data/lib/utf8.rb +1 -0
data/lib/utf8/string.rb +2 -3
data/lib/utf8/string_scanner.rb +1 -1
data/lib/utf8/version.rb +5 -0
data/spec/string_spec.rb +40 -2
data/utf8.gemspec +11 -21
metadata +24 -7

data/.rspec ADDED Viewed

@@ -0,0 +1,3 @@
+--color
+--format=documentation
+--fail-fast

data/Gemfile ADDED Viewed

@@ -0,0 +1,3 @@
+source :rubygems
+gemspec

data/Rakefile CHANGED Viewed

@@ -1,6 +1,25 @@
+# rspec
+begin
+  require 'rspec'
+  require 'rspec/core/rake_task'
+  desc "Run all examples with RCov"
+  RSpec::Core::RakeTask.new('spec:rcov') do |t|
+    t.rcov = true
+  end
+  RSpec::Core::RakeTask.new('spec') do |t|
+    t.verbose = true
+  end
+  task :default => :spec
+rescue LoadError
+  puts "rspec, or one of its dependencies, is not available. Install it with: sudo gem install rspec"
+end
+# rake-compiler
 require 'rake' unless defined? Rake
-gem 'rake-compiler', '~> 0.7.1'
+gem 'rake-compiler', '>= 0.7.5'
 require "rake/extensiontask"
 Rake::ExtensionTask.new('utf8') do |ext|
@@ -9,3 +28,5 @@ Rake::ExtensionTask.new('utf8') do |ext|
   ext.lib_dir = File.join 'lib', 'utf8'
 end
+Rake::Task[:spec].prerequisites << :compile

data/benchmark/active_support.rb CHANGED Viewed

@@ -13,49 +13,39 @@ as_mb = ActiveSupport::Multibyte::Chars.new(raw)
 times = 1000
 puts "String::UTF8"
-Benchmark.bmbm do |x|
-  x.report {
-    puts "#length"
+Benchmark.bmbm { |x|
+  x.report("#length") {
     times.times {utf8.length}
   }
-  x.report {
-    puts "#[index]"
+  x.report("#[index]") {
     times.times {utf8[1024]}
   }
-  x.report {
-    puts "#[-index]"
+  x.report("#[-index]") {
     times.times {utf8[-1024]}
   }
-  x.report {
-    puts "#[start, len]"
+  x.report("#[start, len]") {
     times.times {utf8[1024, 1024]}
   }
-  x.report {
-    puts "#[-start, len]"
+  x.report("#[-start, len]") {
     times.times {utf8[-1024, 1024]}
   }
-end
+}
 puts "\n\nActiveSupport::Multibyte::Chars"
-Benchmark.bmbm do |x|
-  x.report {
-    puts "#length"
+Benchmark.bmbm { |x|
+  x.report("#length") {
     times.times {as_mb.length}
   }
-  x.report {
-    puts "#[index]"
+  x.report("#[index]") {
     times.times {as_mb[1024]}
   }
-  x.report {
-    puts "#[-index]"
+  x.report("#[-index]") {
     times.times {as_mb[-1024]}
   }
-  x.report {
-    puts "#[start, len]"
+  x.report("#[start, len]") {
     times.times {as_mb[1024, 1024]}
   }
-  x.report {
-    puts "#[-start, len]"
+  x.report("#[-start, len]") {
     times.times {as_mb[-1024, 1024]}
   }
-end
+}

data/ext/utf8/ext.h CHANGED Viewed

@@ -1,6 +1,11 @@
 #ifndef UTF8_EXT_H
 #define UTF8_EXT_H
+// tell rbx not to use its caching compat layer
+// by doing this we're making a promise to RBX that
+// we'll never modify the pointers we get back from RSTRING_PTR
+#define RSTRING_NOT_MODIFIED
 #include <ruby.h>
 #ifdef HAVE_RUBY_ENCODING_H

data/ext/utf8/string_utf8.c CHANGED Viewed

@@ -4,13 +4,13 @@
 extern VALUE intern_as_utf8;
 /*
- * Document-class: String::UTF8
+ * Document-class: String::UTF8
  */
 /*
  * call-seq: length
  *
- * Returns the number of UTF8 characters in this string
+ * Returns: a Fixnum - the number of UTF-8 characters in this string
  */
 static VALUE rb_cString_UTF8_length(VALUE self) {
   unsigned char *str = (unsigned char *)RSTRING_PTR(self);
@@ -28,7 +28,9 @@ static VALUE rb_cString_UTF8_length(VALUE self) {
 /*
  * call-seq: each_char {|utf8_char| ...}
  *
- * Iterates over the string, yielding one UTF8 character at a time
+ * Iterates over the string, yielding one UTF-8 character at a time
+ *
+ * Returns: self
  */
 static VALUE rb_cString_UTF8_each_char(int argc, VALUE *argv, VALUE self) {
   unsigned char *str = (unsigned char *)RSTRING_PTR(self);
@@ -56,7 +58,75 @@ static VALUE rb_cString_UTF8_each_char(int argc, VALUE *argv, VALUE self) {
 }
 /*
- * Works like String#[] but taking into account UTF8 character boundaries
+ * call-seq: each_codepoint {|utf8_codepoint| ...}
+ *
+ * Iterates over the string, yielding one UTF-8 codepoint at a time
+ *
+ * Returns: self
+ */
+static VALUE rb_cString_UTF8_each_codepoint(int argc, VALUE *argv, VALUE self) {
+  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
+  size_t len = RSTRING_LEN(self), i=0;
+  int8_t lastCharLen=0;
+  int32_t cp;
+  // this will return an Enumerator wrapping this string, yielding this method
+  // when Enumerator#each is called
+  if (!rb_block_given_p()) {
+    return rb_funcall(self, rb_intern("to_enum"), 1, ID2SYM(rb_intern("each_codepoint")));
+  }
+  for(; i<len; i+=lastCharLen) {
+    // measure the character at the current offset, bounded by the bytes that
+    // remain - measuring from the start of the buffer would reuse the first
+    // character's length for every step
+    lastCharLen = utf8CharLen(str+i, len-i);
+    if (lastCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
+    cp = utf8CharToCodepoint(str+i, lastCharLen);
+    rb_yield(INT2FIX(cp));
+  }
+  return self;
+}
+/*
+ * call-seq: valid?(max_codepoint=nil)
+ *
+ * Checks whether this string is a valid UTF-8 byte sequence
+ *
+ * max_codepoint - an optional Fixnum used to declare this string invalid
+ *                 if a codepoint higher than that value is found
+ *                 if nothing is passed, the UTF-8 maximum of 0x10FFFF is assumed
+ *
+ * Returns: a Boolean - true if the string is valid, false if not
+ */
+static VALUE rb_cString_UTF8_valid(int argc, VALUE *argv, VALUE self) {
+  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
+  size_t len = RSTRING_LEN(self), i=0;
+  int8_t lastCharLen=0;
+  // cp_max stays negative when no limit was passed, which disables the check below
+  int32_t cp, cp_max = -1;
+  VALUE rb_cp_max;
+  if (rb_scan_args(argc, argv, "01", &rb_cp_max) == 1) {
+    Check_Type(rb_cp_max, T_FIXNUM);
+    cp_max = FIX2INT(rb_cp_max);
+  }
+  for(; i<len; i+=lastCharLen) {
+    // measure the character at the current offset, bounded by the bytes that
+    // remain - measuring from the start of the buffer would only ever
+    // validate the first character
+    lastCharLen = utf8CharLen(str+i, len-i);
+    if (lastCharLen < 0) {
+      return Qfalse;
+    }
+    cp = utf8CharToCodepoint(str+i, lastCharLen);
+    if (cp_max >= 0 && cp > cp_max) {
+      return Qfalse;
+    }
+  }
+  return Qtrue;
+}
+/*
+ * Works like String#[] but taking into account UTF-8 character boundaries
  *
  * This method doesn't currently (and may never) support Regexp parameters
  * It also doesn't support a String parameter (yet)
@@ -263,4 +333,6 @@ void init_String_UTF8() {
   rb_define_method(rb_cString_UTF8, "length",    rb_cString_UTF8_length, 0);
   rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
   rb_define_method(rb_cString_UTF8, "[]",        rb_cString_UTF8_slice, -1);
+  rb_define_method(rb_cString_UTF8, "each_codepoint", rb_cString_UTF8_each_codepoint, -1);
+  rb_define_method(rb_cString_UTF8, "valid?", rb_cString_UTF8_valid, -1);
 }

data/ext/utf8/utf8.c CHANGED Viewed

@@ -5,7 +5,7 @@
 /*
  * Scans the current position of the buffer
- * returning the length of this UTF8 character
+ * returning the length of this UTF-8 character
  */
 inline int8_t utf8CharLen(unsigned char *in, size_t in_len) {
   if (in_len > 0) {
@@ -60,7 +60,7 @@ inline int8_t utf8CharLen(unsigned char *in, size_t in_len) {
 /*
  * Scans the current position of the buffer
- * returning the total number of UTF8 characters found
+ * returning the total number of UTF-8 characters found
  */
 int64_t utf8CharCount(unsigned char *in, size_t in_len) {
   int64_t total = 0, leftOver = in_len;
@@ -81,3 +81,25 @@ int64_t utf8CharCount(unsigned char *in, size_t in_len) {
   return total;
 }
+/*
+ * Scans the current position of the buffer
+ * returning the codepoint for the UTF-8 character at this position
+ */
+int32_t utf8CharToCodepoint(unsigned char *in, size_t in_len) {
+  int32_t charLen = utf8CharLen(in, in_len);
+  int32_t trailing, codepoint, k;
+  // anything not reported as a multi-byte sequence is returned as its raw first byte
+  if (charLen <= 1) {
+    return in[0];
+  }
+  // the lead byte carries its payload in its low (6 - trailing) bits
+  trailing = charLen - 1;
+  codepoint = in[0] & ((1 << (6 - trailing)) - 1);
+  // each following byte contributes its low 6 bits
+  for (k = 1; k <= trailing; k++) {
+    codepoint = (codepoint << 6) | (in[k] & 0x3F);
+  }
+  return codepoint;
+}

data/ext/utf8/utf8.h CHANGED Viewed

@@ -3,5 +3,6 @@
 inline int8_t utf8CharLen(unsigned char *in, size_t in_len);
 int64_t utf8CharCount(unsigned char *in, size_t in_len);
+int32_t utf8CharToCodepoint(unsigned char *in, size_t in_len);
 #endif

data/lib/utf8.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require 'utf8/utf8'
 require 'utf8/string'
+require 'utf8/version' unless defined? String::UTF8::VERSION
 # explicitly require this in your app if you want to use it
 # require 'utf8/string_scanner'

data/lib/utf8/string.rb CHANGED Viewed

@@ -4,9 +4,7 @@ class String
     String::UTF8.new(self)
   end
-  class UTF8
-    VERSION = "0.1.2"
+  class UTF8 < ::String
     # Gives you access to the raw non-UTF8-aware version of the string
     def as_raw
       ::String.new(self)
@@ -15,5 +13,6 @@ class String
     alias :size  :length
     alias :chars :each_char
     alias :slice :[]
+    alias :codepoints :each_codepoint
   end
 end

data/lib/utf8/string_scanner.rb CHANGED Viewed

@@ -9,7 +9,7 @@ class StringScanner
     StringScanner::UTF8.new(self.string.as_utf8)
   end
-  class UTF8
+  class UTF8 < ::StringScanner
     # Returns a non-UTF8-aware version of StringScanner wrapping your original string
     #
     # NOTE: this will lose all state associated with the current StringScanner::UTF8 instance

data/lib/utf8/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+class String
+  class UTF8 < ::String
+    VERSION = "0.1.3"
+  end
+end

data/spec/string_spec.rb CHANGED Viewed

@@ -7,6 +7,7 @@ describe String::UTF8 do
     @str = @char_array.join
     @utf8 = @str.as_utf8
     @utf8_len = @char_array.size
+    @codepoints = @char_array.map{|c| c.unpack('U').first}
   end
   it "should blow up on invalid utf8 chars" do
@@ -67,10 +68,8 @@ describe String::UTF8 do
       end
       @utf8.chars.class.should eql(klass)
-      i=0
       @utf8.chars do |char|
         char.should_not be_nil
-        i+=1
       end
       joined = @utf8.chars.to_a.join
       @utf8.should eql(joined)
@@ -79,6 +78,25 @@ describe String::UTF8 do
     end
   end
+  context "#codepoints and #each_codepoint" do
+    it "should be utf8-aware" do
+      klass = begin
+        if defined? Encoding
+          Enumerator
+        else
+          Enumerable::Enumerator
+        end
+      end
+      @utf8.codepoints.class.should eql(klass)
+      @utf8.codepoints do |codepoint|
+        codepoint.should_not be_nil
+      end
+      @utf8.codepoints.to_a.size.should eql(@codepoints.size)
+      @utf8.codepoints.to_a.should eql(@codepoints)
+    end
+  end
   context "[offset] syntax" do
     it "should be utf8-aware" do
       @char_array.each_with_index do |char, i|
@@ -157,6 +175,26 @@ describe String::UTF8 do
     end
   end
+  context "#valid?" do
+    it "should test validity" do
+      # let's cut right into the middle of a sequence so we know it's bad
+      @str.force_encoding('binary') if @str.respond_to?(:force_encoding)
+      utf8 = @str[0..1]
+      utf8.force_encoding('utf-8') if utf8.respond_to?(:force_encoding)
+      utf8 = utf8.as_utf8
+      utf8.valid?.should be_false
+      @utf8.valid?.should be_true
+    end
+    it "should test validity using a maximum codepoint" do
+      highest_codepoint = @utf8.codepoints.to_a.max
+      @utf8.valid?(highest_codepoint).should be_true
+      @utf8.valid?(highest_codepoint-1).should be_false
+    end
+  end
   it "[Regexp] syntax shouldn't be supported yet" do
     lambda {
       @utf8[/a/]

data/utf8.gemspec CHANGED Viewed

@@ -1,37 +1,27 @@
-# encoding: utf-8
+require './lib/utf8/version'
 Gem::Specification.new do |s|
   s.name = %q{utf8}
-  s.version = "0.1.2"
-  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.version = String::UTF8::VERSION
   s.authors = ["Brian Lopez"]
-  s.date = %q{2011-01-12}
+  s.date = Time.now.utc.strftime("%Y-%m-%d")
   s.email = %q{seniorlopez@gmail.com}
   s.extensions = ["ext/utf8/extconf.rb"]
   s.extra_rdoc_files = [
     "README.rdoc"
   ]
-  s.files = [".gitignore", "MIT-LICENSE", "README.rdoc", "Rakefile", "benchmark/active_support.rb", "benchmark/test.txt", "ext/utf8/ext.c", "ext/utf8/ext.h", "ext/utf8/extconf.rb", "ext/utf8/string_scanner_utf8.c", "ext/utf8/string_scanner_utf8.h", "ext/utf8/string_utf8.c", "ext/utf8/string_utf8.h", "ext/utf8/utf8.c", "ext/utf8/utf8.h", "lib/utf8.rb", "lib/utf8/string.rb", "lib/utf8/string_scanner.rb", "spec/spec_helper.rb", "spec/string_scanner_spec.rb", "spec/string_spec.rb", "utf8.gemspec"]
+  s.files = `git ls-files`.split("\n")
   s.homepage = %q{http://github.com/brianmario/utf8}
+  s.rdoc_options = ["--charset=UTF-8"]
   s.require_paths = ["lib", "ext"]
   s.rubygems_version = %q{1.4.2}
   s.summary = %q{A lightweight UTF8-aware String class meant for use with Ruby 1.8}
-  s.test_files = ["spec/spec_helper.rb", "spec/string_scanner_spec.rb", "spec/string_spec.rb"]
-  if s.respond_to? :specification_version then
-    s.specification_version = 3
+  s.test_files = `git ls-files spec`.split("\n")
-    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
-      s.add_development_dependency(%q<rake-compiler>, [">= 0.7.5"])
-      s.add_development_dependency(%q<rspec>, [">= 2.0.0"])
-    else
-      s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
-      s.add_dependency(%q<rspec>, [">= 2.0.0"])
-    end
-  else
-    s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
-    s.add_dependency(%q<rspec>, [">= 2.0.0"])
-  end
+  # tests
+  s.add_development_dependency 'rake-compiler', ">= 0.7.5"
+  s.add_development_dependency 'rspec', ">= 2.0.0"
+  # benchmarks
+  s.add_development_dependency 'activesupport'
 end

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: utf8
 version: !ruby/object:Gem::Version
-  hash: 31
+  hash: 29
   prerelease:
   segments:
   - 0
   - 1
-  - 2
-  version: 0.1.2
+  - 3
+  version: 0.1.3
 platform: ruby
 authors:
 - Brian Lopez
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-01-12 00:00:00 -08:00
+date: 2011-06-02 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -50,6 +50,20 @@ dependencies:
         version: 2.0.0
   type: :development
   version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: activesupport
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  type: :development
+  version_requirements: *id003
 description:
 email: seniorlopez@gmail.com
 executables: []
@@ -60,6 +74,8 @@ extra_rdoc_files:
 - README.rdoc
 files:
 - .gitignore
+- .rspec
+- Gemfile
 - MIT-LICENSE
 - README.rdoc
 - Rakefile
@@ -77,6 +93,7 @@ files:
 - lib/utf8.rb
 - lib/utf8/string.rb
 - lib/utf8/string_scanner.rb
+- lib/utf8/version.rb
 - spec/spec_helper.rb
 - spec/string_scanner_spec.rb
 - spec/string_spec.rb
@@ -86,8 +103,8 @@ homepage: http://github.com/brianmario/utf8
 licenses: []
 post_install_message:
-rdoc_options: []
+rdoc_options:
+- --charset=UTF-8
 require_paths:
 - lib
 - ext
@@ -112,7 +129,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project:
-rubygems_version: 1.4.2
+rubygems_version: 1.6.2
 signing_key:
 specification_version: 3
 summary: A lightweight UTF8-aware String class meant for use with Ruby 1.8