RubyGems - fast_xs - Versions diffs - 0.2 - Mend

fast_xs 0.2

Files changed (11) hide show

data/CHANGELOG +8 -0
data/COPYING +22 -0
data/README +18 -0
data/Rakefile +119 -0
data/ext/fast_xs/extconf.rb +4 -0
data/ext/fast_xs/fast_xs.c +279 -0
data/lib/fast_xs_monkey_patcher.rb +32 -0
data/test/test_cgi_class_overrides.rb +35 -0
data/test/test_erb_util_module_overrides.rb +29 -0
data/test/test_xml_escaping.rb +39 -0
metadata +70 -0

data/CHANGELOG ADDED Viewed

@@ -0,0 +1,8 @@
+= 0.2
+=== 6th December, 2007
+* First real release
+* added fast_xs_html and fast_xs_cgi methods
+= 0.1
+=== 3rd October, 2007
+* initial implementation, not officially released

data/COPYING ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2007 Eric Wong <normalperson@yhbt.net>
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.

data/README ADDED Viewed

@@ -0,0 +1,18 @@
+= fast_xs, a fast C extension for cleaning XML
+== Overview
+  The original fast_xs method is based on the xchar code by Sam Ruby:
+    http://intertwingly.net/stories/2005/09/28/xchar.rb
+    http://intertwingly.net/blog/2005/09/28/XML-Cleansing
+  _why also packages a slightly modified version in the latest version
+  of Hpricot (not-yet-released) that also escapes '"' to "&quot;" (which
+  is optional).
+  This is an exact translation (to the best of my knowledge :) of Sam's
+  original implementation, so it does not escape "&quot;".  The version
+  of XML::Builder packaged with Rails also uses Sam's version, and
+  XML::Builder as packaged in Rails 2.0 will automatically use
+  fast_xs if available.

data/Rakefile ADDED Viewed

@@ -0,0 +1,119 @@
+require 'rake'
+require 'rake/clean'
+require 'rake/gempackagetask'
+require 'rake/rdoctask'
+require 'rake/testtask'
+require 'hoe'
+require 'fileutils'
+include FileUtils  # so mkdir_p, chmod and cp can be called without a receiver
+name = "fast_xs"  # gem name; also used to build package file names
+rev = `git describe 2>/dev/null`.chomp rescue nil  # falls back to nil if running the command raises
+version = ENV['VERSION'] || "0.2" + (rev && rev.length > 0 ? "-#{rev}" : "")  # VERSION from the environment wins; otherwise "0.2" plus "-<rev>" when git printed something
+pkg = "#{name}-#{version}"
+bin = "*.{so,o}"  # glob matching shared libraries and object files
+archlib = "lib/#{::Config::CONFIG['arch']}"  # architecture-specific directory under lib/
+CLEAN.include ["ext/fast_xs/#{bin}", "lib/**/#{bin}",
+               'ext/fast_xs/Makefile', '**/.*.sw?', '*.gem', '.config']  # patterns added to the CLEAN list from rake/clean
+rdoc_opts = ['--quiet', '--title', 'fast_xs notes', '--main', 'README',
+             '--inline-source']  # shared by the gemspec and the rdoc task
+pkg_files = %w(CHANGELOG COPYING README Rakefile) +
+            Dir.glob("{test,lib}/**/*.rb") +
+            Dir.glob("ext/**/*.{c,rb}")  # files shipped in the gem: docs, Ruby sources and extension sources
+spec = Gem::Specification.new do |s|  # gem metadata; built from the variables defined earlier in this Rakefile
+  s.name = name
+  s.version = version
+  s.platform = Gem::Platform::RUBY
+  s.has_rdoc = true
+  s.rdoc_options += rdoc_opts
+  s.extra_rdoc_files = ["README", "CHANGELOG", "COPYING"]
+  s.summary = "escape faster!"
+  s.description = s.summary  # the description simply repeats the summary
+  s.author = "Eric Wong"
+  s.email = 'normalperson@yhbt.net'
+  s.homepage = 'http://bogonips.org/fast_xs/'
+  s.files = pkg_files
+  s.require_paths = [archlib, "lib"]  # the arch-specific directory is listed before plain lib
+  s.extensions = FileList["ext/**/extconf.rb"].to_a  # every extconf.rb found under ext/
+end
+hoe = Hoe.new(name, version) do |p|  # Hoe settings are copied from the gemspec fields
+  p.author = spec.author
+  p.description = spec.description
+  p.email = spec.email
+  p.summary = spec.summary
+  p.url = spec.homepage
+  p.rubyforge_name = 'fast-xs'
+end
+desc "Does a full compile, test run"
+task :default => [:compile, :test]  # default target: build the extension, then run the tests
+desc "Run all the tests"
+Rake::TestTask.new do |t|
+  t.libs << "test" << archlib  # adds test/ and the arch-specific lib directory to the test task's libs
+  t.test_files = FileList['test/test_*.rb']
+  t.verbose = true
+end
+Rake::RDocTask.new do |rdoc|
+  rdoc.rdoc_dir = 'doc/rdoc'  # output directory for the generated documentation
+  rdoc.options += rdoc_opts
+  rdoc.main = "README"
+  rdoc.rdoc_files.add ['README', 'CHANGELOG', 'COPYING' ]
+end
+Rake::GemPackageTask.new(spec) do |p|
+  p.need_tar = true  # NOTE(review): presumably also requests a tarball next to the .gem - confirm against the rake docs
+  p.gem_spec = spec
+end
+['fast_xs'].each do |extension|  # defines the build rules for each listed extension
+  ext = "ext/#{extension}"
+  ext_so = "#{ext}/#{extension}.#{Config::CONFIG['DLEXT']}"  # path of the built library, using the DLEXT suffix from Config::CONFIG
+  ext_files = FileList[  # prerequisites of the built library
+    "#{ext}/*.c",
+    "#{ext}/*.h",
+    "#{ext}/*.rl",
+    "#{ext}/extconf.rb",
+    "#{ext}/Makefile",
+    "lib"
+  ]
+  desc "Builds just the #{extension} extension"
+  task extension.to_sym => ["#{ext}/Makefile", ext_so ]
+  file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do  # the Makefile is generated by running extconf.rb inside the extension directory
+    Dir.chdir(ext) do ruby "extconf.rb" end
+  end
+  file ext_so => ext_files do  # run make, then copy the built library into archlib
+    Dir.chdir(ext) do sh('make') end
+    mkdir_p archlib
+    chmod 0644, ext_so
+    cp ext_so, archlib
+  end
+end
+task "lib" do  # NOTE(review): directory is called inside the task body, so the "lib" directory rule is only declared when this task runs - confirm this is intended
+  directory "lib"
+end
+desc "Compiles the Ruby extension"
+task :compile => [:fast_xs] do
+  if Dir.glob(File.join(archlib,"fast_xs.*")).empty?  # nothing named fast_xs.* in archlib means the build produced no library
+    STDERR.puts 'Gem failed to build'
+    exit(1)
+  end
+end
+task :install do
+  sh %{rake package}  # build the package in a separate rake process first
+  sh %{sudo gem install pkg/#{name}-#{version}}  # NOTE(review): the path has no ".gem" suffix - confirm the gem command in use resolves it
+end
+task :uninstall => [:clean] do
+  sh %{sudo gem uninstall #{name}}
+end

data/ext/fast_xs/extconf.rb ADDED Viewed

@@ -0,0 +1,4 @@
+# Build configuration for the fast_xs C extension.
+require 'mkmf'
+# fast_xs.c includes <assert.h>, so the build cannot succeed without it.
+# Stop with a message and a non-zero status; a bare `exit` would report
+# success to whatever invoked this script.
+have_header('assert.h') or abort 'fast_xs: assert.h is required to build this extension'
+dir_config('fast_xs')
+create_makefile('fast_xs')

data/ext/fast_xs/fast_xs.c ADDED Viewed

@@ -0,0 +1,279 @@
+/* extension version, kept in step with the 0.2 gem release */
+#define VERSION	"0.2"
+#include <ruby.h>
+#include <assert.h>
+/* ID of the "unpack" method, interned in Init_fast_xs */
+static ID unpack_id;
+/* the "U*" and "C*" unpack format strings, created in Init_fast_xs */
+static VALUE U_fmt, C_fmt;
+/* give GCC hints for better branch prediction
+ * (we layout branches so that ASCII characters are handled faster) */
+#if defined(__GNUC__) && (__GNUC__ >= 3)
+#  define likely(x)		__builtin_expect (!!(x), 1)
+#  define unlikely(x)		__builtin_expect (!!(x), 0)
+#else
+/* other compilers: the hints collapse to the bare expression */
+#  define unlikely(x)		(x)
+#  define likely(x)		(x)
+#endif
+/*
+ * Replacement code points for bytes 128..159 when they are read as CP-1252.
+ * The table is indexed by (byte - 128).
+ *
+ * p(x) marks the bytes that have no CP-1252 mapping and pass through
+ * unchanged.  It must expand to the code point itself: the table holds
+ * replacement values, not indexes, so subtracting 128 here would turn
+ * these bytes into control characters that are then rejected as invalid.
+ */
+#define p(x) (x)
+static const int cp_1252[] = {
+	8364,		/* 128 => 8364, euro sign */
+	p(129),		/* 129 => 129,  pass-through */
+	8218,		/* 130 => 8218, single low-9 quotation mark */
+	402,		/* 131 =>  402, latin small letter f with hook */
+	8222,		/* 132 => 8222, double low-9 quotation mark */
+	8230,		/* 133 => 8230, horizontal ellipsis */
+	8224,		/* 134 => 8224, dagger */
+	8225,		/* 135 => 8225, double dagger */
+	710,		/* 136 =>  710, modifier letter circumflex accent */
+	8240,		/* 137 => 8240, per mille sign */
+	352,		/* 138 =>  352, latin capital letter s with caron */
+	8249,		/* 139 => 8249, single left-pointing angle quotation mark */
+	338,		/* 140 =>  338, latin capital ligature oe */
+	p(141),		/* 141 =>  141, pass-through */
+	381,		/* 142 =>  381, latin capital letter z with caron */
+	p(143),		/* 143 =>  143, pass-through */
+	p(144),		/* 144 =>  144, pass-through */
+	8216,		/* 145 => 8216, left single quotation mark */
+	8217,		/* 146 => 8217, right single quotation mark */
+	8220,		/* 147 => 8220, left double quotation mark */
+	8221,		/* 148 => 8221, right double quotation mark */
+	8226,		/* 149 => 8226, bullet */
+	8211,		/* 150 => 8211, en dash */
+	8212,		/* 151 => 8212, em dash */
+	732,		/* 152 =>  732, small tilde */
+	8482,		/* 153 => 8482, trade mark sign */
+	353,		/* 154 =>  353, latin small letter s with caron */
+	8250,		/* 155 => 8250, single right-pointing angle quotation mark */
+	339,		/* 156 =>  339, latin small ligature oe */
+	p(157),		/* 157 =>  157, pass-through */
+	382,		/* 158 =>  382, latin small letter z with caron */
+	376		/* 159 =>  376, latin capital letter y with diaeresis */
+};
+/*
+ * True when n lies in one of the three accepted code point ranges.
+ * The argument and the whole expression are parenthesized so the macro
+ * can be used inside any larger expression.
+ */
+#define VALID_VALUE(n) \
+	(((n) >= 0x20 && (n) <= 0xD7FF) || \
+	    ((n) >= 0xE000 && (n) <= 0xFFFD) || \
+	    ((n) >= 0x10000 && (n) <= 0x10FFFF))
+/*
+ * Replace n in place with its cp_1252 table entry when it is in 128..159.
+ * n must be a modifiable lvalue.
+ */
+#define CP_1252_ESCAPE(n) do { \
+	if ((n) >= 128 && (n) <= 159) \
+		(n) = cp_1252[(n) - 128]; \
+	} while(0)
+/*
+ * Length in bytes of the numeric reference "&#N;" reserved for n.
+ * Anything below 1000 gets the three-digit width; each further power of
+ * ten adds one byte, up to seven digits (we won't have cases above
+ * 0x10FFFF).
+ */
+static inline size_t bytes_for(int n)
+{
+	size_t width = sizeof("&#999;") - 1;
+	int limit;
+	/* widen by one byte for every threshold n has reached */
+	for (limit = 1000; limit <= 1000000 && n >= limit; limit *= 10)
+		++width;
+	return width;
+}
+/*
+ * Write the escaped form of code point n at buf and return the number of
+ * bytes written.  buf must have room for what escaped_len(n) reports.
+ *
+ * ASCII: '&', '<' and '>' become entities, tab/newline/carriage return and
+ * anything from 0x20 up are copied, other control characters become '*'.
+ * Non-ASCII: the value goes through the CP-1252 table and is written as a
+ * decimal "&#N;" reference when valid, '*' otherwise.
+ */
+static size_t escape(char *buf, int n)
+{
+#define return_const_len(x) do { \
+	memcpy(buf, x, sizeof(x) - 1); \
+	return (sizeof(x) - 1); \
+} while (0)
+	/* handle ASCII first */
+	if (likely(n < 128)) {
+		if (likely(n >= 0x20 || n == '\t' || n == '\n' || n == '\r')) {
+			if (unlikely(n == '&'))
+				return_const_len("&amp;");
+			if (unlikely(n == '<'))
+				return_const_len("&lt;");
+			if (unlikely(n == '>'))
+				return_const_len("&gt;");
+			buf[0] = (char)n;
+			return 1;
+		}
+		buf[0] = '*';
+		return 1;
+	}
+#undef return_const_len
+	CP_1252_ESCAPE(n);
+	if (VALID_VALUE(n)) {
+		/* return snprintf(buf, sizeof("&#1114111;"), "&#%i;", n); */
+		size_t rv = sizeof("&#;") - 1;
+		/* the reference is written right to left, starting at its end */
+		buf += bytes_for(n);
+		*--buf = ';';
+		do {
+			/* n is positive here, so n % 10 is 0..9; computing the
+			 * digit directly avoids relying on the interpreter's
+			 * internal ruby_digitmap symbol */
+			*--buf = (char)('0' + (n % 10));
+			++rv;
+		} while (n /= 10);
+		*--buf = '#';
+		*--buf = '&';
+		return rv;
+	}
+	buf[0] = '*';
+	return 1;
+}
+/*
+ * Number of bytes escape() will emit for code point n; used to size the
+ * output before anything is written.
+ */
+static long escaped_len(int n)
+{
+	if (unlikely(n >= 128)) {
+		/* non-ASCII: numeric reference when valid, one byte otherwise */
+		CP_1252_ESCAPE(n);
+		if (VALID_VALUE(n))
+			return bytes_for(n);
+		return 1;
+	}
+	switch (n) {
+	case '&':
+		return (sizeof("&amp;") - 1);
+	case '<':
+	case '>':
+		/* "&lt;" and "&gt;" have the same length */
+		return (sizeof("&gt;") - 1);
+	default:
+		return 1;
+	}
+}
+static VALUE unpack_utf8(VALUE self)	/* calls self.unpack with the U_fmt format string */
+{
+	return rb_funcall(self, unpack_id, 1, U_fmt);
+}
+static VALUE unpack_uchar(VALUE self)	/* calls self.unpack with the C_fmt format string; fast_xs passes it to rb_rescue as the fallback */
+{
+	return rb_funcall(self, unpack_id, 1, C_fmt);
+}
+/*
+ * String#fast_xs: returns a new string with the receiver escaped for XML.
+ * The receiver is unpacked with "U*"; if that raises, rb_rescue retries
+ * with "C*".  The first pass over the resulting integers measures the
+ * output, the second writes it.
+ *
+ * The result is written straight into a Ruby string of the final size
+ * rather than an alloca() buffer, so long inputs cannot overflow the C
+ * stack and no extra copy is made.
+ */
+static VALUE fast_xs(VALUE self)
+{
+	long i;
+	VALUE unpacked, rv;
+	VALUE *tmp;
+	char *c;
+	size_t s_len = 0;
+	unpacked = rb_rescue(unpack_utf8, self, unpack_uchar, self);
+	for (tmp = RARRAY(unpacked)->ptr, i = RARRAY(unpacked)->len; --i >= 0; tmp++)
+		s_len += escaped_len(NUM2INT(*tmp));
+	/* a NULL source makes rb_str_new allocate s_len bytes without copying */
+	rv = rb_str_new(NULL, s_len);
+	c = RSTRING(rv)->ptr;
+	for (tmp = RARRAY(unpacked)->ptr, i = RARRAY(unpacked)->len; --i >= 0; tmp++)
+		c += escape(c, NUM2INT(*tmp));
+	return rv;
+}
+/*
+ * String#fast_xs_html: returns a copy of the receiver with '&', '<', '>'
+ * and '"' replaced by their HTML entities.
+ *
+ * This is encoding agnostic and works on each byte, so some multibyte
+ * character sets may not be fully supported (but UTF-8 should be).
+ * This is meant to be 100% compatible with the
+ * ERB::Util::escape_html and CGI::escapeHTML methods
+ *
+ * The output is written straight into a Ruby string of the final size
+ * rather than an alloca() buffer, so long inputs cannot overflow the C
+ * stack.
+ */
+static VALUE fast_xs_html(VALUE self)
+{
+	struct RString *string = RSTRING(self);
+	long i;
+	char *s;
+	size_t new_len = 0;
+	char *new_str;
+	VALUE rv;
+	/* first pass: measure the escaped length */
+	for (s = string->ptr, i = string->len; --i >= 0; ++s) {
+		if (unlikely(*s == '&'))
+			new_len += (sizeof("&amp;") - 1);
+		else if (unlikely(*s == '<' || *s == '>'))
+			new_len += (sizeof("&gt;") - 1);
+		else if (unlikely(*s == '"'))
+			new_len += (sizeof("&quot;") - 1);
+		else
+			new_len += 1;
+	}
+	/* a NULL source makes rb_str_new allocate new_len bytes without copying */
+	rv = rb_str_new(NULL, new_len);
+	new_str = RSTRING(rv)->ptr;
+	/* copy the literal and advance the char pointer; this avoids doing
+	 * arithmetic on the void pointer memcpy returns */
+#define append_const(buf, x) do { \
+	memcpy(buf, x, sizeof(x) - 1); \
+	buf += sizeof(x) - 1; \
+} while (0)
+	/* second pass: write the escaped bytes */
+	for (s = string->ptr, i = string->len; --i >= 0; ++s) {
+		if (unlikely(*s == '&'))
+			append_const(new_str, "&amp;");
+		else if (unlikely(*s == '<'))
+			append_const(new_str, "&lt;");
+		else if (unlikely(*s == '>'))
+			append_const(new_str, "&gt;");
+		else if (unlikely(*s == '"'))
+			append_const(new_str, "&quot;");
+		else
+			*new_str++ = *s;
+	}
+#undef append_const
+	return rv;
+}
+/*
+ * True for the bytes fast_xs_cgi copies unchanged: ASCII letters, digits,
+ * '.', '-' and '_'.  The argument is parenthesized so any expression can
+ * be passed; it is evaluated several times, so it must have no side
+ * effects.
+ */
+#define CGI_URI_OK(x) \
+	(((x) >= 'a' && (x) <= 'z') || \
+	 ((x) >= 'A' && (x) <= 'Z') || \
+	 ((x) >= '0' && (x) <= '9') || \
+	 ((x) == '.' || (x) == '-' || (x) == '_'))
+/*
+ * String#fast_xs_cgi: returns a URL-encoded copy of the receiver.
+ * Compatible with CGI::escape(), this iterates through each byte, so
+ * multibyte character sets may not be supported (but UTF-8 should be).
+ *
+ * Bytes accepted by CGI_URI_OK are copied, ' ' becomes '+', and every
+ * other byte becomes "%XX" with uppercase hex digits.
+ *
+ * The input is read through an unsigned char pointer: with a plain char
+ * that is signed, bytes >= 0x80 are negative, and the % 16 and / 16 results
+ * would index outside cgi_digitmap.  The output is written straight into
+ * a Ruby string of the final size rather than an alloca() buffer, so long
+ * inputs cannot overflow the C stack.
+ */
+static VALUE fast_xs_cgi(VALUE self)
+{
+	struct RString *string = RSTRING(self);
+	long i;
+	const unsigned char *s;
+	size_t new_len = 0;
+	char *new_str;
+	VALUE rv;
+	/* first pass: measure the encoded length */
+	for (s = (const unsigned char *)string->ptr, i = string->len; --i >= 0; ++s) {
+		if (likely(CGI_URI_OK(*s) || *s == ' '))
+			++new_len;
+		else /* we'll only get <= "%FF" here */
+			new_len += 3;
+	}
+	/* a NULL source makes rb_str_new allocate new_len bytes without copying */
+	rv = rb_str_new(NULL, new_len);
+	new_str = RSTRING(rv)->ptr;
+	/* second pass: write the encoded bytes */
+	for (s = (const unsigned char *)string->ptr, i = string->len; --i >= 0; ++s) {
+		if (likely(CGI_URI_OK(*s)))
+			*new_str++ = (char)*s;
+		else if (*s == ' ')
+			*new_str++ = '+';
+		else {
+			static const char cgi_digitmap[] = "0123456789ABCDEF";
+			/* *s is 0..255, so both indexes are 0..15 */
+			new_str[2] = cgi_digitmap[*s % 16];
+			new_str[1] = cgi_digitmap[*s / 16];
+			new_str[0] = '%';
+			new_str += 3;
+		}
+	}
+	return rv;
+}
+/*
+ * Extension entry point: interns "unpack", creates the two unpack format
+ * strings and defines String#fast_xs, String#fast_xs_html and
+ * String#fast_xs_cgi.
+ */
+void Init_fast_xs(void)
+{
+	assert(cp_1252[159 - 128] == 376); /* just in case I skipped a line */
+	unpack_id = rb_intern("unpack");
+	/* register the static slots with the GC before storing objects in
+	 * them, so neither string is unprotected while the other is being
+	 * allocated */
+	rb_global_variable(&U_fmt);
+	rb_global_variable(&C_fmt);
+	U_fmt = rb_str_new("U*", 2);
+	C_fmt = rb_str_new("C*", 2);
+	rb_define_method(rb_cString, "fast_xs", fast_xs, 0);
+	rb_define_method(rb_cString, "fast_xs_html", fast_xs_html, 0);
+	rb_define_method(rb_cString, "fast_xs_cgi", fast_xs_cgi, 0);
+}

data/lib/fast_xs_monkey_patcher.rb ADDED Viewed

@@ -0,0 +1,32 @@
+require 'fast_xs'
+if defined?(CGI)
+  class CGI
+    def CGI::escapeHTML(value)
+      value.to_s.fast_xs_html
+    end
+    def CGI::escape(value)
+      value.to_s.fast_xs_cgi
+    end
+  end
+end
+if defined?(ERB::Util)
+  module ERB::Util
+    def html_escape(value)
+      value.to_s.fast_xs_html
+    end
+    alias h html_escape
+    module_function :h
+    module_function :html_escape
+  end
+end

data/test/test_cgi_class_overrides.rb ADDED Viewed

@@ -0,0 +1,35 @@
+require 'test/unit'
+require 'cgi'
+load 'fast_xs_monkey_patcher.rb'
+class TestCgiClassOverrides < Test::Unit::TestCase
+  def test_escape_html_predefined
+    assert_equal '&amp;', CGI::escapeHTML('&')
+    assert_equal '&quot;', CGI::escapeHTML('"')
+    assert_equal '&lt;', CGI::escapeHTML('<')
+    assert_equal '&gt;', CGI::escapeHTML('>')
+  end
+  def test_escape_html_normal
+    assert_equal 'hello world', CGI::escapeHTML('hello world')
+    assert_equal '', CGI::escapeHTML('')
+  end
+  def test_escape_html_ignore
+    assert_equal "\x00", CGI::escapeHTML("\x00")
+    assert_equal "\x0C", CGI::escapeHTML("\x0C")
+    assert_equal "\xEF\xBF\xBF", CGI::escapeHTML("\xEF\xBF\xBF")
+  end
+  def test_escape_cgi
+    assert_equal 'hello%3Dworld', CGI::escape('hello=world')
+    assert_equal '+', CGI::escape(' ')
+    assert_equal '%2B', CGI::escape('+')
+    assert_equal '%2C', CGI::escape(',')
+    assert_equal 'hello-world', CGI::escape('hello-world')
+    assert_equal 'H3LL0+W0RLD', CGI::escape('H3LL0 W0RLD')
+  end
+end

data/test/test_erb_util_module_overrides.rb ADDED Viewed

@@ -0,0 +1,29 @@
+require 'erb'
+require 'test/unit'
+load 'fast_xs_monkey_patcher.rb'
+class TestErbUtilModuleOverrides < Test::Unit::TestCase
+  include ERB::Util
+  def test_escape_html_predefined
+    assert_equal '&amp;', html_escape('&')
+    assert_equal '&quot;', html_escape('"')
+    assert_equal '&lt;', html_escape('<')
+    assert_equal '&gt;', html_escape('>')
+  end
+  def test_escape_html_normal
+    assert_equal 'hello world', html_escape('hello world')
+    assert_equal '', html_escape('')
+  end
+  def test_escape_html_ignore
+    assert_equal "\x00", html_escape("\x00")
+    assert_equal "\x0C", html_escape("\x0C")
+    assert_equal "\xEF\xBF\xBF", html_escape("\xEF\xBF\xBF")
+  end
+end

data/test/test_xml_escaping.rb ADDED Viewed

@@ -0,0 +1,39 @@
+require 'fast_xs'
+require 'test/unit'
+# Based on Sam's original xchar.rb tests:
+class TestXmlEscaping < Test::Unit::TestCase
+  def test_ascii
+    assert_equal 'abc', 'abc'.fast_xs
+  end
+  def test_predefined
+    assert_equal '&amp;', '&'.fast_xs              # ampersand
+    assert_equal '&lt;',  '<'.fast_xs              # left angle bracket
+    assert_equal '&gt;',  '>'.fast_xs              # right angle bracket
+  end
+  def test_invalid
+    assert_equal '*', "\x00".fast_xs               # null
+    assert_equal '*', "\x0C".fast_xs               # form feed
+    assert_equal '*', "\xEF\xBF\xBF".fast_xs       # U+FFFF
+  end
+  def test_iso_8859_1
+    assert_equal '&#231;', "\xE7".fast_xs          # small c cedilla
+    assert_equal '&#169;', "\xA9".fast_xs          # copyright symbol
+  end
+  def test_win_1252
+    assert_equal '&#8217;', "\x92".fast_xs         # smart quote
+    assert_equal '&#8364;', "\x80".fast_xs         # euro
+  end
+  def test_utf8
+    assert_equal '&#8217;', "\xE2\x80\x99".fast_xs # right single quote
+    assert_equal '&#169;',  "\xC2\xA9".fast_xs     # copy
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,70 @@
+--- !ruby/object:Gem::Specification
+rubygems_version: 0.9.4.7
+specification_version: 2
+name: fast_xs
+version: !ruby/object:Gem::Version
+  version: "0.2"
+date: 2007-12-07 00:00:00 -08:00
+summary: escape faster!
+require_paths:
+- lib/i486-linux
+- lib
+email: normalperson@yhbt.net
+homepage: http://bogonips.org/fast_xs/
+rubyforge_project:
+description: escape faster!
+autorequire:
+default_executable:
+bindir: bin
+has_rdoc: true
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+platform: ruby
+signing_key:
+cert_chain: []
+post_install_message:
+authors:
+- Eric Wong
+files:
+- CHANGELOG
+- COPYING
+- README
+- Rakefile
+- test/test_cgi_class_overrides.rb
+- test/test_xml_escaping.rb
+- test/test_erb_util_module_overrides.rb
+- lib/fast_xs_monkey_patcher.rb
+- ext/fast_xs/fast_xs.c
+- ext/fast_xs/extconf.rb
+test_files: []
+rdoc_options:
+- --quiet
+- --title
+- fast_xs notes
+- --main
+- README
+- --inline-source
+extra_rdoc_files:
+- README
+- CHANGELOG
+- COPYING
+executables: []
+extensions:
+- ext/fast_xs/extconf.rb
+requirements: []
+dependencies: []