RubyGems - adamh-hpricot - Versions diffs - 0.6.168 - Mend

adamh-hpricot 0.6.168

Files changed (43) hide show

data/CHANGELOG +62 -0
data/COPYING +18 -0
data/README +284 -0
data/Rakefile +259 -0
data/ext/fast_xs/FastXsService.java +1018 -0
data/ext/fast_xs/extconf.rb +4 -0
data/ext/fast_xs/fast_xs.c +194 -0
data/ext/hpricot_scan/extconf.rb +6 -0
data/ext/hpricot_scan/hpricot_common.rl +76 -0
data/ext/hpricot_scan/hpricot_scan.h +79 -0
data/ext/hpricot_scan/hpricot_scan.java.rl +373 -0
data/ext/hpricot_scan/hpricot_scan.rl +649 -0
data/extras/mingw-rbconfig.rb +176 -0
data/lib/hpricot/blankslate.rb +63 -0
data/lib/hpricot/builder.rb +209 -0
data/lib/hpricot/elements.rb +510 -0
data/lib/hpricot/htmlinfo.rb +672 -0
data/lib/hpricot/inspect.rb +103 -0
data/lib/hpricot/modules.rb +38 -0
data/lib/hpricot/parse.rb +36 -0
data/lib/hpricot/tag.rb +186 -0
data/lib/hpricot/tags.rb +164 -0
data/lib/hpricot/traverse.rb +838 -0
data/lib/hpricot/xchar.rb +94 -0
data/lib/hpricot.rb +26 -0
data/test/files/basic.xhtml +17 -0
data/test/files/boingboing.html +2266 -0
data/test/files/cy0.html +3653 -0
data/test/files/immob.html +400 -0
data/test/files/pace_application.html +1320 -0
data/test/files/tenderlove.html +16 -0
data/test/files/uswebgen.html +220 -0
data/test/files/utf8.html +1054 -0
data/test/files/week9.html +1723 -0
data/test/files/why.xml +19 -0
data/test/load_files.rb +7 -0
data/test/test_alter.rb +77 -0
data/test/test_builder.rb +37 -0
data/test/test_parser.rb +400 -0
data/test/test_paths.rb +25 -0
data/test/test_preserved.rb +66 -0
data/test/test_xml.rb +28 -0
metadata +107 -0

data/ext/fast_xs/extconf.rb ADDED Viewed

@@ -0,0 +1,4 @@
+# Generates the Makefile for the fast_xs extension.
+require 'mkmf'
+# Stop configuring when <stdio.h> cannot be found.
+exit unless have_header('stdio.h')
+dir_config('fast_xs')
+create_makefile('fast_xs')

data/ext/fast_xs/fast_xs.c ADDED Viewed

@@ -0,0 +1,194 @@
+#define VERSION	"0.1"
+#include <ruby.h>
+#include <assert.h>
+/* #include <stdio.h> */
+/* Method id and pack-format strings; assigned once in Init_fast_xs(). */
+static ID unpack_id;
+static VALUE U_fmt;
+static VALUE C_fmt;
+/* Branch-prediction hints for GCC 3 and later; on any other compiler the
+ * macros expand to the bare condition.  The ASCII paths are the ones
+ * marked likely(). */
+#if !defined(__GNUC__) || (__GNUC__ < 3)
+#  define likely(x)		(x)
+#  define unlikely(x)		(x)
+#else
+#  define likely(x)		__builtin_expect (!!(x), 1)
+#  define unlikely(x)		__builtin_expect (!!(x), 0)
+#endif
+/* pass-through certain characters for CP-1252
+ *
+ * cp_1252[] is indexed with (value - 128) for values 128..159 and holds the
+ * replacement value used by CP_1252_ESCAPE().
+ *
+ * NOTE(review): p(x) expands to (x-128), so the "pass-through" slots hold
+ * 1, 13, 15, 16 and 29 rather than the 129, 141, 143, 144 and 157 that the
+ * per-line comments state.  Those small values fail VALID_VALUE(), so the
+ * corresponding bytes are emitted as '*'.  Confirm whether that is intended
+ * before touching the macro. */
+#define p(x) (x-128)
+static const int cp_1252[] = {
+	8364,		/* 128 => 8364, euro sign */
+	p(129),		/* 129 => 129,  pass-through */
+	8218,		/* 130 => 8218, single low-9 quotation mark */
+	402,		/* 131 =>  402, latin small letter f with hook */
+	8222,		/* 132 => 8222, double low-9 quotation mark */
+	8230,		/* 133 => 8230, horizontal ellipsis */
+	8224,		/* 134 => 8224, dagger */
+	8225,		/* 135 => 8225, double dagger */
+	710,		/* 136 =>  710, modifier letter circumflex accent */
+	8240,		/* 137 => 8240, per mille sign */
+	352,		/* 138 =>  352, latin capital letter s with caron */
+	8249,		/* 139 => 8249, single left-pointing angle quotation mark */
+	338,		/* 140 =>  338, latin capital ligature oe */
+	p(141),		/* 141 =>  141, pass-through */
+	381,		/* 142 =>  381, latin capital letter z with caron */
+	p(143),		/* 143 =>  143, pass-through */
+	p(144),		/* 144 =>  144, pass-through */
+	8216,		/* 145 => 8216, left single quotation mark */
+	8217,		/* 146 => 8217, right single quotation mark */
+	8220,		/* 147 => 8220, left double quotation mark */
+	8221,		/* 148 => 8221, right double quotation mark */
+	8226,		/* 149 => 8226, bullet */
+	8211,		/* 150 => 8211, en dash */
+	8212,		/* 151 => 8212, em dash */
+	732,		/* 152 =>  732, small tilde */
+	8482,		/* 153 => 8482, trade mark sign */
+	353,		/* 154 =>  353, latin small letter s with caron */
+	8250,		/* 155 => 8250, single right-pointing angle quotation mark */
+	339,		/* 156 =>  339, latin small ligature oe */
+	p(157),		/* 157 =>  157, pass-through */
+	382,		/* 158 =>  382, latin small letter z with caron */
+	376		/* 159 =>  376, latin capital letter y with diaeresis */
+};
+/* True when n lies in 0x20-0xD7FF, 0xE000-0xFFFD or 0x10000-0x10FFFF.
+ * The expansion and every use of n are parenthesized so the macro can be
+ * negated or combined with other operators; n is evaluated more than once,
+ * so pass an expression without side effects. */
+#define VALID_VALUE(n) \
+	(((n) >= 0x20 && (n) <= 0xD7FF) || \
+	    ((n) >= 0xE000 && (n) <= 0xFFFD) || \
+	    ((n) >= 0x10000 && (n) <= 0x10FFFF))
+/* Replace n in place with its cp_1252[] entry when it lies in 128..159;
+ * other values are left untouched.  n must be a modifiable lvalue and is
+ * evaluated several times. */
+#define CP_1252_ESCAPE(n) do { \
+	if ((n) >= 128 && (n) <= 159) \
+		(n) = cp_1252[(n) - 128]; \
+	} while(0)
+/* Copy string literal x (without its NUL) into the enclosing function's
+ * `buf` and return the literal's length from that function. */
+#define return_const_len(x) do { \
+	const size_t const_len_ = sizeof(x) - 1; \
+	memcpy(buf, x, const_len_); \
+	return const_len_; \
+} while (0)
+/* Width in bytes of the "&#N;" field reserved for n: three digits at the
+ * least, one more for each power of ten from 1000 up, seven at the most
+ * (we won't have cases above 0x10FFFF). */
+static inline size_t bytes_for(int n)
+{
+	size_t width = sizeof("&#999;") - 1;
+	int limit;
+	for (limit = 1000; limit <= 1000000 && n >= limit; limit *= 10)
+		++width;
+	return width;
+}
+/*
+ * Write the escaped form of value n at buf and return the number of bytes
+ * written.  No NUL terminator is added.
+ *
+ * n < 128:  '"', '&', '<' and '>' become named entities; tab, LF, CR and
+ *           everything from 0x20 up are copied as-is; anything else
+ *           becomes '*'.
+ * n >= 128: n first goes through CP_1252_ESCAPE(); a value accepted by
+ *           VALID_VALUE() is written as the decimal reference "&#N;",
+ *           anything else becomes '*'.
+ *
+ * The caller sizes buf with escaped_len().
+ */
+static long escape(char *buf, int n)
+{
+	/* handle ASCII first */
+	if (likely(n < 128)) {
+		if (likely(n >= 0x20 || n == 0x9 || n == 0xA || n == 0xD)) {
+			if (unlikely(n == 34))
+				return_const_len("&quot;");
+			if (unlikely(n == 38))
+				return_const_len("&amp;");
+			if (unlikely(n == 60))
+				return_const_len("&lt;");
+			if (unlikely(n == 62))
+				return_const_len("&gt;");
+			buf[0] = (char)n;
+			return 1;
+		}
+		buf[0] = '*';
+		return 1;
+	}
+	CP_1252_ESCAPE(n);
+	if (VALID_VALUE(n)) {
+		/* return snprintf(buf, sizeof("&#1114111;"), "&#%i;", n); */
+		/* The field is filled right to left, starting bytes_for(n)
+		 * bytes in.  Digits are computed locally: ruby_digitmap[] is
+		 * an interpreter-internal table, not part of the public
+		 * extension API, so linking against it is fragile. */
+		int rv = 3; /* &#; */
+		buf += bytes_for(n);
+		*--buf = ';';
+		do {
+			*--buf = (char)('0' + (n % 10));
+			++rv;
+		} while (n /= 10);
+		*--buf = '#';
+		*--buf = '&';
+		return rv;
+	}
+	buf[0] = '*';
+	return 1;
+}
+#undef return_const_len
+/* Number of bytes escape() will emit for n, computed without writing
+ * anything. */
+static long escaped_len(int n)
+{
+	if (likely(n < 128)) {
+		switch (n) {
+		case 34:
+			return (sizeof("&quot;") - 1);
+		case 38:
+			return (sizeof("&amp;") - 1);
+		case 60:	/* "&lt;" and "&gt;" have the same length */
+		case 62:
+			return (sizeof("&gt;") - 1);
+		default:
+			return 1;
+		}
+	}
+	CP_1252_ESCAPE(n);
+	if (!(VALID_VALUE(n)))
+		return 1;
+	return bytes_for(n);
+}
+/* Calls self.unpack("U*"); used as the protected body of rb_rescue() in
+ * fast_xs(). */
+static VALUE unpack_utf8(VALUE self)
+{
+	VALUE values = rb_funcall(self, unpack_id, 1, U_fmt);
+	return values;
+}
+/* Calls self.unpack("C*"); used as the rescue handler of rb_rescue() in
+ * fast_xs(). */
+static VALUE unpack_uchar(VALUE self)
+{
+	VALUE values = rb_funcall(self, unpack_id, 1, C_fmt);
+	return values;
+}
+/*
+ * String#fast_xs: returns a new String holding the escaped form of the
+ * receiver.
+ *
+ * The receiver is unpacked with "U*"; if that raises, rb_rescue() retries
+ * with "C*".  A first pass adds up escaped_len() for every value and a
+ * second pass lets escape() write straight into the result String.  The
+ * output is not staged in an alloca() buffer: its size follows the input
+ * length, so a large string could exhaust the C stack.
+ */
+VALUE fast_xs(VALUE self)
+{
+	long i;
+	struct RArray *array;
+	char *c;
+	long s_len = 0;
+	VALUE *tmp;
+	VALUE rv;
+	array = RARRAY(rb_rescue(unpack_utf8, self, unpack_uchar, self));
+	tmp = array->ptr;
+	for (i = array->len; --i >= 0; tmp++)
+		s_len += escaped_len(NUM2INT(*tmp));
+	/* rb_str_new() with a NULL source allocates s_len bytes plus the
+	 * terminating NUL, which the second pass then fills in. */
+	rv = rb_str_new(NULL, s_len);
+	c = RSTRING(rv)->ptr;
+	tmp = array->ptr;
+	for (i = array->len; --i >= 0; tmp++)
+		c += escape(c, NUM2INT(*tmp));
+	return rv;
+}
+/*
+ * Extension entry point: interns "unpack", creates the two pack-format
+ * strings and defines String#fast_xs.
+ */
+void Init_fast_xs(void)
+{
+	assert(cp_1252[159 - 128] == 376); /* just in case I skipped a line */
+	unpack_id = rb_intern("unpack");
+	/* Register each static with the GC before an object is stored in
+	 * it, so a collection triggered by a later allocation in this
+	 * function cannot free a string that only the static refers to. */
+	rb_global_variable(&U_fmt);
+	U_fmt = rb_str_new("U*", 2);
+	rb_global_variable(&C_fmt);
+	C_fmt = rb_str_new("C*", 2);
+	rb_define_method(rb_cString, "fast_xs", fast_xs, 0);
+}

data/ext/hpricot_scan/extconf.rb ADDED Viewed

@@ -0,0 +1,6 @@
+require 'mkmf'  # build script: writes the Makefile for the hpricot_scan extension
+dir_config("hpricot_scan")
+have_library("c", "main")  # probes linking against libc; the result is not checked here
+create_makefile("hpricot_scan")

data/ext/hpricot_scan/hpricot_common.rl ADDED Viewed

@@ -0,0 +1,76 @@
+%%{
+  machine hpricot_common;
+  # Token patterns plus four scanners: main, html_comment, html_cdata, html_procins.
+  # HTML tokens
+  # (a blatant rip from HTree)
+  # The named actions and ELE / EBLK / TEXT_PASS are not defined in this file; presumably the including host file supplies them -- confirm.
+  newline = '\n' @{curline += 1;} ;
+  NameChar = [\-A-Za-z0-9._:?] ;
+  Name = [A-Za-z_:] NameChar* ;
+  StartComment = "<!--" ;
+  EndComment = "-->" ;
+  StartCdata = "<![CDATA[" ;
+  EndCdata = "]]>" ;
+  NameCap = Name >_tag %tag;
+  NameAttr = NameChar+ >_akey %akey ;
+  Q1Char = ( "\\\'" | [^'] ) ;
+  Q1Attr = Q1Char* >_aval %aval ;
+  Q2Char = ( "\\\"" | [^"] ) ;
+  Q2Attr = Q2Char* >_aval %aval ;
+  UnqAttr = ( space >_aval | [^ \t\r\n<>"'] >_aval [^ \t\r\n<>]* %aunq ) ;
+  Nmtoken = NameChar+ >_akey %akey ;
+  Attr =  NameAttr space* "=" space* ('"' Q2Attr '"' | "'" Q1Attr "'" | UnqAttr space+ ) space* ;
+  AttrEnd = ( NameAttr space* "=" space* UnqAttr? | Nmtoken >new_attr %save_attr ) ;
+  AttrSet = ( Attr >new_attr %save_attr | Nmtoken >new_attr space+ %save_attr ) ;
+  StartTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? ">" | "<" NameCap ">";
+  EmptyTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? "/>" | "<" NameCap "/>" ;
+  EndTag = "</" NameCap space* ">" ;
+  XmlVersionNum = [a-zA-Z0-9_.:\-]+ >_aval %xmlver ;
+  XmlVersionInfo = space+ "version" space* "=" space* ("'" XmlVersionNum "'" | '"' XmlVersionNum '"' ) ;
+  XmlEncName = [A-Za-z] >_aval [A-Za-z0-9._\-]* %xmlenc ;
+  XmlEncodingDecl = space+ "encoding" space* "=" space* ("'" XmlEncName "'" | '"' XmlEncName '"' ) ;
+  XmlYesNo = ("yes" | "no") >_aval %xmlsd ;
+  XmlSDDecl = space+ "standalone" space* "=" space* ("'" XmlYesNo "'" | '"' XmlYesNo '"') ;
+  XmlDecl = "<?xml" XmlVersionInfo XmlEncodingDecl? XmlSDDecl? space* "?"? ">" ;
+  SystemLiteral = '"' [^"]* >_aval %sysid '"' | "'" [^']* >_aval %sysid "'" ;
+  PubidLiteral = '"' [\t a-zA-Z0-9\-'()+,./:=?;!*\#@$_%]*  >_aval %pubid '"' |
+    "'" [\t a-zA-Z0-9\-'()+,./:=?;!*\#@$_%]* >_aval %pubid "'" ;
+  ExternalID = ( "SYSTEM" | "PUBLIC" space+ PubidLiteral ) (space+ SystemLiteral)? ;
+  DocType = "<!DOCTYPE" space+ NameCap (space+ ExternalID)? space* ("[" [^\]]* "]" space*)? ">" ;
+  StartXmlProcIns = "<?" Name >{ TEXT_PASS(); } space+ ;
+  EndXmlProcIns = "?"? ">" ;
+  html_comment := |*  # entered from main on StartComment; returns to main on EndComment
+    EndComment @{ EBLK(comment, 3); fgoto main; };
+    any | newline { TEXT_PASS(); };
+  *|;
+  html_cdata := |*  # entered from main on StartCdata; returns to main on EndCdata
+    EndCdata @{ EBLK(cdata, 3); fgoto main; };
+    any | newline { TEXT_PASS(); };
+  *|;
+  html_procins := |*  # entered from main on StartXmlProcIns; returns to main on EndXmlProcIns
+    EndXmlProcIns @{ EBLK(procins, 2); fgoto main; };
+    any | newline { TEXT_PASS(); };
+  *|;
+  main := |*  # top-level scanner: one ELE(...) per recognized token, TEXT_PASS() for any other character
+    XmlDecl >newEle { ELE(xmldecl); };
+    DocType >newEle { ELE(doctype); };
+    StartXmlProcIns >newEle { fgoto html_procins; };
+    StartTag >newEle { ELE(stag); };
+    EndTag >newEle { ELE(etag); };
+    EmptyTag >newEle { ELE(emptytag); };
+    StartComment >newEle { fgoto html_comment; };
+    StartCdata >newEle { fgoto html_cdata; };
+    any | newline { TEXT_PASS(); };
+  *|;
+}%%;

data/ext/hpricot_scan/hpricot_scan.h ADDED Viewed

@@ -0,0 +1,79 @@
+/*
+ * hpricot_scan.h
+ *
+ * $Author: why $
+ * $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
+ *
+ * Copyright (C) 2006 why the lucky stiff
+ * You can redistribute it and/or modify it under the same terms as Ruby.
+ */
+#ifndef hpricot_scan_h
+#define hpricot_scan_h
+#include <sys/types.h>
+#if defined(_WIN32)
+#include <stddef.h>
+#endif
+/*
+ * Memory Allocation
+ */
+#if defined(HAVE_ALLOCA_H) && !defined(__GNUC__)
+#include <alloca.h>
+#endif
+#ifndef NULL
+# define NULL (void *)0
+#endif
+#define BUFSIZE 16384
+#define S_ALLOC_N(type,n) (type*)malloc(sizeof(type)*(n))
+#define S_ALLOC(type) (type*)malloc(sizeof(type))
+#define S_REALLOC_N(var,type,n) (var)=(type*)realloc((char*)(var),sizeof(type)*(n))
+#define S_FREE(n) free(n); n = NULL;
+#define S_ALLOCA_N(type,n) (type*)alloca(sizeof(type)*(n))
+#define S_MEMZERO(p,type,n) memset((p), 0, sizeof(type)*(n))
+#define S_MEMCPY(p1,p2,type,n) memcpy((p1), (p2), sizeof(type)*(n))
+#define S_MEMMOVE(p1,p2,type,n) memmove((p1), (p2), sizeof(type)*(n))
+#define S_MEMCMP(p1,p2,type,n) memcmp((p1), (p2), sizeof(type)*(n))
+typedef struct {  /* element passed to a hpricot_element_cb; both members are untyped pointers */
+  void *name;
+  void *attributes;
+} hpricot_element;
+typedef void (*hpricot_element_cb)(void *data, hpricot_element *token);  /* callback type: a data pointer plus the element */
+typedef struct hpricot_scan {  /* scanner bookkeeping fields followed by one callback slot per token kind */
+  int lineno;
+  int cs;  /* presumably the Ragel current-state variable -- TODO confirm */
+  size_t nread;
+  size_t mark;
+  void *data;  /* presumably the pointer handed to callbacks as `data` -- TODO confirm */
+  hpricot_element_cb xmldecl;
+  hpricot_element_cb doctype;
+  hpricot_element_cb xmlprocins;
+  hpricot_element_cb starttag;
+  hpricot_element_cb endtag;
+  hpricot_element_cb emptytag;
+  hpricot_element_cb comment;
+  hpricot_element_cb cdata;
+} http_scan;  /* NOTE(review): the typedef is http_scan while the struct tag and the commented-out prototypes below use hpricot_scan -- confirm which name is intended */
+// int hpricot_scan_init(hpricot_scan *scan);
+// int hpricot_scan_finish(hpricot_scan *scan);
+// size_t hpricot_scan_execute(hpricot_scan *scan, const char *data, size_t len, size_t off);
+// int hpricot_scan_has_error(hpricot_scan *scan);
+// int hpricot_scan_is_finished(hpricot_scan *scan);
+//
+// #define hpricot_scan_nread(scan) (scan)->nread
+#endif