adamh-hpricot 0.6.168

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/CHANGELOG +62 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +259 -0
  5. data/ext/fast_xs/FastXsService.java +1018 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +194 -0
  8. data/ext/hpricot_scan/extconf.rb +6 -0
  9. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  10. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  11. data/ext/hpricot_scan/hpricot_scan.java.rl +373 -0
  12. data/ext/hpricot_scan/hpricot_scan.rl +649 -0
  13. data/extras/mingw-rbconfig.rb +176 -0
  14. data/lib/hpricot/blankslate.rb +63 -0
  15. data/lib/hpricot/builder.rb +209 -0
  16. data/lib/hpricot/elements.rb +510 -0
  17. data/lib/hpricot/htmlinfo.rb +672 -0
  18. data/lib/hpricot/inspect.rb +103 -0
  19. data/lib/hpricot/modules.rb +38 -0
  20. data/lib/hpricot/parse.rb +36 -0
  21. data/lib/hpricot/tag.rb +186 -0
  22. data/lib/hpricot/tags.rb +164 -0
  23. data/lib/hpricot/traverse.rb +838 -0
  24. data/lib/hpricot/xchar.rb +94 -0
  25. data/lib/hpricot.rb +26 -0
  26. data/test/files/basic.xhtml +17 -0
  27. data/test/files/boingboing.html +2266 -0
  28. data/test/files/cy0.html +3653 -0
  29. data/test/files/immob.html +400 -0
  30. data/test/files/pace_application.html +1320 -0
  31. data/test/files/tenderlove.html +16 -0
  32. data/test/files/uswebgen.html +220 -0
  33. data/test/files/utf8.html +1054 -0
  34. data/test/files/week9.html +1723 -0
  35. data/test/files/why.xml +19 -0
  36. data/test/load_files.rb +7 -0
  37. data/test/test_alter.rb +77 -0
  38. data/test/test_builder.rb +37 -0
  39. data/test/test_parser.rb +400 -0
  40. data/test/test_paths.rb +25 -0
  41. data/test/test_preserved.rb +66 -0
  42. data/test/test_xml.rb +28 -0
  43. metadata +107 -0
@@ -0,0 +1,4 @@
1
# Build configuration for the fast_xs extension.
require 'mkmf'

# Abort Makefile generation when the C standard I/O header is unavailable.
exit unless have_header('stdio.h')

dir_config('fast_xs')
create_makefile('fast_xs')
@@ -0,0 +1,194 @@
1
+ #define VERSION "0.1"
2
+
3
+ #include <ruby.h>
4
+ #include <assert.h>
5
+ /* #include <stdio.h> */
6
+
7
+ static ID unpack_id;
8
+ static VALUE U_fmt, C_fmt;
9
+
10
/* Branch-prediction hints.  The escaping code is laid out so that plain
 * ASCII is the expected (fast) case.  Compilers other than GCC 3+ get
 * transparent pass-through definitions. */
#if !defined(__GNUC__) || (__GNUC__ < 3)
#  define likely(x)   (x)
#  define unlikely(x) (x)
#else
#  define likely(x)   __builtin_expect (!!(x), 1)
#  define unlikely(x) __builtin_expect (!!(x), 0)
#endif
19
+
20
/*
 * Mapping from CP-1252 byte values 128..159 (index = byte - 128) to the
 * Unicode code point that should be emitted for them.
 *
 * p(x) marks "pass-through" entries: bytes that CP-1252 leaves undefined
 * and that therefore keep their own value as the code point.  It must
 * expand to the value itself -- the table stores code points, not
 * indexes -- so that e.g. byte 129 is escaped as "&#129;".
 */
#define p(x) (x)

static const int cp_1252[] = {
	8364,		/* 128 => 8364, euro sign */
	p(129),		/* 129 => 129,  pass-through */
	8218,		/* 130 => 8218, single low-9 quotation mark */
	402,		/* 131 => 402,  latin small letter f with hook */
	8222,		/* 132 => 8222, double low-9 quotation mark */
	8230,		/* 133 => 8230, horizontal ellipsis */
	8224,		/* 134 => 8224, dagger */
	8225,		/* 135 => 8225, double dagger */
	710,		/* 136 => 710,  modifier letter circumflex accent */
	8240,		/* 137 => 8240, per mille sign */
	352,		/* 138 => 352,  latin capital letter s with caron */
	8249,		/* 139 => 8249, single left-pointing angle quotation mark */
	338,		/* 140 => 338,  latin capital ligature oe */
	p(141),		/* 141 => 141,  pass-through */
	381,		/* 142 => 381,  latin capital letter z with caron */
	p(143),		/* 143 => 143,  pass-through */
	p(144),		/* 144 => 144,  pass-through */
	8216,		/* 145 => 8216, left single quotation mark */
	8217,		/* 146 => 8217, right single quotation mark */
	8220,		/* 147 => 8220, left double quotation mark */
	8221,		/* 148 => 8221, right double quotation mark */
	8226,		/* 149 => 8226, bullet */
	8211,		/* 150 => 8211, en dash */
	8212,		/* 151 => 8212, em dash */
	732,		/* 152 => 732,  small tilde */
	8482,		/* 153 => 8482, trade mark sign */
	353,		/* 154 => 353,  latin small letter s with caron */
	8250,		/* 155 => 8250, single right-pointing angle quotation mark */
	339,		/* 156 => 339,  latin small ligature oe */
	p(157),		/* 157 => 157,  pass-through */
	382,		/* 158 => 382,  latin small letter z with caron */
	376		/* 159 => 376,  latin capital letter y with diaeresis */
};
57
+
58
/*
 * True when n lies in one of the three code point ranges that are emitted
 * as numeric character references: 0x20..0xD7FF, 0xE000..0xFFFD and
 * 0x10000..0x10FFFF.  The argument and the whole expansion are
 * parenthesized so the macro composes safely with surrounding operators
 * (e.g. !VALID_VALUE(n)).  Note that n is evaluated more than once.
 */
#define VALID_VALUE(n) \
	((((n) >= 0x20)    && ((n) <= 0xD7FF)) || \
	 (((n) >= 0xE000)  && ((n) <= 0xFFFD)) || \
	 (((n) >= 0x10000) && ((n) <= 0x10FFFF)))
62
+
63
/*
 * Rewrites n in place: values 128..159 are replaced by the matching
 * cp_1252[] entry, everything else is left untouched.  n must be a
 * modifiable lvalue; it is parenthesized so expressions such as *ptr
 * can be passed safely.
 */
#define CP_1252_ESCAPE(n) do { \
	if ((n) >= 128 && (n) <= 159) \
		(n) = cp_1252[(n) - 128]; \
} while (0)
67
+
68
/*
 * Helper for escape(): copies the string literal x (without its
 * terminating NUL) into the enclosing function's `buf` and returns the
 * number of bytes copied from that enclosing function.
 */
#define return_const_len(x) do { \
	const size_t const_len_ = sizeof(x) - 1; \
	memcpy(buf, x, const_len_); \
	return const_len_; \
} while (0)
72
+
73
/*
 * Size in bytes of the numeric character reference buffer reserved for
 * code point n: 6 ("&#999;") for anything below 1000, one more byte per
 * additional decimal digit, capped at 10 ("&#9999999;") since callers
 * never pass values above 0x10FFFF.
 */
static inline size_t bytes_for(int n)
{
	size_t len = sizeof("&#999;") - 1;
	int limit = 1000;

	while (len < sizeof("&#9999999;") - 1 && n >= limit) {
		limit *= 10;
		++len;
	}
	return len;
}
86
+
87
/*
 * Writes the escaped form of code point n at buf and returns the number
 * of bytes written.  buf must have room for escaped_len(n) bytes; no NUL
 * terminator is written.
 *
 *  - ASCII printable characters plus TAB/LF/CR are copied through, except
 *    that '"', '&', '<' and '>' become named entities;
 *  - other ASCII control characters (and negative values) become '*';
 *  - values 128..159 are first remapped through the CP-1252 table;
 *  - remaining values accepted by VALID_VALUE become a decimal numeric
 *    character reference ("&#NNN;"); everything else becomes '*'.
 */
static long escape(char *buf, int n)
{
	/* handle ASCII first */
	if (likely(n < 128)) {
		if (likely(n >= 0x20 || n == 0x9 || n == 0xA || n == 0xD)) {
			if (unlikely(n == 34))
				return_const_len("&quot;");
			if (unlikely(n == 38))
				return_const_len("&amp;");
			if (unlikely(n == 60))
				return_const_len("&lt;");
			if (unlikely(n == 62))
				return_const_len("&gt;");
			buf[0] = (char)n;
			return 1;
		}

		buf[0] = '*';
		return 1;
	}

	CP_1252_ESCAPE(n);

	if (VALID_VALUE(n)) {
		long rv = 3; /* &#; */

		/* Fill the reference from its last byte backwards.  Digits are
		 * produced arithmetically instead of through the interpreter's
		 * internal ruby_digitmap table, which is not part of any public
		 * Ruby header. */
		buf += bytes_for(n);
		*--buf = ';';
		do {
			*--buf = (char)('0' + (n % 10));
			++rv;
		} while (n /= 10);
		*--buf = '#';
		*--buf = '&';
		return rv;
	}
	buf[0] = '*';
	return 1;
}
127
+
128
+ #undef return_const_len
129
+
130
/*
 * Number of bytes escape() will produce for code point n, so the caller
 * can size the output buffer before writing anything.
 */
static long escaped_len(int n)
{
	if (unlikely(n >= 128)) {
		/* non-ASCII: remap CP-1252, then either "&#NNN;" or a lone '*' */
		CP_1252_ESCAPE(n);
		if (VALID_VALUE(n))
			return bytes_for(n);
		return 1;
	}

	switch (n) {
	case 34:
		return (sizeof("&quot;") - 1);
	case 38:
		return (sizeof("&amp;") - 1);
	case 60:
	case 62:
		/* "&lt;" and "&gt;" have the same length */
		return (sizeof("&gt;") - 1);
	default:
		return 1;
	}
}
148
+
149
+ static VALUE unpack_utf8(VALUE self)
150
+ {
151
+ return rb_funcall(self, unpack_id, 1, U_fmt);
152
+ }
153
+
154
+ static VALUE unpack_uchar(VALUE self)
155
+ {
156
+ return rb_funcall(self, unpack_id, 1, C_fmt);
157
+ }
158
+
159
+ VALUE fast_xs(VALUE self)
160
+ {
161
+ long i;
162
+ struct RArray *array;
163
+ char *s, *c;
164
+ long s_len = 0;
165
+ VALUE *tmp;
166
+
167
+ array = RARRAY(rb_rescue(unpack_utf8, self, unpack_uchar, self));
168
+
169
+ tmp = array->ptr;
170
+ for (i = array->len; --i >= 0; tmp++)
171
+ s_len += escaped_len(NUM2INT(*tmp));
172
+
173
+ c = s = alloca(s_len + 1);
174
+
175
+ tmp = array->ptr;
176
+ for (i = array->len; --i >= 0; tmp++)
177
+ c += escape(c, NUM2INT(*tmp));
178
+
179
+ *c = '\0';
180
+ return rb_str_new(s, s_len);
181
+ }
182
+
183
+ void Init_fast_xs(void)
184
+ {
185
+ assert(cp_1252[159 - 128] == 376); /* just in case I skipped a line */
186
+
187
+ unpack_id = rb_intern("unpack");
188
+ U_fmt = rb_str_new("U*", 2);
189
+ C_fmt = rb_str_new("C*", 2);
190
+ rb_global_variable(&U_fmt);
191
+ rb_global_variable(&C_fmt);
192
+
193
+ rb_define_method(rb_cString, "fast_xs", fast_xs, 0);
194
+ }
@@ -0,0 +1,6 @@
1
# Build configuration for the hpricot_scan extension.
require 'mkmf'

dir_config "hpricot_scan"

# Probe for libc (symbol "main") so it is added to the link line when found.
have_library "c", "main"

create_makefile "hpricot_scan"
@@ -0,0 +1,76 @@
1
+ %%{
2
+ # Token grammar for HTML/XML markup. The actions attached with > % @ (e.g. _tag, tag, _akey, akey, _aval, aval, aunq, new_attr, save_attr, newEle) and the ELE / EBLK / TEXT_PASS macros are not defined in this file; presumably the including scanner supplies them -- confirm.
3
+ machine hpricot_common;
4
+
5
+ #
6
+ # HTML tokens
7
+ # (a blatant rip from HTree)
8
+ #
9
+ newline = '\n' @{curline += 1;} ;
10
+ NameChar = [\-A-Za-z0-9._:?] ;
11
+ Name = [A-Za-z_:] NameChar* ;
12
+ StartComment = "<!--" ;
13
+ EndComment = "-->" ;
14
+ StartCdata = "<![CDATA[" ;
15
+ EndCdata = "]]>" ;
16
+ # Capturing rules: `>x` runs action x on entering the pattern, `%x` on leaving it.
17
+ NameCap = Name >_tag %tag;
18
+ NameAttr = NameChar+ >_akey %akey ;
19
+ Q1Char = ( "\\\'" | [^'] ) ;
20
+ Q1Attr = Q1Char* >_aval %aval ;
21
+ Q2Char = ( "\\\"" | [^"] ) ;
22
+ Q2Attr = Q2Char* >_aval %aval ;
23
+ UnqAttr = ( space >_aval | [^ \t\r\n<>"'] >_aval [^ \t\r\n<>]* %aunq ) ;
24
+ Nmtoken = NameChar+ >_akey %akey ;
25
+ # Attr requires a quoted value or an unquoted value followed by whitespace; AttrEnd covers the last attribute before the tag closes, where the value may be unquoted, missing after "=", or absent altogether (bare Nmtoken).
26
+ Attr = NameAttr space* "=" space* ('"' Q2Attr '"' | "'" Q1Attr "'" | UnqAttr space+ ) space* ;
27
+ AttrEnd = ( NameAttr space* "=" space* UnqAttr? | Nmtoken >new_attr %save_attr ) ;
28
+ AttrSet = ( Attr >new_attr %save_attr | Nmtoken >new_attr space+ %save_attr ) ;
29
+ StartTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? ">" | "<" NameCap ">";
30
+ EmptyTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? "/>" | "<" NameCap "/>" ;
31
+
32
+ EndTag = "</" NameCap space* ">" ;
33
+ XmlVersionNum = [a-zA-Z0-9_.:\-]+ >_aval %xmlver ;
34
+ XmlVersionInfo = space+ "version" space* "=" space* ("'" XmlVersionNum "'" | '"' XmlVersionNum '"' ) ;
35
+ XmlEncName = [A-Za-z] >_aval [A-Za-z0-9._\-]* %xmlenc ;
36
+ XmlEncodingDecl = space+ "encoding" space* "=" space* ("'" XmlEncName "'" | '"' XmlEncName '"' ) ;
37
+ XmlYesNo = ("yes" | "no") >_aval %xmlsd ;
38
+ XmlSDDecl = space+ "standalone" space* "=" space* ("'" XmlYesNo "'" | '"' XmlYesNo '"') ;
39
+ XmlDecl = "<?xml" XmlVersionInfo XmlEncodingDecl? XmlSDDecl? space* "?"? ">" ;
40
+
41
+ SystemLiteral = '"' [^"]* >_aval %sysid '"' | "'" [^']* >_aval %sysid "'" ;
42
+ PubidLiteral = '"' [\t a-zA-Z0-9\-'()+,./:=?;!*\#@$_%]* >_aval %pubid '"' |
43
+ "'" [\t a-zA-Z0-9\-'()+,./:=?;!*\#@$_%]* >_aval %pubid "'" ;
44
+ ExternalID = ( "SYSTEM" | "PUBLIC" space+ PubidLiteral ) (space+ SystemLiteral)? ;
45
+ DocType = "<!DOCTYPE" space+ NameCap (space+ ExternalID)? space* ("[" [^\]]* "]" space*)? ">" ;
46
+ StartXmlProcIns = "<?" Name >{ TEXT_PASS(); } space+ ;
47
+ EndXmlProcIns = "?"? ">" ;
48
+ # Sub-scanners for comment / CDATA / processing-instruction bodies: every character goes to TEXT_PASS() until the terminator matches, then EBLK is invoked and control jumps back to main. The numeric EBLK argument is presumably the terminator length -- confirm against the EBLK definition.
49
+ html_comment := |*
50
+ EndComment @{ EBLK(comment, 3); fgoto main; };
51
+ any | newline { TEXT_PASS(); };
52
+ *|;
53
+
54
+ html_cdata := |*
55
+ EndCdata @{ EBLK(cdata, 3); fgoto main; };
56
+ any | newline { TEXT_PASS(); };
57
+ *|;
58
+
59
+ html_procins := |*
60
+ EndXmlProcIns @{ EBLK(procins, 2); fgoto main; };
61
+ any | newline { TEXT_PASS(); };
62
+ *|;
63
+ # Top-level scanner: each structured token runs newEle on entry and is handed to ELE on match; comment, CDATA and processing-instruction openers switch to the sub-scanners above; any other character goes to TEXT_PASS().
64
+ main := |*
65
+ XmlDecl >newEle { ELE(xmldecl); };
66
+ DocType >newEle { ELE(doctype); };
67
+ StartXmlProcIns >newEle { fgoto html_procins; };
68
+ StartTag >newEle { ELE(stag); };
69
+ EndTag >newEle { ELE(etag); };
70
+ EmptyTag >newEle { ELE(emptytag); };
71
+ StartComment >newEle { fgoto html_comment; };
72
+ StartCdata >newEle { fgoto html_cdata; };
73
+ any | newline { TEXT_PASS(); };
74
+ *|;
75
+
76
+ }%%;
@@ -0,0 +1,79 @@
1
+ /*
2
+ * hpricot_scan.h
3
+ *
4
+ * $Author: why $
5
+ * $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
6
+ *
7
+ * Copyright (C) 2006 why the lucky stiff
8
+ * You can redistribute it and/or modify it under the same terms as Ruby.
9
+ */
10
+
11
+ #ifndef hpricot_scan_h
12
+ #define hpricot_scan_h
13
+
14
+ #include <sys/types.h>
15
+
16
+ #if defined(_WIN32)
17
+ #include <stddef.h>
18
+ #endif
19
+
20
+ /*
21
+ * Memory Allocation
22
+ */
23
+ #if defined(HAVE_ALLOCA_H) && !defined(__GNUC__)
24
+ #include <alloca.h>
25
+ #endif
26
+
27
#ifndef NULL
# define NULL ((void *)0)
#endif

/* NOTE(review): no use of BUFSIZE is visible in this header; presumably a
 * scanner buffer size in bytes -- confirm against the scanner source. */
#define BUFSIZE 16384

/*
 * Thin wrappers around malloc/realloc/free/alloca.  Every expansion is
 * fully parenthesized so the result can be indexed, dereferenced or
 * combined with other operators without precedence surprises.  None of
 * the wrappers checks for allocation failure; callers must do so.
 */
#define S_ALLOC_N(type,n) ((type*)malloc(sizeof(type)*(n)))
#define S_ALLOC(type) ((type*)malloc(sizeof(type)))
/* Reassigns var directly: on realloc failure var becomes NULL and the
 * previous allocation is no longer reachable through it. */
#define S_REALLOC_N(var,type,n) ((var)=(type*)realloc((char*)(var),sizeof(type)*(n)))
/* Frees n and resets it to NULL.  Wrapped in do/while(0) so it behaves as
 * a single statement (safe as the body of an unbraced if/else). */
#define S_FREE(n) do { free(n); (n) = NULL; } while (0)

#define S_ALLOCA_N(type,n) ((type*)alloca(sizeof(type)*(n)))

/* Typed mem* helpers: n counts elements of `type`, not bytes. */
#define S_MEMZERO(p,type,n) memset((p), 0, sizeof(type)*(n))
#define S_MEMCPY(p1,p2,type,n) memcpy((p1), (p2), sizeof(type)*(n))
#define S_MEMMOVE(p1,p2,type,n) memmove((p1), (p2), sizeof(type)*(n))
#define S_MEMCMP(p1,p2,type,n) memcmp((p1), (p2), sizeof(type)*(n))
44
+
45
+ typedef struct { /* One element passed to a callback; both members are untyped pointers whose concrete types are not visible in this header. */
46
+ void *name; /* presumably the element/tag name -- confirm */
47
+ void *attributes; /* presumably the element's attribute collection -- confirm */
48
+ } hpricot_element;
49
+
50
+ typedef void (*hpricot_element_cb)(void *data, hpricot_element *token); /* Callback type: receives a caller-supplied data pointer and the element. */
51
+
52
+ typedef struct hpricot_scan { /* Scanner state followed by one callback slot per token kind. */
53
+ int lineno; /* presumably the current input line number -- confirm */
54
+ int cs; /* presumably the Ragel current-state variable -- confirm */
55
+ size_t nread; /* presumably the number of bytes consumed so far -- confirm */
56
+ size_t mark; /* presumably an offset marking the start of the current token -- confirm */
57
+
58
+ void *data; /* untyped pointer; matches the `data` parameter of hpricot_element_cb */
59
+
60
+ hpricot_element_cb xmldecl;
61
+ hpricot_element_cb doctype;
62
+ hpricot_element_cb xmlprocins;
63
+ hpricot_element_cb starttag;
64
+ hpricot_element_cb endtag;
65
+ hpricot_element_cb emptytag;
66
+ hpricot_element_cb comment;
67
+ hpricot_element_cb cdata;
68
+
69
+ } http_scan; /* NOTE(review): the typedef name is http_scan while the struct tag is hpricot_scan; the commented-out prototypes below use a bare `hpricot_scan *`, which is not a declared type name in C -- confirm the intended name before enabling them. */
70
+
71
+ // int hpricot_scan_init(hpricot_scan *scan);
72
+ // int hpricot_scan_finish(hpricot_scan *scan);
73
+ // size_t hpricot_scan_execute(hpricot_scan *scan, const char *data, size_t len, size_t off);
74
+ // int hpricot_scan_has_error(hpricot_scan *scan);
75
+ // int hpricot_scan_is_finished(hpricot_scan *scan);
76
+ //
77
+ // #define hpricot_scan_nread(scan) (scan)->nread
78
+
79
+ #endif