stepheneb-hpricot 0.8.265

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. data/CHANGELOG +75 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +264 -0
  5. data/ext/fast_xs/FastXsService.java +1018 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +200 -0
  8. data/ext/hpricot_scan/HpricotScanService.java +2090 -0
  9. data/ext/hpricot_scan/extconf.rb +6 -0
  10. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  11. data/ext/hpricot_scan/hpricot_css.c +3506 -0
  12. data/ext/hpricot_scan/hpricot_scan.c +6931 -0
  13. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  14. data/ext/hpricot_scan/hpricot_scan.java.rl +1152 -0
  15. data/ext/hpricot_scan/hpricot_scan.rl +788 -0
  16. data/extras/mingw-rbconfig.rb +176 -0
  17. data/lib/hpricot/blankslate.rb +63 -0
  18. data/lib/hpricot/builder.rb +216 -0
  19. data/lib/hpricot/elements.rb +510 -0
  20. data/lib/hpricot/htmlinfo.rb +691 -0
  21. data/lib/hpricot/inspect.rb +103 -0
  22. data/lib/hpricot/modules.rb +40 -0
  23. data/lib/hpricot/parse.rb +38 -0
  24. data/lib/hpricot/tag.rb +219 -0
  25. data/lib/hpricot/tags.rb +164 -0
  26. data/lib/hpricot/traverse.rb +839 -0
  27. data/lib/hpricot/xchar.rb +94 -0
  28. data/lib/hpricot.rb +26 -0
  29. data/test/files/basic.xhtml +17 -0
  30. data/test/files/boingboing.html +2266 -0
  31. data/test/files/cy0.html +3653 -0
  32. data/test/files/immob.html +400 -0
  33. data/test/files/pace_application.html +1320 -0
  34. data/test/files/tenderlove.html +16 -0
  35. data/test/files/uswebgen.html +220 -0
  36. data/test/files/utf8.html +1054 -0
  37. data/test/files/week9.html +1723 -0
  38. data/test/files/why.xml +19 -0
  39. data/test/load_files.rb +7 -0
  40. data/test/test_alter.rb +95 -0
  41. data/test/test_builder.rb +37 -0
  42. data/test/test_parser.rb +428 -0
  43. data/test/test_paths.rb +25 -0
  44. data/test/test_preserved.rb +79 -0
  45. data/test/test_xml.rb +28 -0
  46. metadata +108 -0
@@ -0,0 +1,4 @@
1
# Generates the Makefile for the fast_xs extension.
require 'mkmf'

# Stop right away when <stdio.h> cannot be found.
exit unless have_header('stdio.h')

dir_config('fast_xs')
create_makefile('fast_xs')
@@ -0,0 +1,200 @@
1
+ #include <ruby.h>
2
+ #include <assert.h>
3
+
4
/*
 * Fallback definitions for Ruby headers that do not provide the
 * RARRAY_* / RSTRING_* accessor macros.
 *
 * Each macro has its own guard: a header that supplies only some of
 * them then neither gets a conflicting redefinition nor is left with a
 * missing accessor.  The expansions are parenthesized so they compose
 * safely inside larger expressions.
 */
#ifndef RARRAY_LEN
#define RARRAY_LEN(arr) (RARRAY(arr)->len)
#endif
#ifndef RARRAY_PTR
#define RARRAY_PTR(arr) (RARRAY(arr)->ptr)
#endif
#ifndef RSTRING_LEN
#define RSTRING_LEN(str) (RSTRING(str)->len)
#endif
#ifndef RSTRING_PTR
#define RSTRING_PTR(str) (RSTRING(str)->ptr)
#endif
10
+
11
+ static ID unpack_id;
12
+ static VALUE U_fmt, C_fmt;
13
+
14
/*
 * Branch-prediction hints.  GCC 3 and later get __builtin_expect();
 * any other compiler just sees the bare condition.  Branches in this
 * file are laid out so that ASCII characters are handled faster.
 */
#if !defined(__GNUC__) || (__GNUC__ < 3)
#  define likely(x)   (x)
#  define unlikely(x) (x)
#else
#  define likely(x)   __builtin_expect (!!(x), 1)
#  define unlikely(x) __builtin_expect (!!(x), 0)
#endif
23
+
24
/*
 * Replacement code points for the CP-1252 range 128..159, indexed by
 * (code - 128).
 *
 * Entries marked "pass-through" map a code to itself.  They used to be
 * written with a helper macro p(x) defined as (x-128), which stored
 * 1, 13, 15, 16 and 29 instead of 129, 141, 143, 144 and 157 and so
 * contradicted the per-entry comments.  The values are now spelled out
 * literally and the one-letter macro is gone.
 */
static const int cp_1252[] = {
    8364,   /* 128 => 8364, euro sign */
    129,    /* 129 => 129,  pass-through */
    8218,   /* 130 => 8218, single low-9 quotation mark */
    402,    /* 131 => 402,  latin small letter f with hook */
    8222,   /* 132 => 8222, double low-9 quotation mark */
    8230,   /* 133 => 8230, horizontal ellipsis */
    8224,   /* 134 => 8224, dagger */
    8225,   /* 135 => 8225, double dagger */
    710,    /* 136 => 710,  modifier letter circumflex accent */
    8240,   /* 137 => 8240, per mille sign */
    352,    /* 138 => 352,  latin capital letter s with caron */
    8249,   /* 139 => 8249, single left-pointing angle quotation mark */
    338,    /* 140 => 338,  latin capital ligature oe */
    141,    /* 141 => 141,  pass-through */
    381,    /* 142 => 381,  latin capital letter z with caron */
    143,    /* 143 => 143,  pass-through */
    144,    /* 144 => 144,  pass-through */
    8216,   /* 145 => 8216, left single quotation mark */
    8217,   /* 146 => 8217, right single quotation mark */
    8220,   /* 147 => 8220, left double quotation mark */
    8221,   /* 148 => 8221, right double quotation mark */
    8226,   /* 149 => 8226, bullet */
    8211,   /* 150 => 8211, en dash */
    8212,   /* 151 => 8212, em dash */
    732,    /* 152 => 732,  small tilde */
    8482,   /* 153 => 8482, trade mark sign */
    353,    /* 154 => 353,  latin small letter s with caron */
    8250,   /* 155 => 8250, single right-pointing angle quotation mark */
    339,    /* 156 => 339,  latin small ligature oe */
    157,    /* 157 => 157,  pass-through */
    382,    /* 158 => 382,  latin small letter z with caron */
    376     /* 159 => 376,  latin capital letter y with diaeresis */
};
61
+
62
/*
 * Non-zero when n falls in one of the accepted code point ranges:
 * 0x20..0xD7FF, 0xE000..0xFFFD or 0x10000..0x10FFFF.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * can be negated or combined with other operators without changing
 * its meaning.  n is evaluated several times.
 */
#define VALID_VALUE(n) \
    (((n) >= 0x20 && (n) <= 0xD7FF) || \
     ((n) >= 0xE000 && (n) <= 0xFFFD) || \
     ((n) >= 0x10000 && (n) <= 0x10FFFF))

/*
 * Replace n in place with its cp_1252[] entry when n is in 128..159;
 * leave it untouched otherwise.  n must be a modifiable lvalue and is
 * evaluated several times.
 */
#define CP_1252_ESCAPE(n) do { \
    if ((n) >= 128 && (n) <= 159) \
        (n) = cp_1252[(n) - 128]; \
} while (0)
71
+
72
/*
 * Size in bytes reserved for the numeric reference "&#N;" of n.
 *
 * The result starts at the length of "&#999;" and grows by one for
 * each of the thresholds 1000, 10000, 100000 and 1000000 that n
 * reaches, so it never exceeds the length of "&#9999999;" (values
 * above 0x10FFFF are not expected here).
 */
static inline size_t bytes_for(int n)
{
    size_t len = sizeof("&#999;") - 1;
    int threshold;

    for (threshold = 1000; threshold <= 1000000 && n >= threshold; threshold *= 10)
        ++len;

    return len;
}
85
+
86
/*
 * Write the escaped form of code point n at buf and return the number
 * of bytes written.  No terminating NUL is stored; the caller provides
 * a buffer that is large enough.
 *
 * n < 128:
 *   '"', '&', '<' and '>' become "&quot;", "&amp;", "&lt;" and "&gt;".
 *   Tab, newline, carriage return and everything from 0x20 upwards are
 *   copied as a single byte.  Any other value becomes '*'.
 * n >= 128:
 *   n first goes through CP_1252_ESCAPE.  A result accepted by
 *   VALID_VALUE is written as the decimal reference "&#N;"; anything
 *   else becomes '*'.
 */
static size_t escape(char *buf, int n)
{
    /* handle ASCII first */
    if (likely(n < 128)) {
        const char *entity = NULL;
        size_t entity_len = 0;
        int keep = (n >= 0x20 || n == '\t' || n == '\n' || n == '\r');

        if (unlikely(!keep)) {
            buf[0] = '*';
            return 1;
        }

        switch (n) {
        case '"':
            entity = "&quot;";
            entity_len = sizeof("&quot;") - 1;
            break;
        case '&':
            entity = "&amp;";
            entity_len = sizeof("&amp;") - 1;
            break;
        case '<':
            entity = "&lt;";
            entity_len = sizeof("&lt;") - 1;
            break;
        case '>':
            entity = "&gt;";
            entity_len = sizeof("&gt;") - 1;
            break;
        default:
            break;
        }

        if (entity != NULL) {
            memcpy(buf, entity, entity_len);
            return entity_len;
        }

        buf[0] = (char)n;
        return 1;
    }

    CP_1252_ESCAPE(n);

    if (VALID_VALUE(n)) {
        /* 0x10FFFF is 1114111: seven decimal digits at most */
        char reversed[sizeof("1114111") - 1];
        size_t ndigits = 0;
        char *out = buf;

        /* collect digits least-significant first ... */
        do {
            reversed[ndigits++] = (char)('0' + n % 10);
            n /= 10;
        } while (n != 0);

        /* ... then emit "&#", the digits in reading order, and ";" */
        *out++ = '&';
        *out++ = '#';
        while (ndigits > 0)
            *out++ = reversed[--ndigits];
        *out++ = ';';

        return (size_t)(out - buf);
    }

    buf[0] = '*';
    return 1;
}
134
+
135
+ static VALUE unpack_utf8(VALUE self)
136
+ {
137
+ return rb_funcall(self, unpack_id, 1, U_fmt);
138
+ }
139
+
140
+ static VALUE unpack_uchar(VALUE self)
141
+ {
142
+ return rb_funcall(self, unpack_id, 1, C_fmt);
143
+ }
144
+
145
+ /*
146
+ * escapes strings for XML
147
+ * The double-quote (") character is translated to "&quot;"
148
+ */
149
+ static VALUE fast_xs(VALUE self)
150
+ {
151
+ long i;
152
+ VALUE array;
153
+ char *c;
154
+ size_t s_len;
155
+ VALUE *tmp;
156
+ VALUE rv;
157
+
158
+ array = rb_rescue(unpack_utf8, self, unpack_uchar, self);
159
+
160
+ for (tmp = RARRAY_PTR(array), s_len = i = RARRAY_LEN(array);
161
+ --i >= 0;
162
+ tmp++) {
163
+ int n = NUM2INT(*tmp);
164
+ if (likely(n < 128)) {
165
+ if (unlikely(n == '"'))
166
+ s_len += (sizeof("&quot;") - 2);
167
+ if (unlikely(n == '&'))
168
+ s_len += (sizeof("&amp;") - 2);
169
+ if (unlikely(n == '>' || n == '<'))
170
+ s_len += (sizeof("&gt;") - 2);
171
+ continue;
172
+ }
173
+
174
+ CP_1252_ESCAPE(n);
175
+
176
+ if (VALID_VALUE(n))
177
+ s_len += bytes_for(n) - 1;
178
+ }
179
+
180
+ rv = rb_str_new(NULL, s_len);
181
+ c = RSTRING_PTR(rv);
182
+
183
+ for (tmp = RARRAY_PTR(array), i = RARRAY_LEN(array); --i >= 0; tmp++)
184
+ c += escape(c, NUM2INT(*tmp));
185
+
186
+ return rv;
187
+ }
188
+
189
+ void Init_fast_xs(void)
190
+ {
191
+ assert(cp_1252[159 - 128] == 376); /* just in case I skipped a line */
192
+
193
+ unpack_id = rb_intern("unpack");
194
+ U_fmt = rb_str_new("U*", 2);
195
+ C_fmt = rb_str_new("C*", 2);
196
+ rb_global_variable(&U_fmt);
197
+ rb_global_variable(&C_fmt);
198
+
199
+ rb_define_method(rb_cString, "fast_xs", fast_xs, 0);
200
+ }