hpricot 0.7-x86-mswin32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +68 -0
- data/COPYING +18 -0
- data/README +284 -0
- data/Rakefile +260 -0
- data/ext/fast_xs/FastXsService.java +1018 -0
- data/ext/fast_xs/extconf.rb +4 -0
- data/ext/fast_xs/fast_xs.c +200 -0
- data/ext/hpricot_scan/HpricotScanService.java +1305 -0
- data/ext/hpricot_scan/extconf.rb +6 -0
- data/ext/hpricot_scan/hpricot_common.rl +76 -0
- data/ext/hpricot_scan/hpricot_css.c +3502 -0
- data/ext/hpricot_scan/hpricot_css.rl +115 -0
- data/ext/hpricot_scan/hpricot_scan.c +6704 -0
- data/ext/hpricot_scan/hpricot_scan.h +79 -0
- data/ext/hpricot_scan/hpricot_scan.java.rl +373 -0
- data/ext/hpricot_scan/hpricot_scan.rl +722 -0
- data/ext/hpricot_scan/test.rb +4 -0
- data/extras/mingw-rbconfig.rb +176 -0
- data/lib/fast_xs.so +0 -0
- data/lib/hpricot.rb +26 -0
- data/lib/hpricot/blankslate.rb +63 -0
- data/lib/hpricot/builder.rb +216 -0
- data/lib/hpricot/elements.rb +510 -0
- data/lib/hpricot/htmlinfo.rb +691 -0
- data/lib/hpricot/inspect.rb +103 -0
- data/lib/hpricot/modules.rb +38 -0
- data/lib/hpricot/parse.rb +38 -0
- data/lib/hpricot/tag.rb +198 -0
- data/lib/hpricot/tags.rb +164 -0
- data/lib/hpricot/traverse.rb +838 -0
- data/lib/hpricot/xchar.rb +94 -0
- data/lib/hpricot_scan.so +0 -0
- data/test/files/basic.xhtml +17 -0
- data/test/files/boingboing.html +2266 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/immob.html +400 -0
- data/test/files/pace_application.html +1320 -0
- data/test/files/tenderlove.html +16 -0
- data/test/files/uswebgen.html +220 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/files/why.xml +19 -0
- data/test/load_files.rb +7 -0
- data/test/nokogiri-bench.rb +64 -0
- data/test/test_alter.rb +77 -0
- data/test/test_builder.rb +37 -0
- data/test/test_parser.rb +409 -0
- data/test/test_paths.rb +25 -0
- data/test/test_preserved.rb +70 -0
- data/test/test_xml.rb +28 -0
- metadata +111 -0
@@ -0,0 +1,722 @@
|
|
1
|
+
/*
|
2
|
+
* hpricot_scan.rl
|
3
|
+
*
|
4
|
+
* $Author: why $
|
5
|
+
* $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
|
6
|
+
*
|
7
|
+
* Copyright (C) 2006 why the lucky stiff
|
8
|
+
*/
|
9
|
+
#include <ruby.h>
|
10
|
+
|
11
|
+
/*
 * Fallback accessors for Ruby headers that do not provide RARRAY_LEN,
 * RSTRING_LEN or RSTRING_PTR.  Each macro carries its own guard so that a
 * header supplying only some of them neither causes a redefinition nor
 * leaves one of the others missing.  Expansions are parenthesized so they
 * compose safely inside larger expressions.
 */
#ifndef RARRAY_LEN
#define RARRAY_LEN(arr) (RARRAY(arr)->len)
#endif
#ifndef RSTRING_LEN
#define RSTRING_LEN(str) (RSTRING(str)->len)
#endif
#ifndef RSTRING_PTR
#define RSTRING_PTR(str) (RSTRING(str)->ptr)
#endif
|
16
|
+
|
17
|
+
VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
|
18
|
+
|
19
|
+
#define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
|
20
|
+
|
21
|
+
static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
|
22
|
+
sym_cdata, sym_text, sym_EMPTY, sym_CDATA;
|
23
|
+
static VALUE mHpricot, rb_eHpricotParseError;
|
24
|
+
static VALUE cBaseEle, cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cETag, cText,
|
25
|
+
cXMLDecl, cProcIns, symAllow, symDeny;
|
26
|
+
static ID s_ElementContent;
|
27
|
+
static ID s_downcase, s_new, s_parent, s_read, s_to_str;
|
28
|
+
static ID iv_parent;
|
29
|
+
static VALUE reProcInsParse;
|
30
|
+
|
31
|
+
typedef struct {
|
32
|
+
int name;
|
33
|
+
VALUE tag, attr, etag, raw, EC;
|
34
|
+
VALUE parent, children;
|
35
|
+
} hpricot_ele;
|
36
|
+
|
37
|
+
/*
 * OPT(opts, key): true when `opts` is not nil and opts[:key] is truthy.
 * `key` is a bare identifier; it is stringified and interned as a symbol.
 */
#define OPT(opts, key) \
  (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("" # key)))))
|
38
|
+
|
39
|
+
/*
 * ELE(N): emit the token currently described by `tag` and `attr` as a
 * token of kind sym_N.
 *
 * Relies on these names being in scope at the expansion site: ts, te,
 * text, ele_open, tag, attr, taint and S.
 *
 * Nothing happens unless a token span exists (te > ts) or a text run is
 * pending.  The raw source span [ts, te) is captured for every kind except
 * cdata, text, procins and comment.  With a block given, the token is
 * yielded; otherwise it goes to rb_hpricot_token() to build the tree.
 *
 * The yielded tuple now carries the captured raw string.  Previously the
 * string was built and then dropped, with nil passed in its place.
 */
#define ELE(N) \
  if (te > ts || text == 1) { \
    char *raw = NULL; \
    int rawlen = 0; \
    ele_open = 0; text = 0; \
    if (ts != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
      raw = ts; rawlen = te - ts; \
    } \
    if (rb_block_given_p()) { \
      VALUE raw_string = Qnil; \
      if (raw != NULL) raw_string = rb_str_new(raw, rawlen); \
      rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
    } else \
      rb_hpricot_token(S, sym_##N, tag, attr, raw, rawlen, taint); \
  }
|
54
|
+
|
55
|
+
/*
 * SET(N, E): store in N a new Ruby string made of the bytes from mark_N up
 * to (not including) E.  An unset mark, or a mark equal to E, yields an
 * empty string; if E lies before the mark, N is left untouched.
 */
#define SET(N, E) \
  if (mark_##N == NULL || (E) == mark_##N) \
    N = rb_str_new2(""); \
  else if ((E) > mark_##N) \
    N = rb_str_new(mark_##N, (E) - mark_##N);

/*
 * CAT(N, E): like SET when N is still nil, otherwise append the bytes
 * between mark_N and E to the existing string.
 */
#define CAT(N, E) \
  if (NIL_P(N)) { \
    SET(N, E); \
  } else { \
    rb_str_cat(N, mark_##N, (E) - mark_##N); \
  }

/*
 * SLIDE(N): after the bytes starting at ts have been moved to the front of
 * buf, shift mark_N by the same distance.  Marks at or before ts are left
 * alone.
 */
#define SLIDE(N) \
  if (mark_##N > ts) \
    mark_##N = buf + (mark_##N - ts);

/*
 * ATTR(K, V): record attribute K => V in `attr`, creating the hash on first
 * use.  A nil key is ignored.  Note that K is evaluated more than once.
 */
#define ATTR(K, V) \
  if (!NIL_P(K)) { \
    if (NIL_P(attr)) \
      attr = rb_hash_new(); \
    rb_hash_aset(attr, K, V); \
  }

/*
 * TEXT_PASS(): switch into text mode if not already in it.  The text run
 * is marked as starting at ts when an element had been opened (and ts is
 * set), otherwise at the current position p; tag and attr are reset.
 */
#define TEXT_PASS() \
  if (text == 0) \
  { \
    if (ele_open != 1) { \
      mark_tag = p; \
    } else { \
      ele_open = 0; \
      if (ts != 0) { \
        mark_tag = ts; \
      } \
    } \
    attr = Qnil; \
    tag = Qnil; \
    text = 1; \
  }

/*
 * EBLK(N, T): finish a block token of kind N whose closing delimiter is T
 * bytes long and ends at p: append everything before the delimiter to
 * `tag`, then emit the token.
 */
#define EBLK(N, T) \
  CAT(tag, p - T + 1); \
  ELE(N);
|
88
|
+
|
89
|
+
%%{
  machine hpricot_scan;

  # A new markup construct starts here: flush any pending text run as a
  # text token, then reset the per-element scratch state.
  action newEle {
    if (text == 1) {
      CAT(tag, p);
      ELE(text);
      text = 0;
    }
    ele_open = 1;
    mark_tag = NULL;
    tag = Qnil;
    attr = Qnil;
  }

  # Marker actions: remember where a tag name, attribute value or
  # attribute key begins.
  action _tag {
    mark_tag = p;
  }
  action _aval {
    mark_aval = p;
  }
  action _akey {
    mark_akey = p;
  }

  # Capture actions: cut the marked span into a Ruby string.
  action tag {
    SET(tag, p);
  }
  action tagc {
    SET(tag, p-1);
  }
  action aval {
    SET(aval, p);
  }

  # Unquoted attribute value: drop a trailing quote character if the
  # previous byte is one.
  action aunq {
    if (*(p-1) != '"' && *(p-1) != '\'') { SET(aval, p); }
    else { SET(aval, p-1); }
  }
  action akey {
    SET(akey, p);
  }

  # XML declaration and doctype fields are stored as symbol-keyed attributes.
  action xmlver {
    SET(aval, p);
    ATTR(ID2SYM(rb_intern("version")), aval);
  }
  action xmlenc {
    SET(aval, p);
    ATTR(ID2SYM(rb_intern("encoding")), aval);
  }
  action xmlsd {
    SET(aval, p);
    ATTR(ID2SYM(rb_intern("standalone")), aval);
  }
  action pubid {
    SET(aval, p);
    ATTR(ID2SYM(rb_intern("public_id")), aval);
  }
  action sysid {
    SET(aval, p);
    ATTR(ID2SYM(rb_intern("system_id")), aval);
  }

  # Start a fresh key/value pair.
  action new_attr {
    mark_aval = NULL;
    mark_akey = NULL;
    aval = Qnil;
    akey = Qnil;
  }

  # Store the completed key/value pair on the current element.
  action save_attr {
    ATTR(akey, aval);
  }

  include hpricot_common "hpricot_common.rl";

}%%
|
135
|
+
|
136
|
+
%% write data nofinal;
|
137
|
+
|
138
|
+
#define BUFSIZE 16384
|
139
|
+
|
140
|
+
/*
 * Yield one scanner token to the caller's block as the 4-element array
 * [sym, tag, attr, raw].  For text tokens the raw slot repeats the text
 * itself.  When `taint` is set, the array and each of its payload values
 * are tainted before yielding.
 */
void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
{
  VALUE token;

  if (sym == sym_text)
    raw = tag;

  token = rb_ary_new3(4, sym, tag, attr, raw);

  if (taint) {
    OBJ_TAINT(token);
    OBJ_TAINT(tag);
    OBJ_TAINT(attr);
    OBJ_TAINT(raw);
  }

  rb_yield(token);
}
|
155
|
+
|
156
|
+
/* rb_hash_lookup() is only in Ruby 1.8.7 */
|
157
|
+
static VALUE
|
158
|
+
our_rb_hash_lookup(VALUE hash, VALUE key)
|
159
|
+
{
|
160
|
+
VALUE val;
|
161
|
+
|
162
|
+
if (!st_lookup(RHASH(hash)->tbl, key, &val)) {
|
163
|
+
return Qnil; /* without Hash#default */
|
164
|
+
}
|
165
|
+
|
166
|
+
return val;
|
167
|
+
}
|
168
|
+
|
169
|
+
static void
|
170
|
+
rb_hpricot_add(VALUE focus, VALUE ele)
|
171
|
+
{
|
172
|
+
hpricot_ele *he, *he2;
|
173
|
+
Data_Get_Struct(focus, hpricot_ele, he);
|
174
|
+
Data_Get_Struct(ele, hpricot_ele, he2);
|
175
|
+
if (NIL_P(he->children))
|
176
|
+
he->children = rb_ary_new();
|
177
|
+
rb_ary_push(he->children, ele);
|
178
|
+
he2->parent = focus;
|
179
|
+
}
|
180
|
+
|
181
|
+
typedef struct {
|
182
|
+
VALUE doc;
|
183
|
+
VALUE focus;
|
184
|
+
VALUE last;
|
185
|
+
VALUE EC;
|
186
|
+
unsigned char xml, strict, fixup;
|
187
|
+
} hpricot_state;
|
188
|
+
|
189
|
+
static void
|
190
|
+
hpricot_ele_mark(hpricot_ele *he)
|
191
|
+
{
|
192
|
+
rb_gc_mark(he->tag);
|
193
|
+
rb_gc_mark(he->attr);
|
194
|
+
rb_gc_mark(he->etag);
|
195
|
+
rb_gc_mark(he->raw);
|
196
|
+
rb_gc_mark(he->parent);
|
197
|
+
rb_gc_mark(he->children);
|
198
|
+
}
|
199
|
+
|
200
|
+
static void
|
201
|
+
hpricot_ele_free(hpricot_ele *he)
|
202
|
+
{
|
203
|
+
free(he);
|
204
|
+
}
|
205
|
+
|
206
|
+
/*
 * H_PROP(prop): define hpricot_ele_set_<prop>(self, x) and
 * hpricot_ele_get_<prop>(self), which write and read the struct field
 * `prop` directly.  The setter returns self.
 */
#define H_PROP(prop) \
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) \
  { \
    hpricot_ele *node; \
    Data_Get_Struct(self, hpricot_ele, node); \
    node->prop = x; \
    return self; \
  } \
  static VALUE hpricot_ele_get_##prop(VALUE self) \
  { \
    hpricot_ele *node; \
    Data_Get_Struct(self, hpricot_ele, node); \
    return node->prop; \
  }
|
218
|
+
|
219
|
+
/*
 * H_ATTR(prop): define hpricot_ele_set_<prop>(self, x) and
 * hpricot_ele_get_<prop>(self), which store and fetch the value under the
 * symbol :<prop> in the node's attribute hash.  The setter returns self.
 *
 * A node produced by the allocator starts with attr == nil, and passing
 * nil to rb_hash_aset()/rb_hash_aref() is not valid.  The setter therefore
 * creates the hash on demand and the getter answers nil when there is none.
 */
#define H_ATTR(prop) \
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
    hpricot_ele *he; \
    Data_Get_Struct(self, hpricot_ele, he); \
    if (NIL_P(he->attr)) \
      he->attr = rb_hash_new(); \
    rb_hash_aset(he->attr, ID2SYM(rb_intern("" # prop)), x); \
    return self; \
  } \
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
    hpricot_ele *he; \
    Data_Get_Struct(self, hpricot_ele, he); \
    if (NIL_P(he->attr)) \
      return Qnil; \
    return rb_hash_aref(he->attr, ID2SYM(rb_intern("" # prop))); \
  }
|
231
|
+
|
232
|
+
H_PROP(tag);
|
233
|
+
H_PROP(attr);
|
234
|
+
H_PROP(etag);
|
235
|
+
H_PROP(parent);
|
236
|
+
H_PROP(children);
|
237
|
+
H_ATTR(encoding);
|
238
|
+
H_ATTR(version);
|
239
|
+
H_ATTR(standalone);
|
240
|
+
H_ATTR(system_id);
|
241
|
+
H_ATTR(public_id);
|
242
|
+
|
243
|
+
static VALUE
|
244
|
+
hpricot_ele_get_raw(VALUE self, VALUE x) {
|
245
|
+
hpricot_ele *he;
|
246
|
+
Data_Get_Struct(self, hpricot_ele, he);
|
247
|
+
return he->raw;
|
248
|
+
}
|
249
|
+
|
250
|
+
static VALUE
|
251
|
+
hpricot_ele_clear_raw(VALUE self)
|
252
|
+
{
|
253
|
+
hpricot_ele *he;
|
254
|
+
Data_Get_Struct(self, hpricot_ele, he);
|
255
|
+
he->raw = Qnil;
|
256
|
+
return Qtrue;
|
257
|
+
}
|
258
|
+
|
259
|
+
/*
 * H_ELE(klass): allocate and fill a node struct from the surrounding
 * locals (tag, attr, ec, raw, rawlen, sym), wrap it as an instance of
 * `klass` in `ele`, and remember it as S->last.
 *
 * The raw source text is kept only for empty, start and end tags and for
 * doctypes.  The macro declares `he`, so it must appear where a declaration
 * is allowed, and it deliberately ends without a semicolon.
 */
#define H_ELE(klass) \
  hpricot_ele *he = ALLOC(hpricot_ele); \
  he->name = 0; \
  he->tag = tag; \
  he->attr = attr; \
  he->raw = Qnil; \
  he->EC = ec; \
  he->etag = Qnil; \
  he->parent = Qnil; \
  he->children = Qnil; \
  if (raw != NULL) { \
    if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag || sym == sym_doctype) { \
      he->raw = rb_str_new(raw, rawlen); \
    } \
  } \
  ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he); \
  S->last = ele
|
272
|
+
|
273
|
+
VALUE
|
274
|
+
hpricot_ele_alloc(VALUE klass)
|
275
|
+
{
|
276
|
+
VALUE ele;
|
277
|
+
hpricot_ele *he = ALLOC(hpricot_ele);
|
278
|
+
he->name = 0;
|
279
|
+
he->tag = he->attr = he->raw = he->EC = Qnil;
|
280
|
+
he->etag = he->parent = he->children = Qnil;
|
281
|
+
ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he);
|
282
|
+
return ele;
|
283
|
+
}
|
284
|
+
|
285
|
+
//
|
286
|
+
// the swift, compact parser logic. most of the complicated stuff is done
|
287
|
+
// in the lexer. this step just pairs up the start and end tags.
|
288
|
+
//
|
289
|
+
void
|
290
|
+
rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw, int rawlen, int taint)
|
291
|
+
{
|
292
|
+
VALUE ele, ec = Qnil;
|
293
|
+
|
294
|
+
//
|
295
|
+
// in html mode, fix up start tags incorrectly formed as empty tags
|
296
|
+
//
|
297
|
+
if (!S->xml) {
|
298
|
+
hpricot_ele *last;
|
299
|
+
Data_Get_Struct(S->focus, hpricot_ele, last);
|
300
|
+
if (last->EC == sym_CDATA &&
|
301
|
+
(sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
|
302
|
+
!(sym == sym_etag && rb_str_hash(tag) == last->name))
|
303
|
+
{
|
304
|
+
sym = sym_text;
|
305
|
+
tag = rb_str_new(raw, rawlen);
|
306
|
+
}
|
307
|
+
|
308
|
+
if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
|
309
|
+
ec = rb_hash_aref(S->EC, tag);
|
310
|
+
if (NIL_P(ec)) {
|
311
|
+
tag = rb_funcall(tag, s_downcase, 0);
|
312
|
+
ec = rb_hash_aref(S->EC, tag);
|
313
|
+
}
|
314
|
+
if (sym == sym_emptytag) {
|
315
|
+
if (ec != sym_EMPTY)
|
316
|
+
sym = sym_stag;
|
317
|
+
} else if (sym == sym_stag) {
|
318
|
+
if (ec == sym_EMPTY)
|
319
|
+
sym = sym_emptytag;
|
320
|
+
}
|
321
|
+
}
|
322
|
+
}
|
323
|
+
|
324
|
+
if (sym == sym_emptytag || sym == sym_stag) {
|
325
|
+
H_ELE(cElem);
|
326
|
+
he->name = rb_str_hash(tag);
|
327
|
+
|
328
|
+
if (!S->xml) {
|
329
|
+
VALUE match = Qnil, e = S->focus;
|
330
|
+
while (e != S->doc)
|
331
|
+
{
|
332
|
+
hpricot_ele *hee;
|
333
|
+
Data_Get_Struct(e, hpricot_ele, hee);
|
334
|
+
|
335
|
+
if (TYPE(hee->EC) == T_HASH)
|
336
|
+
{
|
337
|
+
VALUE has = our_rb_hash_lookup(hee->EC, INT2NUM(he->name));
|
338
|
+
if (has != Qnil) {
|
339
|
+
if (has == Qtrue) {
|
340
|
+
if (match == Qnil)
|
341
|
+
match = e;
|
342
|
+
} else if (has == symAllow) {
|
343
|
+
match = S->focus;
|
344
|
+
} else if (has == symDeny) {
|
345
|
+
match = Qnil;
|
346
|
+
}
|
347
|
+
}
|
348
|
+
}
|
349
|
+
|
350
|
+
e = hee->parent;
|
351
|
+
}
|
352
|
+
|
353
|
+
if (match == Qnil)
|
354
|
+
match = S->focus;
|
355
|
+
S->focus = match;
|
356
|
+
}
|
357
|
+
|
358
|
+
rb_hpricot_add(S->focus, ele);
|
359
|
+
|
360
|
+
//
|
361
|
+
// in the case of a start tag that should be empty, just
|
362
|
+
// skip the step that focuses the element. focusing moves
|
363
|
+
// us deeper into the document.
|
364
|
+
//
|
365
|
+
if (sym == sym_stag) {
|
366
|
+
if (S->xml || ec != sym_EMPTY) {
|
367
|
+
S->focus = ele;
|
368
|
+
S->last = Qnil;
|
369
|
+
}
|
370
|
+
}
|
371
|
+
} else if (sym == sym_etag) {
|
372
|
+
int name;
|
373
|
+
VALUE match = Qnil, e = S->focus;
|
374
|
+
if (S->strict) {
|
375
|
+
if (NIL_P(rb_hash_aref(S->EC, tag))) {
|
376
|
+
tag = rb_str_new2("div");
|
377
|
+
}
|
378
|
+
}
|
379
|
+
|
380
|
+
//
|
381
|
+
// another optimization will be to improve this very simple
|
382
|
+
// O(n) tag search, where n is the depth of the focused tag.
|
383
|
+
//
|
384
|
+
// (see also: the search above for fixups)
|
385
|
+
//
|
386
|
+
name = rb_str_hash(tag);
|
387
|
+
while (e != S->doc)
|
388
|
+
{
|
389
|
+
hpricot_ele *he;
|
390
|
+
Data_Get_Struct(e, hpricot_ele, he);
|
391
|
+
|
392
|
+
if (he->name == name)
|
393
|
+
{
|
394
|
+
match = e;
|
395
|
+
break;
|
396
|
+
}
|
397
|
+
|
398
|
+
e = he->parent;
|
399
|
+
}
|
400
|
+
|
401
|
+
if (NIL_P(match))
|
402
|
+
{
|
403
|
+
H_ELE(cBogusETag);
|
404
|
+
rb_hpricot_add(S->focus, ele);
|
405
|
+
}
|
406
|
+
else
|
407
|
+
{
|
408
|
+
H_ELE(cETag);
|
409
|
+
Data_Get_Struct(match, hpricot_ele, he);
|
410
|
+
he->etag = ele;
|
411
|
+
S->focus = he->parent;
|
412
|
+
S->last = Qnil;
|
413
|
+
}
|
414
|
+
} else if (sym == sym_cdata) {
|
415
|
+
H_ELE(cCData);
|
416
|
+
rb_hpricot_add(S->focus, ele);
|
417
|
+
} else if (sym == sym_comment) {
|
418
|
+
H_ELE(cComment);
|
419
|
+
rb_hpricot_add(S->focus, ele);
|
420
|
+
} else if (sym == sym_doctype) {
|
421
|
+
H_ELE(cDocType);
|
422
|
+
if (S->strict) {
|
423
|
+
rb_hash_aset(attr, ID2SYM(rb_intern("system_id")), rb_str_new2("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"));
|
424
|
+
rb_hash_aset(attr, ID2SYM(rb_intern("public_id")), rb_str_new2("-//W3C//DTD XHTML 1.0 Strict//EN"));
|
425
|
+
}
|
426
|
+
rb_hpricot_add(S->focus, ele);
|
427
|
+
} else if (sym == sym_procins) {
|
428
|
+
VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
|
429
|
+
tag = rb_reg_nth_match(1, match);
|
430
|
+
attr = rb_reg_nth_match(2, match);
|
431
|
+
{
|
432
|
+
H_ELE(cProcIns);
|
433
|
+
rb_hpricot_add(S->focus, ele);
|
434
|
+
}
|
435
|
+
} else if (sym == sym_text) {
|
436
|
+
// TODO: add raw_string as well?
|
437
|
+
if (!NIL_P(S->last) && RBASIC(S->last)->klass == cText) {
|
438
|
+
hpricot_ele *he;
|
439
|
+
Data_Get_Struct(S->last, hpricot_ele, he);
|
440
|
+
rb_str_append(he->tag, tag);
|
441
|
+
} else {
|
442
|
+
H_ELE(cText);
|
443
|
+
rb_hpricot_add(S->focus, ele);
|
444
|
+
}
|
445
|
+
} else if (sym == sym_xmldecl) {
|
446
|
+
H_ELE(cXMLDecl);
|
447
|
+
rb_hpricot_add(S->focus, ele);
|
448
|
+
}
|
449
|
+
}
|
450
|
+
|
451
|
+
VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
452
|
+
{
|
453
|
+
int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
|
454
|
+
char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
|
455
|
+
|
456
|
+
hpricot_state *S = NULL;
|
457
|
+
VALUE port, opts;
|
458
|
+
VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
|
459
|
+
char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
|
460
|
+
int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
|
461
|
+
|
462
|
+
rb_scan_args(argc, argv, "11", &port, &opts);
|
463
|
+
taint = OBJ_TAINTED(port);
|
464
|
+
io = rb_respond_to(port, s_read);
|
465
|
+
if (!io)
|
466
|
+
{
|
467
|
+
if (rb_respond_to(port, s_to_str))
|
468
|
+
{
|
469
|
+
port = rb_funcall(port, s_to_str, 0);
|
470
|
+
StringValue(port);
|
471
|
+
}
|
472
|
+
else
|
473
|
+
{
|
474
|
+
rb_raise(rb_eArgError, "an Hpricot document must be built from an input source (a String or IO object.)");
|
475
|
+
}
|
476
|
+
}
|
477
|
+
|
478
|
+
if (TYPE(opts) != T_HASH)
|
479
|
+
opts = Qnil;
|
480
|
+
|
481
|
+
if (!rb_block_given_p())
|
482
|
+
{
|
483
|
+
hpricot_ele *he = ALLOC(hpricot_ele);
|
484
|
+
S = ALLOC(hpricot_state);
|
485
|
+
MEMZERO(he, hpricot_ele, 1);
|
486
|
+
he->tag = he->attr = he->etag = he->parent = he->children = Qnil;
|
487
|
+
S->doc = Data_Wrap_Struct(cDoc, hpricot_ele_mark, hpricot_ele_free, he);
|
488
|
+
rb_gc_register_address(&S->doc);
|
489
|
+
S->focus = S->doc;
|
490
|
+
S->last = Qnil;
|
491
|
+
S->xml = OPT(opts, xml);
|
492
|
+
S->strict = OPT(opts, xhtml_strict);
|
493
|
+
S->fixup = OPT(opts, fixup_tags);
|
494
|
+
if (S->strict) S->fixup = 1;
|
495
|
+
rb_ivar_set(S->doc, rb_intern("@options"), opts);
|
496
|
+
|
497
|
+
S->EC = rb_const_get(mHpricot, s_ElementContent);
|
498
|
+
}
|
499
|
+
|
500
|
+
buffer_size = BUFSIZE;
|
501
|
+
if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
|
502
|
+
bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
|
503
|
+
if (!NIL_P(bufsize)) {
|
504
|
+
buffer_size = NUM2INT(bufsize);
|
505
|
+
}
|
506
|
+
}
|
507
|
+
|
508
|
+
if (io)
|
509
|
+
buf = ALLOC_N(char, buffer_size);
|
510
|
+
|
511
|
+
%% write init;
|
512
|
+
|
513
|
+
while (!done) {
|
514
|
+
VALUE str;
|
515
|
+
char *p, *pe;
|
516
|
+
int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
|
517
|
+
|
518
|
+
if (io)
|
519
|
+
{
|
520
|
+
if (space == 0) {
|
521
|
+
/* We've used up the entire buffer storing an already-parsed token
|
522
|
+
* prefix that must be preserved. Likely caused by super-long attributes.
|
523
|
+
* Increase buffer size and continue */
|
524
|
+
tokstart_diff = ts - buf;
|
525
|
+
tokend_diff = te - buf;
|
526
|
+
mark_tag_diff = mark_tag - buf;
|
527
|
+
mark_akey_diff = mark_akey - buf;
|
528
|
+
mark_aval_diff = mark_aval - buf;
|
529
|
+
|
530
|
+
buffer_size += BUFSIZE;
|
531
|
+
REALLOC_N(buf, char, buffer_size);
|
532
|
+
|
533
|
+
space = buffer_size - have;
|
534
|
+
|
535
|
+
ts = buf + tokstart_diff;
|
536
|
+
te = buf + tokend_diff;
|
537
|
+
mark_tag = buf + mark_tag_diff;
|
538
|
+
mark_akey = buf + mark_akey_diff;
|
539
|
+
mark_aval = buf + mark_aval_diff;
|
540
|
+
}
|
541
|
+
p = buf + have;
|
542
|
+
|
543
|
+
str = rb_funcall(port, s_read, 1, INT2FIX(space));
|
544
|
+
len = RSTRING_LEN(str);
|
545
|
+
memcpy(p, StringValuePtr(str), len);
|
546
|
+
}
|
547
|
+
else
|
548
|
+
{
|
549
|
+
p = RSTRING_PTR(port);
|
550
|
+
len = RSTRING_LEN(port) + 1;
|
551
|
+
done = 1;
|
552
|
+
}
|
553
|
+
|
554
|
+
nread += len;
|
555
|
+
|
556
|
+
/* If this is the last buffer, tack on an EOF. */
|
557
|
+
if (io && len < space) {
|
558
|
+
p[len++] = 0;
|
559
|
+
done = 1;
|
560
|
+
}
|
561
|
+
|
562
|
+
pe = p + len;
|
563
|
+
%% write exec;
|
564
|
+
|
565
|
+
if (cs == hpricot_scan_error) {
|
566
|
+
if (buf != NULL)
|
567
|
+
free(buf);
|
568
|
+
if (!NIL_P(tag))
|
569
|
+
{
|
570
|
+
rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
|
571
|
+
}
|
572
|
+
else
|
573
|
+
{
|
574
|
+
rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
|
575
|
+
}
|
576
|
+
}
|
577
|
+
|
578
|
+
if (done && ele_open)
|
579
|
+
{
|
580
|
+
ele_open = 0;
|
581
|
+
if (ts > 0) {
|
582
|
+
mark_tag = ts;
|
583
|
+
ts = 0;
|
584
|
+
text = 1;
|
585
|
+
}
|
586
|
+
}
|
587
|
+
|
588
|
+
if (ts == 0)
|
589
|
+
{
|
590
|
+
have = 0;
|
591
|
+
/* text nodes have no ts because each byte is parsed alone */
|
592
|
+
if (mark_tag != NULL && text == 1)
|
593
|
+
{
|
594
|
+
if (done)
|
595
|
+
{
|
596
|
+
if (mark_tag < p-1)
|
597
|
+
{
|
598
|
+
CAT(tag, p-1);
|
599
|
+
ELE(text);
|
600
|
+
}
|
601
|
+
}
|
602
|
+
else
|
603
|
+
{
|
604
|
+
CAT(tag, p);
|
605
|
+
}
|
606
|
+
}
|
607
|
+
if (io)
|
608
|
+
mark_tag = buf;
|
609
|
+
else
|
610
|
+
mark_tag = RSTRING_PTR(port);
|
611
|
+
}
|
612
|
+
else if (io)
|
613
|
+
{
|
614
|
+
have = pe - ts;
|
615
|
+
memmove(buf, ts, have);
|
616
|
+
SLIDE(tag);
|
617
|
+
SLIDE(akey);
|
618
|
+
SLIDE(aval);
|
619
|
+
te = buf + (te - ts);
|
620
|
+
ts = buf;
|
621
|
+
}
|
622
|
+
}
|
623
|
+
|
624
|
+
if (buf != NULL)
|
625
|
+
free(buf);
|
626
|
+
|
627
|
+
if (S != NULL)
|
628
|
+
{
|
629
|
+
VALUE doc = S->doc;
|
630
|
+
rb_gc_unregister_address(&S->doc);
|
631
|
+
free(S);
|
632
|
+
return doc;
|
633
|
+
}
|
634
|
+
|
635
|
+
return Qnil;
|
636
|
+
}
|
637
|
+
|
638
|
+
void Init_hpricot_scan()
|
639
|
+
{
|
640
|
+
mHpricot = rb_define_module("Hpricot");
|
641
|
+
rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
|
642
|
+
rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
|
643
|
+
rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
|
644
|
+
rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
|
645
|
+
|
646
|
+
cDoc = rb_define_class_under(mHpricot, "Doc", rb_cObject);
|
647
|
+
rb_define_alloc_func(cDoc, hpricot_ele_alloc);
|
648
|
+
rb_define_method(cDoc, "children", hpricot_ele_get_children, 0);
|
649
|
+
rb_define_method(cDoc, "children=", hpricot_ele_set_children, 1);
|
650
|
+
|
651
|
+
cBaseEle = rb_define_class_under(mHpricot, "BaseEle", rb_cObject);
|
652
|
+
rb_define_alloc_func(cBaseEle, hpricot_ele_alloc);
|
653
|
+
rb_define_method(cBaseEle, "raw_string", hpricot_ele_get_raw, 0);
|
654
|
+
rb_define_method(cBaseEle, "clear_raw", hpricot_ele_clear_raw, 0);
|
655
|
+
rb_define_method(cBaseEle, "parent", hpricot_ele_get_parent, 0);
|
656
|
+
rb_define_method(cBaseEle, "parent=", hpricot_ele_set_parent, 1);
|
657
|
+
cCData = rb_define_class_under(mHpricot, "CData", cBaseEle);
|
658
|
+
rb_define_method(cCData, "content", hpricot_ele_get_tag, 0);
|
659
|
+
rb_define_method(cCData, "content=", hpricot_ele_set_tag, 1);
|
660
|
+
cComment = rb_define_class_under(mHpricot, "Comment", cBaseEle);
|
661
|
+
rb_define_method(cComment, "content", hpricot_ele_get_tag, 0);
|
662
|
+
rb_define_method(cComment, "content=", hpricot_ele_set_tag, 1);
|
663
|
+
cDocType = rb_define_class_under(mHpricot, "DocType", cBaseEle);
|
664
|
+
rb_define_method(cDocType, "target", hpricot_ele_get_tag, 0);
|
665
|
+
rb_define_method(cDocType, "target=", hpricot_ele_set_tag, 1);
|
666
|
+
rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
|
667
|
+
rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
|
668
|
+
rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
|
669
|
+
rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
|
670
|
+
cElem = rb_define_class_under(mHpricot, "Elem", cBaseEle);
|
671
|
+
rb_define_method(cElem, "raw_attributes", hpricot_ele_get_attr, 0);
|
672
|
+
rb_define_method(cElem, "raw_attributes=", hpricot_ele_set_attr, 1);
|
673
|
+
rb_define_method(cElem, "children", hpricot_ele_get_children, 0);
|
674
|
+
rb_define_method(cElem, "children=", hpricot_ele_set_children, 1);
|
675
|
+
rb_define_method(cElem, "etag", hpricot_ele_get_etag, 0);
|
676
|
+
rb_define_method(cElem, "etag=", hpricot_ele_set_etag, 1);
|
677
|
+
rb_define_method(cElem, "name", hpricot_ele_get_tag, 0);
|
678
|
+
rb_define_method(cElem, "name=", hpricot_ele_set_tag, 1);
|
679
|
+
cETag = rb_define_class_under(mHpricot, "ETag", cBaseEle);
|
680
|
+
rb_define_method(cETag, "name", hpricot_ele_get_tag, 0);
|
681
|
+
rb_define_method(cETag, "name=", hpricot_ele_set_tag, 1);
|
682
|
+
cBogusETag = rb_define_class_under(mHpricot, "BogusETag", cETag);
|
683
|
+
cText = rb_define_class_under(mHpricot, "Text", cBaseEle);
|
684
|
+
rb_define_method(cText, "content", hpricot_ele_get_tag, 0);
|
685
|
+
rb_define_method(cText, "content=", hpricot_ele_set_tag, 1);
|
686
|
+
cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", cBaseEle);
|
687
|
+
rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
|
688
|
+
rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
|
689
|
+
rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
|
690
|
+
rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
|
691
|
+
rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
|
692
|
+
rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
|
693
|
+
cProcIns = rb_define_class_under(mHpricot, "ProcIns", cBaseEle);
|
694
|
+
rb_define_method(cProcIns, "target", hpricot_ele_get_tag, 0);
|
695
|
+
rb_define_method(cProcIns, "target=", hpricot_ele_set_tag, 1);
|
696
|
+
rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
|
697
|
+
rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
|
698
|
+
|
699
|
+
s_ElementContent = rb_intern("ElementContent");
|
700
|
+
symAllow = ID2SYM(rb_intern("allow"));
|
701
|
+
symDeny = ID2SYM(rb_intern("deny"));
|
702
|
+
s_downcase = rb_intern("downcase");
|
703
|
+
s_new = rb_intern("new");
|
704
|
+
s_parent = rb_intern("parent");
|
705
|
+
s_read = rb_intern("read");
|
706
|
+
s_to_str = rb_intern("to_str");
|
707
|
+
iv_parent = rb_intern("parent");
|
708
|
+
sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
|
709
|
+
sym_doctype = ID2SYM(rb_intern("doctype"));
|
710
|
+
sym_procins = ID2SYM(rb_intern("procins"));
|
711
|
+
sym_stag = ID2SYM(rb_intern("stag"));
|
712
|
+
sym_etag = ID2SYM(rb_intern("etag"));
|
713
|
+
sym_emptytag = ID2SYM(rb_intern("emptytag"));
|
714
|
+
sym_comment = ID2SYM(rb_intern("comment"));
|
715
|
+
sym_cdata = ID2SYM(rb_intern("cdata"));
|
716
|
+
sym_text = ID2SYM(rb_intern("text"));
|
717
|
+
sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
|
718
|
+
sym_CDATA = ID2SYM(rb_intern("CDATA"));
|
719
|
+
|
720
|
+
rb_const_set(mHpricot, rb_intern("ProcInsParse"),
|
721
|
+
reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
|
722
|
+
}
|