hpricot 0.6.164 → 0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +6 -0
- data/Rakefile +31 -33
- data/ext/fast_xs/fast_xs.c +11 -5
- data/ext/hpricot_scan/HpricotScanService.java +10 -6
- data/ext/hpricot_scan/hpricot_css.c +3502 -0
- data/ext/hpricot_scan/hpricot_css.rl +115 -0
- data/ext/hpricot_scan/hpricot_scan.c +1032 -589
- data/ext/hpricot_scan/hpricot_scan.java.rl +5 -1
- data/ext/hpricot_scan/hpricot_scan.rl +493 -50
- data/ext/hpricot_scan/test.rb +1 -2
- data/lib/hpricot/builder.rb +21 -20
- data/lib/hpricot/elements.rb +12 -12
- data/lib/hpricot/htmlinfo.rb +19 -0
- data/lib/hpricot/inspect.rb +27 -31
- data/lib/hpricot/modules.rb +2 -1
- data/lib/hpricot/parse.rb +8 -268
- data/lib/hpricot/tag.rb +65 -99
- data/lib/hpricot/traverse.rb +20 -14
- data/test/files/boingboing.html +1 -1
- data/test/nokogiri-bench.rb +64 -0
- data/test/test_builder.rb +4 -4
- data/test/test_parser.rb +36 -13
- data/test/test_preserved.rb +6 -2
- metadata +51 -51
- data/ext/hpricot_scan/hpricot_gram.c +0 -882
- data/ext/hpricot_scan/hpricot_gram.h +0 -9
@@ -264,7 +264,11 @@ IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
|
|
264
264
|
/* We've used up the entire buffer storing an already-parsed token
|
265
265
|
* prefix that must be preserved. Likely caused by super-long attributes.
|
266
266
|
* See ticket #13. */
|
267
|
-
|
267
|
+
buffer_size += BUFSIZE;
|
268
|
+
char[] new_buf = new char[buffer_size];
|
269
|
+
System.arraycopy(buf, 0, new_buf, 0, buf.length);
|
270
|
+
buf = new_buf;
|
271
|
+
space = buffer_size - have;
|
268
272
|
}
|
269
273
|
|
270
274
|
if (port.respondsTo("read")) {
|
@@ -14,21 +14,42 @@
|
|
14
14
|
#define RSTRING_PTR(str) RSTRING(str)->ptr
|
15
15
|
#endif
|
16
16
|
|
17
|
+
VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
|
18
|
+
|
17
19
|
#define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
|
18
20
|
|
19
21
|
static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
|
20
|
-
sym_cdata, sym_text;
|
21
|
-
static VALUE rb_eHpricotParseError;
|
22
|
-
static
|
22
|
+
sym_cdata, sym_text, sym_EMPTY, sym_CDATA;
|
23
|
+
static VALUE mHpricot, rb_eHpricotParseError;
|
24
|
+
static VALUE cBaseEle, cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cETag, cText,
|
25
|
+
cXMLDecl, cProcIns, symAllow, symDeny;
|
26
|
+
static ID s_ElementContent;
|
27
|
+
static ID s_downcase, s_new, s_parent, s_read, s_to_str;
|
28
|
+
static ID iv_parent;
|
29
|
+
static VALUE reProcInsParse;
|
30
|
+
|
31
|
+
typedef struct {
|
32
|
+
int name;
|
33
|
+
VALUE tag, attr, etag, raw, EC;
|
34
|
+
VALUE parent, children;
|
35
|
+
} hpricot_ele;
|
36
|
+
|
37
|
+
/* Truthy when `opts` is non-nil and its :key entry is set to a true value. */
#define OPT(opts, key) \
  (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern(#key)))))
|
23
38
|
|
24
39
|
/*
 * ELE(N): emit the token of kind N that the scanner has just completed.
 *
 * Runs only when the token is non-empty (te > ts) or a text run is pending.
 * For every kind except cdata, text, procins and comment the raw source
 * bytes [ts, te) are captured as well.
 *
 * With a block given, the token is yielded through rb_yield_tokens(),
 * including the raw source string (nil when nothing was captured).
 * Previously the string was built and then dropped in favour of Qnil.
 * Without a block the token is handed to rb_hpricot_token(), which builds
 * the document tree.
 *
 * Expects ts, te, text, ele_open, tag, attr, taint and S in the caller's scope.
 */
#define ELE(N) \
  if (te > ts || text == 1) { \
    char *raw = NULL; \
    int rawlen = 0; \
    ele_open = 0; text = 0; \
    if (ts != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
      raw = ts; rawlen = te - ts; \
    } \
    if (rb_block_given_p()) { \
      VALUE raw_string = Qnil; \
      if (raw != NULL) raw_string = rb_str_new(raw, rawlen); \
      rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
    } else \
      rb_hpricot_token(S, sym_##N, tag, attr, raw, rawlen, taint); \
  }
|
33
54
|
|
34
55
|
#define SET(N, E) \
|
@@ -39,7 +60,7 @@ static ID s_read, s_to_str;
|
|
39
60
|
|
40
61
|
/*
 * CAT(N, E): when N is still nil, delegate to SET(N, E); otherwise append the
 * bytes between mark_N and E to the existing string N.
 */
#define CAT(N, E) \
  if (NIL_P(N)) { \
    SET(N, E); \
  } else { \
    rb_str_cat(N, mark_##N, E - mark_##N); \
  }
|
41
62
|
|
42
|
-
#define SLIDE(N) if (
|
63
|
+
/*
 * SLIDE(N): rebase mark_N so that it keeps its offset from ts once the bytes
 * starting at ts have been moved to the front of buf.  Marks at or before ts
 * are left untouched.
 */
#define SLIDE(N) \
  if (mark_##N > ts) \
    mark_##N = buf + (mark_##N - ts);
|
43
64
|
|
44
65
|
#define ATTR(K, V) \
|
45
66
|
if (!NIL_P(K)) { \
|
@@ -91,11 +112,11 @@ static ID s_read, s_to_str;
|
|
91
112
|
else { SET(aval, p); }
|
92
113
|
}
|
93
114
|
action akey { SET(akey, p); }
|
94
|
-
action xmlver { SET(aval, p); ATTR(
|
95
|
-
action xmlenc { SET(aval, p); ATTR(
|
96
|
-
action xmlsd { SET(aval, p); ATTR(
|
97
|
-
action pubid { SET(aval, p); ATTR(
|
98
|
-
action sysid { SET(aval, p); ATTR(
|
115
|
+
action xmlver { SET(aval, p); ATTR(ID2SYM(rb_intern("version")), aval); }
|
116
|
+
action xmlenc { SET(aval, p); ATTR(ID2SYM(rb_intern("encoding")), aval); }
|
117
|
+
action xmlsd { SET(aval, p); ATTR(ID2SYM(rb_intern("standalone")), aval); }
|
118
|
+
action pubid { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
|
119
|
+
action sysid { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }
|
99
120
|
|
100
121
|
action new_attr {
|
101
122
|
akey = Qnil;
|
@@ -132,29 +153,350 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
|
|
132
153
|
rb_yield(ary);
|
133
154
|
}
|
134
155
|
|
135
|
-
|
156
|
+
/* rb_hash_lookup() is only in Ruby 1.8.7 */
|
157
|
+
static VALUE
|
158
|
+
our_rb_hash_lookup(VALUE hash, VALUE key)
|
159
|
+
{
|
160
|
+
VALUE val;
|
161
|
+
|
162
|
+
if (!st_lookup(RHASH(hash)->tbl, key, &val)) {
|
163
|
+
return Qnil; /* without Hash#default */
|
164
|
+
}
|
165
|
+
|
166
|
+
return val;
|
167
|
+
}
|
168
|
+
|
169
|
+
static void
|
170
|
+
rb_hpricot_add(VALUE focus, VALUE ele)
|
171
|
+
{
|
172
|
+
hpricot_ele *he, *he2;
|
173
|
+
Data_Get_Struct(focus, hpricot_ele, he);
|
174
|
+
Data_Get_Struct(ele, hpricot_ele, he2);
|
175
|
+
if (NIL_P(he->children))
|
176
|
+
he->children = rb_ary_new();
|
177
|
+
rb_ary_push(he->children, ele);
|
178
|
+
he2->parent = focus;
|
179
|
+
}
|
180
|
+
|
181
|
+
typedef struct {
|
182
|
+
VALUE doc;
|
183
|
+
VALUE focus;
|
184
|
+
VALUE last;
|
185
|
+
VALUE EC;
|
186
|
+
unsigned char xml, strict, fixup;
|
187
|
+
} hpricot_state;
|
188
|
+
|
189
|
+
static void
|
190
|
+
hpricot_ele_mark(hpricot_ele *he)
|
191
|
+
{
|
192
|
+
rb_gc_mark(he->tag);
|
193
|
+
rb_gc_mark(he->attr);
|
194
|
+
rb_gc_mark(he->etag);
|
195
|
+
rb_gc_mark(he->raw);
|
196
|
+
rb_gc_mark(he->parent);
|
197
|
+
rb_gc_mark(he->children);
|
198
|
+
}
|
199
|
+
|
200
|
+
static void
|
201
|
+
hpricot_ele_free(hpricot_ele *he)
|
202
|
+
{
|
203
|
+
free(he);
|
204
|
+
}
|
205
|
+
|
206
|
+
/*
 * H_PROP(prop): define hpricot_ele_set_<prop> and hpricot_ele_get_<prop>,
 * a writer/reader pair over the struct member `prop` of the wrapped
 * hpricot_ele.  The writer returns self; the reader returns the member.
 */
#define H_PROP(prop) \
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
    hpricot_ele *node; \
    Data_Get_Struct(self, hpricot_ele, node); \
    node->prop = x; \
    return self; \
  } \
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
    hpricot_ele *node; \
    Data_Get_Struct(self, hpricot_ele, node); \
    return node->prop; \
  }
|
218
|
+
|
219
|
+
/*
 * H_ATTR(prop): define hpricot_ele_set_<prop> and hpricot_ele_get_<prop>,
 * which store and fetch the :<prop> entry of the node's attribute hash.
 *
 * Nodes created through hpricot_ele_alloc() start with attr == nil, so the
 * writer creates the hash on demand and the reader returns nil instead of
 * passing nil to the hash functions.
 */
#define H_ATTR(prop) \
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
    hpricot_ele *he; \
    Data_Get_Struct(self, hpricot_ele, he); \
    if (NIL_P(he->attr)) he->attr = rb_hash_new(); \
    rb_hash_aset(he->attr, ID2SYM(rb_intern("" # prop)), x); \
    return self; \
  } \
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
    hpricot_ele *he; \
    Data_Get_Struct(self, hpricot_ele, he); \
    if (NIL_P(he->attr)) return Qnil; \
    return rb_hash_aref(he->attr, ID2SYM(rb_intern("" # prop))); \
  }
|
231
|
+
|
232
|
+
H_PROP(tag);
|
233
|
+
H_PROP(attr);
|
234
|
+
H_PROP(etag);
|
235
|
+
H_PROP(parent);
|
236
|
+
H_PROP(children);
|
237
|
+
H_ATTR(encoding);
|
238
|
+
H_ATTR(version);
|
239
|
+
H_ATTR(standalone);
|
240
|
+
H_ATTR(system_id);
|
241
|
+
H_ATTR(public_id);
|
242
|
+
|
243
|
+
static VALUE
|
244
|
+
hpricot_ele_get_raw(VALUE self, VALUE x) {
|
245
|
+
hpricot_ele *he;
|
246
|
+
Data_Get_Struct(self, hpricot_ele, he);
|
247
|
+
return he->raw;
|
248
|
+
}
|
249
|
+
|
250
|
+
static VALUE
|
251
|
+
hpricot_ele_clear_raw(VALUE self)
|
252
|
+
{
|
253
|
+
hpricot_ele *he;
|
254
|
+
Data_Get_Struct(self, hpricot_ele, he);
|
255
|
+
he->raw = Qnil;
|
256
|
+
return Qtrue;
|
257
|
+
}
|
258
|
+
|
259
|
+
/*
 * H_ELE(klass): allocate a hpricot_ele named `he`, fill it from the caller's
 * tag, attr, ec, raw, rawlen and sym, wrap it as an instance of `klass` in
 * `ele`, and remember it as S->last.  The raw source text is kept only for
 * empty tags, start tags, end tags and doctypes.
 *
 * Declares `he`, so it must appear where a declaration is allowed.  The
 * expansion has no trailing semicolon; the call site supplies it.
 */
#define H_ELE(klass) \
  hpricot_ele *he = ALLOC(hpricot_ele); \
  he->name = 0; \
  he->tag = tag; \
  he->attr = attr; \
  he->EC = ec; \
  he->etag = Qnil; \
  he->parent = Qnil; \
  he->children = Qnil; \
  he->raw = Qnil; \
  if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_etag || sym == sym_doctype)) { \
    he->raw = rb_str_new(raw, rawlen); \
  } \
  ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he); \
  S->last = ele
|
272
|
+
|
273
|
+
VALUE
|
274
|
+
hpricot_ele_alloc(VALUE klass)
|
275
|
+
{
|
276
|
+
VALUE ele;
|
277
|
+
hpricot_ele *he = ALLOC(hpricot_ele);
|
278
|
+
he->name = 0;
|
279
|
+
he->tag = he->attr = he->raw = he->EC = Qnil;
|
280
|
+
he->etag = he->parent = he->children = Qnil;
|
281
|
+
ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he);
|
282
|
+
return ele;
|
283
|
+
}
|
284
|
+
|
285
|
+
//
|
286
|
+
// the swift, compact parser logic. most of the complicated stuff is done
|
287
|
+
// in the lexer. this step just pairs up the start and end tags.
|
288
|
+
//
|
289
|
+
void
|
290
|
+
rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw, int rawlen, int taint)
|
291
|
+
{
|
292
|
+
VALUE ele, ec = Qnil;
|
293
|
+
|
294
|
+
//
|
295
|
+
// in html mode, fix up start tags incorrectly formed as empty tags
|
296
|
+
//
|
297
|
+
if (!S->xml) {
|
298
|
+
hpricot_ele *last;
|
299
|
+
Data_Get_Struct(S->focus, hpricot_ele, last);
|
300
|
+
if (last->EC == sym_CDATA &&
|
301
|
+
(sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
|
302
|
+
!(sym == sym_etag && rb_str_hash(tag) == last->name))
|
303
|
+
{
|
304
|
+
sym = sym_text;
|
305
|
+
tag = rb_str_new(raw, rawlen);
|
306
|
+
}
|
307
|
+
|
308
|
+
if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
|
309
|
+
ec = rb_hash_aref(S->EC, tag);
|
310
|
+
if (NIL_P(ec)) {
|
311
|
+
tag = rb_funcall(tag, s_downcase, 0);
|
312
|
+
ec = rb_hash_aref(S->EC, tag);
|
313
|
+
}
|
314
|
+
if (sym == sym_emptytag) {
|
315
|
+
if (ec != sym_EMPTY)
|
316
|
+
sym = sym_stag;
|
317
|
+
} else if (sym == sym_stag) {
|
318
|
+
if (ec == sym_EMPTY)
|
319
|
+
sym = sym_emptytag;
|
320
|
+
}
|
321
|
+
}
|
322
|
+
}
|
323
|
+
|
324
|
+
if (sym == sym_emptytag || sym == sym_stag) {
|
325
|
+
H_ELE(cElem);
|
326
|
+
he->name = rb_str_hash(tag);
|
327
|
+
|
328
|
+
if (!S->xml) {
|
329
|
+
VALUE match = Qnil, e = S->focus;
|
330
|
+
while (e != S->doc)
|
331
|
+
{
|
332
|
+
hpricot_ele *hee;
|
333
|
+
Data_Get_Struct(e, hpricot_ele, hee);
|
334
|
+
|
335
|
+
if (TYPE(hee->EC) == T_HASH)
|
336
|
+
{
|
337
|
+
VALUE has = our_rb_hash_lookup(hee->EC, INT2NUM(he->name));
|
338
|
+
if (has != Qnil) {
|
339
|
+
if (has == Qtrue) {
|
340
|
+
if (match == Qnil)
|
341
|
+
match = e;
|
342
|
+
} else if (has == symAllow) {
|
343
|
+
match = S->focus;
|
344
|
+
} else if (has == symDeny) {
|
345
|
+
match = Qnil;
|
346
|
+
}
|
347
|
+
}
|
348
|
+
}
|
349
|
+
|
350
|
+
e = hee->parent;
|
351
|
+
}
|
352
|
+
|
353
|
+
if (match == Qnil)
|
354
|
+
match = S->focus;
|
355
|
+
S->focus = match;
|
356
|
+
}
|
357
|
+
|
358
|
+
rb_hpricot_add(S->focus, ele);
|
359
|
+
|
360
|
+
//
|
361
|
+
// in the case of a start tag that should be empty, just
|
362
|
+
// skip the step that focuses the element. focusing moves
|
363
|
+
// us deeper into the document.
|
364
|
+
//
|
365
|
+
if (sym == sym_stag) {
|
366
|
+
if (S->xml || ec != sym_EMPTY) {
|
367
|
+
S->focus = ele;
|
368
|
+
S->last = Qnil;
|
369
|
+
}
|
370
|
+
}
|
371
|
+
} else if (sym == sym_etag) {
|
372
|
+
int name;
|
373
|
+
VALUE match = Qnil, e = S->focus;
|
374
|
+
if (S->strict) {
|
375
|
+
if (NIL_P(rb_hash_aref(S->EC, tag))) {
|
376
|
+
tag = rb_str_new2("div");
|
377
|
+
}
|
378
|
+
}
|
379
|
+
|
380
|
+
//
|
381
|
+
// another optimization will be to improve this very simple
|
382
|
+
// O(n) tag search, where n is the depth of the focused tag.
|
383
|
+
//
|
384
|
+
// (see also: the search above for fixups)
|
385
|
+
//
|
386
|
+
name = rb_str_hash(tag);
|
387
|
+
while (e != S->doc)
|
388
|
+
{
|
389
|
+
hpricot_ele *he;
|
390
|
+
Data_Get_Struct(e, hpricot_ele, he);
|
391
|
+
|
392
|
+
if (he->name == name)
|
393
|
+
{
|
394
|
+
match = e;
|
395
|
+
break;
|
396
|
+
}
|
397
|
+
|
398
|
+
e = he->parent;
|
399
|
+
}
|
400
|
+
|
401
|
+
if (NIL_P(match))
|
402
|
+
{
|
403
|
+
H_ELE(cBogusETag);
|
404
|
+
rb_hpricot_add(S->focus, ele);
|
405
|
+
}
|
406
|
+
else
|
407
|
+
{
|
408
|
+
H_ELE(cETag);
|
409
|
+
Data_Get_Struct(match, hpricot_ele, he);
|
410
|
+
he->etag = ele;
|
411
|
+
S->focus = he->parent;
|
412
|
+
S->last = Qnil;
|
413
|
+
}
|
414
|
+
} else if (sym == sym_cdata) {
|
415
|
+
H_ELE(cCData);
|
416
|
+
rb_hpricot_add(S->focus, ele);
|
417
|
+
} else if (sym == sym_comment) {
|
418
|
+
H_ELE(cComment);
|
419
|
+
rb_hpricot_add(S->focus, ele);
|
420
|
+
} else if (sym == sym_doctype) {
|
421
|
+
H_ELE(cDocType);
|
422
|
+
if (S->strict) {
|
423
|
+
rb_hash_aset(attr, ID2SYM(rb_intern("system_id")), rb_str_new2("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"));
|
424
|
+
rb_hash_aset(attr, ID2SYM(rb_intern("public_id")), rb_str_new2("-//W3C//DTD XHTML 1.0 Strict//EN"));
|
425
|
+
}
|
426
|
+
rb_hpricot_add(S->focus, ele);
|
427
|
+
} else if (sym == sym_procins) {
|
428
|
+
VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
|
429
|
+
tag = rb_reg_nth_match(1, match);
|
430
|
+
attr = rb_reg_nth_match(2, match);
|
431
|
+
{
|
432
|
+
H_ELE(cProcIns);
|
433
|
+
rb_hpricot_add(S->focus, ele);
|
434
|
+
}
|
435
|
+
} else if (sym == sym_text) {
|
436
|
+
// TODO: add raw_string as well?
|
437
|
+
if (!NIL_P(S->last) && RBASIC(S->last)->klass == cText) {
|
438
|
+
hpricot_ele *he;
|
439
|
+
Data_Get_Struct(S->last, hpricot_ele, he);
|
440
|
+
rb_str_append(he->tag, tag);
|
441
|
+
} else {
|
442
|
+
H_ELE(cText);
|
443
|
+
rb_hpricot_add(S->focus, ele);
|
444
|
+
}
|
445
|
+
} else if (sym == sym_xmldecl) {
|
446
|
+
H_ELE(cXMLDecl);
|
447
|
+
rb_hpricot_add(S->focus, ele);
|
448
|
+
}
|
449
|
+
}
|
450
|
+
|
451
|
+
VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
136
452
|
{
|
137
|
-
int cs, act, have = 0, nread = 0, curline = 1, text = 0;
|
453
|
+
int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
|
138
454
|
char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
|
139
455
|
|
456
|
+
hpricot_state *S = NULL;
|
457
|
+
VALUE port, opts;
|
140
458
|
VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
|
141
459
|
char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
|
142
|
-
int done = 0, ele_open = 0, buffer_size = 0;
|
460
|
+
int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
|
143
461
|
|
144
|
-
|
145
|
-
|
462
|
+
rb_scan_args(argc, argv, "11", &port, &opts);
|
463
|
+
taint = OBJ_TAINTED(port);
|
464
|
+
io = rb_respond_to(port, s_read);
|
465
|
+
if (!io)
|
146
466
|
{
|
147
|
-
if (
|
467
|
+
if (rb_respond_to(port, s_to_str))
|
148
468
|
{
|
149
|
-
port = rb_funcall(
|
469
|
+
port = rb_funcall(port, s_to_str, 0);
|
150
470
|
StringValue(port);
|
151
471
|
}
|
152
472
|
else
|
153
473
|
{
|
154
|
-
rb_raise(
|
474
|
+
rb_raise(rb_eArgError, "an Hpricot document must be built from an input source (a String or IO object.)");
|
155
475
|
}
|
156
476
|
}
|
157
477
|
|
478
|
+
if (TYPE(opts) != T_HASH)
|
479
|
+
opts = Qnil;
|
480
|
+
|
481
|
+
if (!rb_block_given_p())
|
482
|
+
{
|
483
|
+
hpricot_ele *he = ALLOC(hpricot_ele);
|
484
|
+
S = ALLOC(hpricot_state);
|
485
|
+
MEMZERO(he, hpricot_ele, 1);
|
486
|
+
he->tag = he->attr = he->etag = he->parent = he->children = Qnil;
|
487
|
+
S->doc = Data_Wrap_Struct(cDoc, hpricot_ele_mark, hpricot_ele_free, he);
|
488
|
+
rb_gc_register_address(&S->doc);
|
489
|
+
S->focus = S->doc;
|
490
|
+
S->last = Qnil;
|
491
|
+
S->xml = OPT(opts, xml);
|
492
|
+
S->strict = OPT(opts, xhtml_strict);
|
493
|
+
S->fixup = OPT(opts, fixup_tags);
|
494
|
+
if (S->strict) S->fixup = 1;
|
495
|
+
rb_ivar_set(S->doc, rb_intern("@options"), opts);
|
496
|
+
|
497
|
+
S->EC = rb_const_get(mHpricot, s_ElementContent);
|
498
|
+
}
|
499
|
+
|
158
500
|
buffer_size = BUFSIZE;
|
159
501
|
if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
|
160
502
|
bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
|
@@ -162,38 +504,57 @@ VALUE hpricot_scan(VALUE self, VALUE port)
|
|
162
504
|
buffer_size = NUM2INT(bufsize);
|
163
505
|
}
|
164
506
|
}
|
165
|
-
|
507
|
+
|
508
|
+
if (io)
|
509
|
+
buf = ALLOC_N(char, buffer_size);
|
166
510
|
|
167
511
|
%% write init;
|
168
512
|
|
169
|
-
while (
|
513
|
+
while (!done) {
|
170
514
|
VALUE str;
|
171
|
-
char *p
|
172
|
-
int len, space = buffer_size - have;
|
173
|
-
|
174
|
-
if ( space == 0 ) {
|
175
|
-
/* We've used up the entire buffer storing an already-parsed token
|
176
|
-
* prefix that must be preserved. Likely caused by super-long attributes.
|
177
|
-
* See ticket #13. */
|
178
|
-
rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING_PTR(tag), curline);
|
179
|
-
}
|
515
|
+
char *p, *pe;
|
516
|
+
int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
|
180
517
|
|
181
|
-
if (
|
518
|
+
if (io)
|
182
519
|
{
|
183
|
-
|
520
|
+
if (space == 0) {
|
521
|
+
/* We've used up the entire buffer storing an already-parsed token
|
522
|
+
* prefix that must be preserved. Likely caused by super-long attributes.
|
523
|
+
* Increase buffer size and continue */
|
524
|
+
tokstart_diff = ts - buf;
|
525
|
+
tokend_diff = te - buf;
|
526
|
+
mark_tag_diff = mark_tag - buf;
|
527
|
+
mark_akey_diff = mark_akey - buf;
|
528
|
+
mark_aval_diff = mark_aval - buf;
|
529
|
+
|
530
|
+
buffer_size += BUFSIZE;
|
531
|
+
REALLOC_N(buf, char, buffer_size);
|
532
|
+
|
533
|
+
space = buffer_size - have;
|
534
|
+
|
535
|
+
ts = buf + tokstart_diff;
|
536
|
+
te = buf + tokend_diff;
|
537
|
+
mark_tag = buf + mark_tag_diff;
|
538
|
+
mark_akey = buf + mark_akey_diff;
|
539
|
+
mark_aval = buf + mark_aval_diff;
|
540
|
+
}
|
541
|
+
p = buf + have;
|
542
|
+
|
543
|
+
str = rb_funcall(port, s_read, 1, INT2FIX(space));
|
544
|
+
len = RSTRING_LEN(str);
|
545
|
+
memcpy(p, StringValuePtr(str), len);
|
184
546
|
}
|
185
547
|
else
|
186
548
|
{
|
187
|
-
|
549
|
+
p = RSTRING_PTR(port);
|
550
|
+
len = RSTRING_LEN(port) + 1;
|
551
|
+
done = 1;
|
188
552
|
}
|
189
553
|
|
190
|
-
StringValue(str);
|
191
|
-
memcpy( p, RSTRING_PTR(str), RSTRING_LEN(str) );
|
192
|
-
len = RSTRING_LEN(str);
|
193
554
|
nread += len;
|
194
555
|
|
195
556
|
/* If this is the last buffer, tack on an EOF. */
|
196
|
-
if ( len < space
|
557
|
+
if (io && len < space) {
|
197
558
|
p[len++] = 0;
|
198
559
|
done = 1;
|
199
560
|
}
|
@@ -201,9 +562,10 @@ VALUE hpricot_scan(VALUE self, VALUE port)
|
|
201
562
|
pe = p + len;
|
202
563
|
%% write exec;
|
203
564
|
|
204
|
-
if (
|
205
|
-
|
206
|
-
|
565
|
+
if (cs == hpricot_scan_error) {
|
566
|
+
if (buf != NULL)
|
567
|
+
free(buf);
|
568
|
+
if (!NIL_P(tag))
|
207
569
|
{
|
208
570
|
rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
|
209
571
|
}
|
@@ -213,7 +575,7 @@ VALUE hpricot_scan(VALUE self, VALUE port)
|
|
213
575
|
}
|
214
576
|
}
|
215
577
|
|
216
|
-
if (
|
578
|
+
if (done && ele_open)
|
217
579
|
{
|
218
580
|
ele_open = 0;
|
219
581
|
if (ts > 0) {
|
@@ -223,11 +585,11 @@ VALUE hpricot_scan(VALUE self, VALUE port)
|
|
223
585
|
}
|
224
586
|
}
|
225
587
|
|
226
|
-
if (
|
588
|
+
if (ts == 0)
|
227
589
|
{
|
228
590
|
have = 0;
|
229
591
|
/* text nodes have no ts because each byte is parsed alone */
|
230
|
-
if (
|
592
|
+
if (mark_tag != NULL && text == 1)
|
231
593
|
{
|
232
594
|
if (done)
|
233
595
|
{
|
@@ -242,12 +604,15 @@ VALUE hpricot_scan(VALUE self, VALUE port)
|
|
242
604
|
CAT(tag, p);
|
243
605
|
}
|
244
606
|
}
|
245
|
-
|
607
|
+
if (io)
|
608
|
+
mark_tag = buf;
|
609
|
+
else
|
610
|
+
mark_tag = RSTRING_PTR(port);
|
246
611
|
}
|
247
|
-
else
|
612
|
+
else if (io)
|
248
613
|
{
|
249
614
|
have = pe - ts;
|
250
|
-
memmove(
|
615
|
+
memmove(buf, ts, have);
|
251
616
|
SLIDE(tag);
|
252
617
|
SLIDE(akey);
|
253
618
|
SLIDE(aval);
|
@@ -255,18 +620,91 @@ VALUE hpricot_scan(VALUE self, VALUE port)
|
|
255
620
|
ts = buf;
|
256
621
|
}
|
257
622
|
}
|
258
|
-
|
623
|
+
|
624
|
+
if (buf != NULL)
|
625
|
+
free(buf);
|
626
|
+
|
627
|
+
if (S != NULL)
|
628
|
+
{
|
629
|
+
VALUE doc = S->doc;
|
630
|
+
rb_gc_unregister_address(&S->doc);
|
631
|
+
free(S);
|
632
|
+
return doc;
|
633
|
+
}
|
634
|
+
|
635
|
+
return Qnil;
|
259
636
|
}
|
260
637
|
|
261
638
|
void Init_hpricot_scan()
|
262
639
|
{
|
263
|
-
|
640
|
+
mHpricot = rb_define_module("Hpricot");
|
264
641
|
rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
|
265
|
-
rb_define_singleton_method(mHpricot, "scan", hpricot_scan, 1);
|
642
|
+
rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
|
643
|
+
rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
|
266
644
|
rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
|
267
645
|
|
646
|
+
cDoc = rb_define_class_under(mHpricot, "Doc", rb_cObject);
|
647
|
+
rb_define_alloc_func(cDoc, hpricot_ele_alloc);
|
648
|
+
rb_define_method(cDoc, "children", hpricot_ele_get_children, 0);
|
649
|
+
rb_define_method(cDoc, "children=", hpricot_ele_set_children, 1);
|
650
|
+
|
651
|
+
cBaseEle = rb_define_class_under(mHpricot, "BaseEle", rb_cObject);
|
652
|
+
rb_define_alloc_func(cBaseEle, hpricot_ele_alloc);
|
653
|
+
rb_define_method(cBaseEle, "raw_string", hpricot_ele_get_raw, 0);
|
654
|
+
rb_define_method(cBaseEle, "clear_raw", hpricot_ele_clear_raw, 0);
|
655
|
+
rb_define_method(cBaseEle, "parent", hpricot_ele_get_parent, 0);
|
656
|
+
rb_define_method(cBaseEle, "parent=", hpricot_ele_set_parent, 1);
|
657
|
+
cCData = rb_define_class_under(mHpricot, "CData", cBaseEle);
|
658
|
+
rb_define_method(cCData, "content", hpricot_ele_get_tag, 0);
|
659
|
+
rb_define_method(cCData, "content=", hpricot_ele_set_tag, 1);
|
660
|
+
cComment = rb_define_class_under(mHpricot, "Comment", cBaseEle);
|
661
|
+
rb_define_method(cComment, "content", hpricot_ele_get_tag, 0);
|
662
|
+
rb_define_method(cComment, "content=", hpricot_ele_set_tag, 1);
|
663
|
+
cDocType = rb_define_class_under(mHpricot, "DocType", cBaseEle);
|
664
|
+
rb_define_method(cDocType, "target", hpricot_ele_get_tag, 0);
|
665
|
+
rb_define_method(cDocType, "target=", hpricot_ele_set_tag, 1);
|
666
|
+
rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
|
667
|
+
rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
|
668
|
+
rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
|
669
|
+
rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
|
670
|
+
cElem = rb_define_class_under(mHpricot, "Elem", cBaseEle);
|
671
|
+
rb_define_method(cElem, "raw_attributes", hpricot_ele_get_attr, 0);
|
672
|
+
rb_define_method(cElem, "raw_attributes=", hpricot_ele_set_attr, 1);
|
673
|
+
rb_define_method(cElem, "children", hpricot_ele_get_children, 0);
|
674
|
+
rb_define_method(cElem, "children=", hpricot_ele_set_children, 1);
|
675
|
+
rb_define_method(cElem, "etag", hpricot_ele_get_etag, 0);
|
676
|
+
rb_define_method(cElem, "etag=", hpricot_ele_set_etag, 1);
|
677
|
+
rb_define_method(cElem, "name", hpricot_ele_get_tag, 0);
|
678
|
+
rb_define_method(cElem, "name=", hpricot_ele_set_tag, 1);
|
679
|
+
cETag = rb_define_class_under(mHpricot, "ETag", cBaseEle);
|
680
|
+
rb_define_method(cETag, "name", hpricot_ele_get_tag, 0);
|
681
|
+
rb_define_method(cETag, "name=", hpricot_ele_set_tag, 1);
|
682
|
+
cBogusETag = rb_define_class_under(mHpricot, "BogusETag", cETag);
|
683
|
+
cText = rb_define_class_under(mHpricot, "Text", cBaseEle);
|
684
|
+
rb_define_method(cText, "content", hpricot_ele_get_tag, 0);
|
685
|
+
rb_define_method(cText, "content=", hpricot_ele_set_tag, 1);
|
686
|
+
cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", cBaseEle);
|
687
|
+
rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
|
688
|
+
rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
|
689
|
+
rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
|
690
|
+
rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
|
691
|
+
rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
|
692
|
+
rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
|
693
|
+
cProcIns = rb_define_class_under(mHpricot, "ProcIns", cBaseEle);
|
694
|
+
rb_define_method(cProcIns, "target", hpricot_ele_get_tag, 0);
|
695
|
+
rb_define_method(cProcIns, "target=", hpricot_ele_set_tag, 1);
|
696
|
+
rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
|
697
|
+
rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
|
698
|
+
|
699
|
+
s_ElementContent = rb_intern("ElementContent");
|
700
|
+
symAllow = ID2SYM(rb_intern("allow"));
|
701
|
+
symDeny = ID2SYM(rb_intern("deny"));
|
702
|
+
s_downcase = rb_intern("downcase");
|
703
|
+
s_new = rb_intern("new");
|
704
|
+
s_parent = rb_intern("parent");
|
268
705
|
s_read = rb_intern("read");
|
269
706
|
s_to_str = rb_intern("to_str");
|
707
|
+
iv_parent = rb_intern("parent");
|
270
708
|
sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
|
271
709
|
sym_doctype = ID2SYM(rb_intern("doctype"));
|
272
710
|
sym_procins = ID2SYM(rb_intern("procins"));
|
@@ -276,4 +714,9 @@ void Init_hpricot_scan()
|
|
276
714
|
sym_comment = ID2SYM(rb_intern("comment"));
|
277
715
|
sym_cdata = ID2SYM(rb_intern("cdata"));
|
278
716
|
sym_text = ID2SYM(rb_intern("text"));
|
717
|
+
sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
|
718
|
+
sym_CDATA = ID2SYM(rb_intern("CDATA"));
|
719
|
+
|
720
|
+
rb_const_set(mHpricot, rb_intern("ProcInsParse"),
|
721
|
+
reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
|
279
722
|
}
|