hpricot 0.7-x86-mswin32

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/CHANGELOG +68 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +260 -0
  5. data/ext/fast_xs/FastXsService.java +1018 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +200 -0
  8. data/ext/hpricot_scan/HpricotScanService.java +1305 -0
  9. data/ext/hpricot_scan/extconf.rb +6 -0
  10. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  11. data/ext/hpricot_scan/hpricot_css.c +3502 -0
  12. data/ext/hpricot_scan/hpricot_css.rl +115 -0
  13. data/ext/hpricot_scan/hpricot_scan.c +6704 -0
  14. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  15. data/ext/hpricot_scan/hpricot_scan.java.rl +373 -0
  16. data/ext/hpricot_scan/hpricot_scan.rl +722 -0
  17. data/ext/hpricot_scan/test.rb +4 -0
  18. data/extras/mingw-rbconfig.rb +176 -0
  19. data/lib/fast_xs.so +0 -0
  20. data/lib/hpricot.rb +26 -0
  21. data/lib/hpricot/blankslate.rb +63 -0
  22. data/lib/hpricot/builder.rb +216 -0
  23. data/lib/hpricot/elements.rb +510 -0
  24. data/lib/hpricot/htmlinfo.rb +691 -0
  25. data/lib/hpricot/inspect.rb +103 -0
  26. data/lib/hpricot/modules.rb +38 -0
  27. data/lib/hpricot/parse.rb +38 -0
  28. data/lib/hpricot/tag.rb +198 -0
  29. data/lib/hpricot/tags.rb +164 -0
  30. data/lib/hpricot/traverse.rb +838 -0
  31. data/lib/hpricot/xchar.rb +94 -0
  32. data/lib/hpricot_scan.so +0 -0
  33. data/test/files/basic.xhtml +17 -0
  34. data/test/files/boingboing.html +2266 -0
  35. data/test/files/cy0.html +3653 -0
  36. data/test/files/immob.html +400 -0
  37. data/test/files/pace_application.html +1320 -0
  38. data/test/files/tenderlove.html +16 -0
  39. data/test/files/uswebgen.html +220 -0
  40. data/test/files/utf8.html +1054 -0
  41. data/test/files/week9.html +1723 -0
  42. data/test/files/why.xml +19 -0
  43. data/test/load_files.rb +7 -0
  44. data/test/nokogiri-bench.rb +64 -0
  45. data/test/test_alter.rb +77 -0
  46. data/test/test_builder.rb +37 -0
  47. data/test/test_parser.rb +409 -0
  48. data/test/test_paths.rb +25 -0
  49. data/test/test_preserved.rb +70 -0
  50. data/test/test_xml.rb +28 -0
  51. metadata +111 -0
@@ -0,0 +1,722 @@
1
+ /*
2
+ * hpricot_scan.rl
3
+ *
4
+ * $Author: why $
5
+ * $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
6
+ *
7
+ * Copyright (C) 2006 why the lucky stiff
8
+ */
9
+ #include <ruby.h>
10
+
11
+ #ifndef RARRAY_LEN
12
+ #define RARRAY_LEN(arr) RARRAY(arr)->len
13
+ #define RSTRING_LEN(str) RSTRING(str)->len
14
+ #define RSTRING_PTR(str) RSTRING(str)->ptr
15
+ #endif
16
+
17
/*
 * Defined outside this file; registered as Hpricot.css in Init_hpricot_scan.
 * NOTE(review): this prototype lists five VALUE parameters, but the method is
 * registered with arity 3 (self + 3 arguments = 4 C parameters) -- confirm
 * against the definition of hpricot_css.
 */
VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);

/* Suffix appended to every ParseError message raised by hpricot_scan. */
#define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"

/* Token-kind symbols plus the :EMPTY / :CDATA content markers; all are interned in Init_hpricot_scan. */
static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
  sym_cdata, sym_text, sym_EMPTY, sym_CDATA;
/* The Hpricot module and its ParseError exception class. */
static VALUE mHpricot, rb_eHpricotParseError;
/* Node classes defined under Hpricot, and the :allow / :deny symbols used by the tag fixup search. */
static VALUE cBaseEle, cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cETag, cText,
  cXMLDecl, cProcIns, symAllow, symDeny;
/* Interned method / constant names, initialised in Init_hpricot_scan. */
static ID s_ElementContent;
static ID s_downcase, s_new, s_parent, s_read, s_to_str;
static ID iv_parent;
/* Regexp used to split a processing instruction into target and content. */
static VALUE reProcInsParse;
30
+
31
/*
 * C-side payload wrapped (via Data_Wrap_Struct) by every Hpricot node object
 * created in this file.
 */
typedef struct {
  int name;                        /* rb_str_hash() of the tag name for elements; 0 otherwise */
  VALUE tag, attr, etag, raw, EC;  /* tag/content, attributes, matching end tag, raw source text,
                                      and the ElementContent entry looked up for the tag */
  VALUE parent, children;          /* tree links maintained by rb_hpricot_add */
} hpricot_ele;
36
+
37
/* True when `opts` is non-nil and opts[:key] is truthy; `key` is stringified into the symbol name. */
#define OPT(opts, key) (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("" # key)))))

/*
 * Emit one token of kind N. Expands inside hpricot_scan and relies on its
 * locals (ts, te, text, ele_open, tag, attr, taint, S).
 * Fires only when a token span exists (te > ts) or a text run is pending.
 * The raw source span is captured only for kinds other than cdata, text,
 * procins and comment, and only when ts is set.
 * With a block the token is yielded; otherwise it is fed to the tree builder.
 * NOTE(review): in the block case raw_string is built but Qnil is what gets
 * passed to rb_yield_tokens -- confirm whether that is intentional.
 */
#define ELE(N) \
  if (te > ts || text == 1) { \
    char *raw = NULL; \
    int rawlen = 0; \
    ele_open = 0; text = 0; \
    if (ts != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
      raw = ts; rawlen = te - ts; \
    } \
    if (rb_block_given_p()) { \
      VALUE raw_string = Qnil; \
      if (raw != NULL) raw_string = rb_str_new(raw, rawlen); \
      rb_yield_tokens(sym_##N, tag, attr, Qnil, taint); \
    } else \
      rb_hpricot_token(S, sym_##N, tag, attr, raw, rawlen, taint); \
  }

/*
 * Assign N a new Ruby string covering [mark_N, E).
 * An unset mark or an empty span yields ""; if E lies before the mark, N is
 * left untouched. Expands to an unbraced if / else-if, so wrap it in braces
 * when used under another `if`.
 */
#define SET(N, E) \
  if (mark_##N == NULL || E == mark_##N) \
    N = rb_str_new2(""); \
  else if (E > mark_##N) \
    N = rb_str_new(mark_##N, E - mark_##N);

/* Append [mark_N, E) to N, creating N through SET when it is still nil. */
#define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }

/* After the unparsed tail has been moved to the start of buf, rebase mark_N if it pointed past ts. */
#define SLIDE(N) if (mark_##N > ts) mark_##N = buf + (mark_##N - ts);

/* Store K => V in the `attr` hash, creating the hash lazily; a nil key is ignored. */
#define ATTR(K, V) \
  if (!NIL_P(K)) { \
    if (NIL_P(attr)) attr = rb_hash_new(); \
    rb_hash_aset(attr, K, V); \
  }

/*
 * Enter text mode if not already in it. When an element had been opened the
 * text run starts at ts (if ts is set), otherwise at the current position p.
 * Clears the pending tag and attr.
 */
#define TEXT_PASS() \
  if (text == 0) \
  { \
    if (ele_open == 1) { \
      ele_open = 0; \
      if (ts > 0) { \
        mark_tag = ts; \
      } \
    } else { \
      mark_tag = p; \
    } \
    attr = Qnil; \
    tag = Qnil; \
    text = 1; \
  }

/* Finish a block token of kind N: append up to p - T + 1 to `tag`, then emit it. */
#define EBLK(N, T) CAT(tag, p - T + 1); ELE(N);
88
+
89
/*
 * Ragel machine definition. The actions below run inside hpricot_scan and
 * use its locals (p, tag, attr, akey, aval, the mark_* pointers, text, ele_open).
 */
%%{
  machine hpricot_scan;

  # Start of a new markup element: flush any pending text run as a text
  # token, then reset the per-element state.
  action newEle {
    if (text == 1) {
      CAT(tag, p);
      ELE(text);
      text = 0;
    }
    attr = Qnil;
    tag = Qnil;
    mark_tag = NULL;
    ele_open = 1;
  }

  # Underscore-prefixed actions record where a span starts; the matching
  # un-prefixed actions turn the span ending at p into a Ruby string.
  action _tag { mark_tag = p; }
  action _aval { mark_aval = p; }
  action _akey { mark_akey = p; }
  action tag { SET(tag, p); }
  action tagc { SET(tag, p-1); }
  action aval { SET(aval, p); }
  # Attribute value: drop the final character when it is a quote.
  action aunq {
    if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
    else { SET(aval, p); }
  }
  action akey { SET(akey, p); }
  # XML declaration and doctype fields are stored in attr under fixed symbol keys.
  action xmlver { SET(aval, p); ATTR(ID2SYM(rb_intern("version")), aval); }
  action xmlenc { SET(aval, p); ATTR(ID2SYM(rb_intern("encoding")), aval); }
  action xmlsd { SET(aval, p); ATTR(ID2SYM(rb_intern("standalone")), aval); }
  action pubid { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
  action sysid { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }

  # Reset key/value state before scanning the next attribute.
  action new_attr {
    akey = Qnil;
    aval = Qnil;
    mark_akey = NULL;
    mark_aval = NULL;
  }

  # Commit the scanned key/value pair into attr.
  action save_attr {
    ATTR(akey, aval);
  }

  include hpricot_common "hpricot_common.rl";

}%%

%% write data nofinal;

/* Default read-buffer size in bytes; also the increment used when hpricot_scan grows the buffer. */
#define BUFSIZE 16384
139
+
140
+ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
141
+ {
142
+ VALUE ary;
143
+ if (sym == sym_text) {
144
+ raw = tag;
145
+ }
146
+ ary = rb_ary_new3(4, sym, tag, attr, raw);
147
+ if (taint) {
148
+ OBJ_TAINT(ary);
149
+ OBJ_TAINT(tag);
150
+ OBJ_TAINT(attr);
151
+ OBJ_TAINT(raw);
152
+ }
153
+ rb_yield(ary);
154
+ }
155
+
156
+ /* rb_hash_lookup() is only in Ruby 1.8.7 */
157
+ static VALUE
158
+ our_rb_hash_lookup(VALUE hash, VALUE key)
159
+ {
160
+ VALUE val;
161
+
162
+ if (!st_lookup(RHASH(hash)->tbl, key, &val)) {
163
+ return Qnil; /* without Hash#default */
164
+ }
165
+
166
+ return val;
167
+ }
168
+
169
+ static void
170
+ rb_hpricot_add(VALUE focus, VALUE ele)
171
+ {
172
+ hpricot_ele *he, *he2;
173
+ Data_Get_Struct(focus, hpricot_ele, he);
174
+ Data_Get_Struct(ele, hpricot_ele, he2);
175
+ if (NIL_P(he->children))
176
+ he->children = rb_ary_new();
177
+ rb_ary_push(he->children, ele);
178
+ he2->parent = focus;
179
+ }
180
+
181
/*
 * Tree-building state for one hpricot_scan call; allocated only when no
 * block is given.
 */
typedef struct {
  VALUE doc;    /* root Doc node; returned by hpricot_scan */
  VALUE focus;  /* node that currently receives new children */
  VALUE last;   /* most recently created node, or nil; used to merge adjacent text tokens */
  VALUE EC;     /* the Hpricot::ElementContent constant */
  unsigned char xml, strict, fixup;  /* flags from the :xml, :xhtml_strict and :fixup_tags options */
} hpricot_state;
188
+
189
+ static void
190
+ hpricot_ele_mark(hpricot_ele *he)
191
+ {
192
+ rb_gc_mark(he->tag);
193
+ rb_gc_mark(he->attr);
194
+ rb_gc_mark(he->etag);
195
+ rb_gc_mark(he->raw);
196
+ rb_gc_mark(he->parent);
197
+ rb_gc_mark(he->children);
198
+ }
199
+
200
+ static void
201
+ hpricot_ele_free(hpricot_ele *he)
202
+ {
203
+ free(he);
204
+ }
205
+
206
/*
 * Generate a setter/getter pair, hpricot_ele_set_<prop> and
 * hpricot_ele_get_<prop>, that write or read the hpricot_ele field `prop`
 * directly. The setter returns self.
 */
#define H_PROP(prop) \
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
    hpricot_ele *he; \
    Data_Get_Struct(self, hpricot_ele, he); \
    he->prop = x; \
    return self; \
  } \
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
    hpricot_ele *he; \
    Data_Get_Struct(self, hpricot_ele, he); \
    return he->prop; \
  }
218
+
219
+ #define H_ATTR(prop) \
220
+ static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
221
+ hpricot_ele *he; \
222
+ Data_Get_Struct(self, hpricot_ele, he); \
223
+ rb_hash_aset(he->attr, ID2SYM(rb_intern("" # prop)), x); \
224
+ return self; \
225
+ } \
226
+ static VALUE hpricot_ele_get_##prop(VALUE self) { \
227
+ hpricot_ele *he; \
228
+ Data_Get_Struct(self, hpricot_ele, he); \
229
+ return rb_hash_aref(he->attr, ID2SYM(rb_intern("" # prop))); \
230
+ }
231
+
232
+ H_PROP(tag);
233
+ H_PROP(attr);
234
+ H_PROP(etag);
235
+ H_PROP(parent);
236
+ H_PROP(children);
237
+ H_ATTR(encoding);
238
+ H_ATTR(version);
239
+ H_ATTR(standalone);
240
+ H_ATTR(system_id);
241
+ H_ATTR(public_id);
242
+
243
+ static VALUE
244
+ hpricot_ele_get_raw(VALUE self, VALUE x) {
245
+ hpricot_ele *he;
246
+ Data_Get_Struct(self, hpricot_ele, he);
247
+ return he->raw;
248
+ }
249
+
250
+ static VALUE
251
+ hpricot_ele_clear_raw(VALUE self)
252
+ {
253
+ hpricot_ele *he;
254
+ Data_Get_Struct(self, hpricot_ele, he);
255
+ he->raw = Qnil;
256
+ return Qtrue;
257
+ }
258
+
259
/*
 * Build a node of class `klass` from the current token inside
 * rb_hpricot_token. Expects S, sym, tag, attr, ec, raw, rawlen and ele to be
 * in scope, and declares a local `he` pointing at the new struct.
 * The raw source text is kept only for emptytag, stag, etag and doctype
 * tokens. The new node is wrapped into `ele` and recorded as S->last.
 */
#define H_ELE(klass) \
  hpricot_ele *he = ALLOC(hpricot_ele); \
  he->name = 0; \
  he->tag = tag; \
  he->attr = attr; \
  he->raw = Qnil; \
  he->EC = ec; \
  he->etag = he->parent = he->children = Qnil; \
  if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_etag || sym == sym_doctype)) { \
    he->raw = rb_str_new(raw, rawlen); \
  } \
  ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he); \
  S->last = ele
272
+
273
+ VALUE
274
+ hpricot_ele_alloc(VALUE klass)
275
+ {
276
+ VALUE ele;
277
+ hpricot_ele *he = ALLOC(hpricot_ele);
278
+ he->name = 0;
279
+ he->tag = he->attr = he->raw = he->EC = Qnil;
280
+ he->etag = he->parent = he->children = Qnil;
281
+ ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he);
282
+ return ele;
283
+ }
284
+
285
+ //
286
+ // the swift, compact parser logic. most of the complicated stuff is done
287
+ // in the lexer. this step just pairs up the start and end tags.
288
+ //
289
+ void
290
+ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw, int rawlen, int taint)
291
+ {
292
+ VALUE ele, ec = Qnil;
293
+
294
+ //
295
+ // in html mode, fix up start tags incorrectly formed as empty tags
296
+ //
297
+ if (!S->xml) {
298
+ hpricot_ele *last;
299
+ Data_Get_Struct(S->focus, hpricot_ele, last);
300
+ if (last->EC == sym_CDATA &&
301
+ (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
302
+ !(sym == sym_etag && rb_str_hash(tag) == last->name))
303
+ {
304
+ sym = sym_text;
305
+ tag = rb_str_new(raw, rawlen);
306
+ }
307
+
308
+ if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
309
+ ec = rb_hash_aref(S->EC, tag);
310
+ if (NIL_P(ec)) {
311
+ tag = rb_funcall(tag, s_downcase, 0);
312
+ ec = rb_hash_aref(S->EC, tag);
313
+ }
314
+ if (sym == sym_emptytag) {
315
+ if (ec != sym_EMPTY)
316
+ sym = sym_stag;
317
+ } else if (sym == sym_stag) {
318
+ if (ec == sym_EMPTY)
319
+ sym = sym_emptytag;
320
+ }
321
+ }
322
+ }
323
+
324
+ if (sym == sym_emptytag || sym == sym_stag) {
325
+ H_ELE(cElem);
326
+ he->name = rb_str_hash(tag);
327
+
328
+ if (!S->xml) {
329
+ VALUE match = Qnil, e = S->focus;
330
+ while (e != S->doc)
331
+ {
332
+ hpricot_ele *hee;
333
+ Data_Get_Struct(e, hpricot_ele, hee);
334
+
335
+ if (TYPE(hee->EC) == T_HASH)
336
+ {
337
+ VALUE has = our_rb_hash_lookup(hee->EC, INT2NUM(he->name));
338
+ if (has != Qnil) {
339
+ if (has == Qtrue) {
340
+ if (match == Qnil)
341
+ match = e;
342
+ } else if (has == symAllow) {
343
+ match = S->focus;
344
+ } else if (has == symDeny) {
345
+ match = Qnil;
346
+ }
347
+ }
348
+ }
349
+
350
+ e = hee->parent;
351
+ }
352
+
353
+ if (match == Qnil)
354
+ match = S->focus;
355
+ S->focus = match;
356
+ }
357
+
358
+ rb_hpricot_add(S->focus, ele);
359
+
360
+ //
361
+ // in the case of a start tag that should be empty, just
362
+ // skip the step that focuses the element. focusing moves
363
+ // us deeper into the document.
364
+ //
365
+ if (sym == sym_stag) {
366
+ if (S->xml || ec != sym_EMPTY) {
367
+ S->focus = ele;
368
+ S->last = Qnil;
369
+ }
370
+ }
371
+ } else if (sym == sym_etag) {
372
+ int name;
373
+ VALUE match = Qnil, e = S->focus;
374
+ if (S->strict) {
375
+ if (NIL_P(rb_hash_aref(S->EC, tag))) {
376
+ tag = rb_str_new2("div");
377
+ }
378
+ }
379
+
380
+ //
381
+ // another optimization will be to improve this very simple
382
+ // O(n) tag search, where n is the depth of the focused tag.
383
+ //
384
+ // (see also: the search above for fixups)
385
+ //
386
+ name = rb_str_hash(tag);
387
+ while (e != S->doc)
388
+ {
389
+ hpricot_ele *he;
390
+ Data_Get_Struct(e, hpricot_ele, he);
391
+
392
+ if (he->name == name)
393
+ {
394
+ match = e;
395
+ break;
396
+ }
397
+
398
+ e = he->parent;
399
+ }
400
+
401
+ if (NIL_P(match))
402
+ {
403
+ H_ELE(cBogusETag);
404
+ rb_hpricot_add(S->focus, ele);
405
+ }
406
+ else
407
+ {
408
+ H_ELE(cETag);
409
+ Data_Get_Struct(match, hpricot_ele, he);
410
+ he->etag = ele;
411
+ S->focus = he->parent;
412
+ S->last = Qnil;
413
+ }
414
+ } else if (sym == sym_cdata) {
415
+ H_ELE(cCData);
416
+ rb_hpricot_add(S->focus, ele);
417
+ } else if (sym == sym_comment) {
418
+ H_ELE(cComment);
419
+ rb_hpricot_add(S->focus, ele);
420
+ } else if (sym == sym_doctype) {
421
+ H_ELE(cDocType);
422
+ if (S->strict) {
423
+ rb_hash_aset(attr, ID2SYM(rb_intern("system_id")), rb_str_new2("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"));
424
+ rb_hash_aset(attr, ID2SYM(rb_intern("public_id")), rb_str_new2("-//W3C//DTD XHTML 1.0 Strict//EN"));
425
+ }
426
+ rb_hpricot_add(S->focus, ele);
427
+ } else if (sym == sym_procins) {
428
+ VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
429
+ tag = rb_reg_nth_match(1, match);
430
+ attr = rb_reg_nth_match(2, match);
431
+ {
432
+ H_ELE(cProcIns);
433
+ rb_hpricot_add(S->focus, ele);
434
+ }
435
+ } else if (sym == sym_text) {
436
+ // TODO: add raw_string as well?
437
+ if (!NIL_P(S->last) && RBASIC(S->last)->klass == cText) {
438
+ hpricot_ele *he;
439
+ Data_Get_Struct(S->last, hpricot_ele, he);
440
+ rb_str_append(he->tag, tag);
441
+ } else {
442
+ H_ELE(cText);
443
+ rb_hpricot_add(S->focus, ele);
444
+ }
445
+ } else if (sym == sym_xmldecl) {
446
+ H_ELE(cXMLDecl);
447
+ rb_hpricot_add(S->focus, ele);
448
+ }
449
+ }
450
+
451
+ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
452
+ {
453
+ int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
454
+ char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
455
+
456
+ hpricot_state *S = NULL;
457
+ VALUE port, opts;
458
+ VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
459
+ char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
460
+ int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
461
+
462
+ rb_scan_args(argc, argv, "11", &port, &opts);
463
+ taint = OBJ_TAINTED(port);
464
+ io = rb_respond_to(port, s_read);
465
+ if (!io)
466
+ {
467
+ if (rb_respond_to(port, s_to_str))
468
+ {
469
+ port = rb_funcall(port, s_to_str, 0);
470
+ StringValue(port);
471
+ }
472
+ else
473
+ {
474
+ rb_raise(rb_eArgError, "an Hpricot document must be built from an input source (a String or IO object.)");
475
+ }
476
+ }
477
+
478
+ if (TYPE(opts) != T_HASH)
479
+ opts = Qnil;
480
+
481
+ if (!rb_block_given_p())
482
+ {
483
+ hpricot_ele *he = ALLOC(hpricot_ele);
484
+ S = ALLOC(hpricot_state);
485
+ MEMZERO(he, hpricot_ele, 1);
486
+ he->tag = he->attr = he->etag = he->parent = he->children = Qnil;
487
+ S->doc = Data_Wrap_Struct(cDoc, hpricot_ele_mark, hpricot_ele_free, he);
488
+ rb_gc_register_address(&S->doc);
489
+ S->focus = S->doc;
490
+ S->last = Qnil;
491
+ S->xml = OPT(opts, xml);
492
+ S->strict = OPT(opts, xhtml_strict);
493
+ S->fixup = OPT(opts, fixup_tags);
494
+ if (S->strict) S->fixup = 1;
495
+ rb_ivar_set(S->doc, rb_intern("@options"), opts);
496
+
497
+ S->EC = rb_const_get(mHpricot, s_ElementContent);
498
+ }
499
+
500
+ buffer_size = BUFSIZE;
501
+ if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
502
+ bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
503
+ if (!NIL_P(bufsize)) {
504
+ buffer_size = NUM2INT(bufsize);
505
+ }
506
+ }
507
+
508
+ if (io)
509
+ buf = ALLOC_N(char, buffer_size);
510
+
511
+ %% write init;
512
+
513
+ while (!done) {
514
+ VALUE str;
515
+ char *p, *pe;
516
+ int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
517
+
518
+ if (io)
519
+ {
520
+ if (space == 0) {
521
+ /* We've used up the entire buffer storing an already-parsed token
522
+ * prefix that must be preserved. Likely caused by super-long attributes.
523
+ * Increase buffer size and continue */
524
+ tokstart_diff = ts - buf;
525
+ tokend_diff = te - buf;
526
+ mark_tag_diff = mark_tag - buf;
527
+ mark_akey_diff = mark_akey - buf;
528
+ mark_aval_diff = mark_aval - buf;
529
+
530
+ buffer_size += BUFSIZE;
531
+ REALLOC_N(buf, char, buffer_size);
532
+
533
+ space = buffer_size - have;
534
+
535
+ ts = buf + tokstart_diff;
536
+ te = buf + tokend_diff;
537
+ mark_tag = buf + mark_tag_diff;
538
+ mark_akey = buf + mark_akey_diff;
539
+ mark_aval = buf + mark_aval_diff;
540
+ }
541
+ p = buf + have;
542
+
543
+ str = rb_funcall(port, s_read, 1, INT2FIX(space));
544
+ len = RSTRING_LEN(str);
545
+ memcpy(p, StringValuePtr(str), len);
546
+ }
547
+ else
548
+ {
549
+ p = RSTRING_PTR(port);
550
+ len = RSTRING_LEN(port) + 1;
551
+ done = 1;
552
+ }
553
+
554
+ nread += len;
555
+
556
+ /* If this is the last buffer, tack on an EOF. */
557
+ if (io && len < space) {
558
+ p[len++] = 0;
559
+ done = 1;
560
+ }
561
+
562
+ pe = p + len;
563
+ %% write exec;
564
+
565
+ if (cs == hpricot_scan_error) {
566
+ if (buf != NULL)
567
+ free(buf);
568
+ if (!NIL_P(tag))
569
+ {
570
+ rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
571
+ }
572
+ else
573
+ {
574
+ rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
575
+ }
576
+ }
577
+
578
+ if (done && ele_open)
579
+ {
580
+ ele_open = 0;
581
+ if (ts > 0) {
582
+ mark_tag = ts;
583
+ ts = 0;
584
+ text = 1;
585
+ }
586
+ }
587
+
588
+ if (ts == 0)
589
+ {
590
+ have = 0;
591
+ /* text nodes have no ts because each byte is parsed alone */
592
+ if (mark_tag != NULL && text == 1)
593
+ {
594
+ if (done)
595
+ {
596
+ if (mark_tag < p-1)
597
+ {
598
+ CAT(tag, p-1);
599
+ ELE(text);
600
+ }
601
+ }
602
+ else
603
+ {
604
+ CAT(tag, p);
605
+ }
606
+ }
607
+ if (io)
608
+ mark_tag = buf;
609
+ else
610
+ mark_tag = RSTRING_PTR(port);
611
+ }
612
+ else if (io)
613
+ {
614
+ have = pe - ts;
615
+ memmove(buf, ts, have);
616
+ SLIDE(tag);
617
+ SLIDE(akey);
618
+ SLIDE(aval);
619
+ te = buf + (te - ts);
620
+ ts = buf;
621
+ }
622
+ }
623
+
624
+ if (buf != NULL)
625
+ free(buf);
626
+
627
+ if (S != NULL)
628
+ {
629
+ VALUE doc = S->doc;
630
+ rb_gc_unregister_address(&S->doc);
631
+ free(S);
632
+ return doc;
633
+ }
634
+
635
+ return Qnil;
636
+ }
637
+
638
/*
 * Extension entry point: defines the Hpricot module, its singleton methods,
 * the node classes with their accessors, and interns every symbol and ID the
 * scanner uses.
 */
void Init_hpricot_scan()
{
  mHpricot = rb_define_module("Hpricot");
  /* Hpricot.buffer_size / Hpricot.buffer_size=; hpricot_scan reads @buffer_size from self */
  rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
  rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
  rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
  rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);

  /* Doc is the tree root; it is not derived from BaseEle */
  cDoc = rb_define_class_under(mHpricot, "Doc", rb_cObject);
  rb_define_alloc_func(cDoc, hpricot_ele_alloc);
  rb_define_method(cDoc, "children", hpricot_ele_get_children, 0);
  rb_define_method(cDoc, "children=", hpricot_ele_set_children, 1);

  /* BaseEle: common superclass of the node classes below */
  cBaseEle = rb_define_class_under(mHpricot, "BaseEle", rb_cObject);
  rb_define_alloc_func(cBaseEle, hpricot_ele_alloc);
  rb_define_method(cBaseEle, "raw_string", hpricot_ele_get_raw, 0);
  rb_define_method(cBaseEle, "clear_raw", hpricot_ele_clear_raw, 0);
  rb_define_method(cBaseEle, "parent", hpricot_ele_get_parent, 0);
  rb_define_method(cBaseEle, "parent=", hpricot_ele_set_parent, 1);
  /* CData, Comment and Text expose the struct's `tag` field as "content" */
  cCData = rb_define_class_under(mHpricot, "CData", cBaseEle);
  rb_define_method(cCData, "content", hpricot_ele_get_tag, 0);
  rb_define_method(cCData, "content=", hpricot_ele_set_tag, 1);
  cComment = rb_define_class_under(mHpricot, "Comment", cBaseEle);
  rb_define_method(cComment, "content", hpricot_ele_get_tag, 0);
  rb_define_method(cComment, "content=", hpricot_ele_set_tag, 1);
  /* DocType: `tag` is the target; the ids are stored in the attr hash */
  cDocType = rb_define_class_under(mHpricot, "DocType", cBaseEle);
  rb_define_method(cDocType, "target", hpricot_ele_get_tag, 0);
  rb_define_method(cDocType, "target=", hpricot_ele_set_tag, 1);
  rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
  rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
  rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
  rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
  /* Elem: `tag` is the name, `attr` the raw attributes */
  cElem = rb_define_class_under(mHpricot, "Elem", cBaseEle);
  rb_define_method(cElem, "raw_attributes", hpricot_ele_get_attr, 0);
  rb_define_method(cElem, "raw_attributes=", hpricot_ele_set_attr, 1);
  rb_define_method(cElem, "children", hpricot_ele_get_children, 0);
  rb_define_method(cElem, "children=", hpricot_ele_set_children, 1);
  rb_define_method(cElem, "etag", hpricot_ele_get_etag, 0);
  rb_define_method(cElem, "etag=", hpricot_ele_set_etag, 1);
  rb_define_method(cElem, "name", hpricot_ele_get_tag, 0);
  rb_define_method(cElem, "name=", hpricot_ele_set_tag, 1);
  cETag = rb_define_class_under(mHpricot, "ETag", cBaseEle);
  rb_define_method(cETag, "name", hpricot_ele_get_tag, 0);
  rb_define_method(cETag, "name=", hpricot_ele_set_tag, 1);
  /* BogusETag: an end tag for which no open element was found */
  cBogusETag = rb_define_class_under(mHpricot, "BogusETag", cETag);
  cText = rb_define_class_under(mHpricot, "Text", cBaseEle);
  rb_define_method(cText, "content", hpricot_ele_get_tag, 0);
  rb_define_method(cText, "content=", hpricot_ele_set_tag, 1);
  /* XMLDecl: every field is stored in the attr hash */
  cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", cBaseEle);
  rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
  rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
  rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
  rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
  rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
  rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
  /* ProcIns: `tag` is the target, `attr` holds the content */
  cProcIns = rb_define_class_under(mHpricot, "ProcIns", cBaseEle);
  rb_define_method(cProcIns, "target", hpricot_ele_get_tag, 0);
  rb_define_method(cProcIns, "target=", hpricot_ele_set_tag, 1);
  rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
  rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);

  /* interned IDs and symbols used by the scanner and the tree builder */
  s_ElementContent = rb_intern("ElementContent");
  symAllow = ID2SYM(rb_intern("allow"));
  symDeny = ID2SYM(rb_intern("deny"));
  s_downcase = rb_intern("downcase");
  s_new = rb_intern("new");
  s_parent = rb_intern("parent");
  s_read = rb_intern("read");
  s_to_str = rb_intern("to_str");
  iv_parent = rb_intern("parent");
  sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
  sym_doctype = ID2SYM(rb_intern("doctype"));
  sym_procins = ID2SYM(rb_intern("procins"));
  sym_stag = ID2SYM(rb_intern("stag"));
  sym_etag = ID2SYM(rb_intern("etag"));
  sym_emptytag = ID2SYM(rb_intern("emptytag"));
  sym_comment = ID2SYM(rb_intern("comment"));
  sym_cdata = ID2SYM(rb_intern("cdata"));
  sym_text = ID2SYM(rb_intern("text"));
  sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
  sym_CDATA = ID2SYM(rb_intern("CDATA"));

  /* Hpricot::ProcInsParse: group 1 is the target, group 2 the rest of the instruction */
  rb_const_set(mHpricot, rb_intern("ProcInsParse"),
    reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
}