adamh-hpricot 0.6.168

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/CHANGELOG +62 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +259 -0
  5. data/ext/fast_xs/FastXsService.java +1018 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +194 -0
  8. data/ext/hpricot_scan/extconf.rb +6 -0
  9. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  10. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  11. data/ext/hpricot_scan/hpricot_scan.java.rl +373 -0
  12. data/ext/hpricot_scan/hpricot_scan.rl +649 -0
  13. data/extras/mingw-rbconfig.rb +176 -0
  14. data/lib/hpricot/blankslate.rb +63 -0
  15. data/lib/hpricot/builder.rb +209 -0
  16. data/lib/hpricot/elements.rb +510 -0
  17. data/lib/hpricot/htmlinfo.rb +672 -0
  18. data/lib/hpricot/inspect.rb +103 -0
  19. data/lib/hpricot/modules.rb +38 -0
  20. data/lib/hpricot/parse.rb +36 -0
  21. data/lib/hpricot/tag.rb +186 -0
  22. data/lib/hpricot/tags.rb +164 -0
  23. data/lib/hpricot/traverse.rb +838 -0
  24. data/lib/hpricot/xchar.rb +94 -0
  25. data/lib/hpricot.rb +26 -0
  26. data/test/files/basic.xhtml +17 -0
  27. data/test/files/boingboing.html +2266 -0
  28. data/test/files/cy0.html +3653 -0
  29. data/test/files/immob.html +400 -0
  30. data/test/files/pace_application.html +1320 -0
  31. data/test/files/tenderlove.html +16 -0
  32. data/test/files/uswebgen.html +220 -0
  33. data/test/files/utf8.html +1054 -0
  34. data/test/files/week9.html +1723 -0
  35. data/test/files/why.xml +19 -0
  36. data/test/load_files.rb +7 -0
  37. data/test/test_alter.rb +77 -0
  38. data/test/test_builder.rb +37 -0
  39. data/test/test_parser.rb +400 -0
  40. data/test/test_paths.rb +25 -0
  41. data/test/test_preserved.rb +66 -0
  42. data/test/test_xml.rb +28 -0
  43. metadata +107 -0
@@ -0,0 +1,649 @@
1
+ /*
2
+ * hpricot_scan.rl
3
+ *
4
+ * $Author: why $
5
+ * $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
6
+ *
7
+ * Copyright (C) 2006 why the lucky stiff
8
+ */
9
+ #include <ruby.h>
10
+
/*
 * Compatibility accessors for Ruby headers that do not provide RARRAY_LEN.
 * Note that all three macros are defined under the single RARRAY_LEN guard,
 * so RSTRING_LEN and RSTRING_PTR are only supplied when RARRAY_LEN is missing.
 */
11
+ #ifndef RARRAY_LEN
12
+ #define RARRAY_LEN(arr) RARRAY(arr)->len
13
+ #define RSTRING_LEN(str) RSTRING(str)->len
14
+ #define RSTRING_PTR(str) RSTRING(str)->ptr
15
+ #endif
16
+
/*
 * Forward declaration of the CSS entry point, which is not defined in this file.
 * NOTE(review): this prototype takes five VALUEs, but Init_hpricot_scan registers
 * hpricot_css with argc 3 (i.e. self plus three arguments) -- confirm against the
 * actual definition.
 */
17
+ VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
18
+
/* Suffix appended to the ParseError messages raised by hpricot_scan. */
19
+ #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
20
+
/*
 * Token-kind symbols, module/class handles, interned method names and the
 * processing-instruction regexp.  All of them are assigned in Init_hpricot_scan.
 */
21
+ static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
22
+ sym_cdata, sym_text, sym_EMPTY;
23
+ static VALUE mHpricot, rb_eHpricotParseError;
24
+ static VALUE cBaseEle, cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cETag, cText,
25
+ cXMLDecl, cProcIns;
26
+ static ID s_ElementContent;
27
+ static ID s_downcase, s_new, s_parent, s_read, s_to_str;
28
+ static ID iv_parent;
29
+ static VALUE reProcInsParse;
30
+
31
+ typedef struct {
32
+ int name;
33
+ VALUE tag, attr, etag, raw;
34
+ VALUE parent, children;
35
+ } hpricot_ele;
36
+
/* True when opts is non-nil and opts[:key] is truthy; key is given as a bare identifier. */
#define OPT(opts, key) \
  (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern(#key)))))
38
+
/*
 * ELE(N): emit a token of kind sym_N.
 *
 * Expands inside the scanner and relies on its locals (ts, te, text, ele_open,
 * tag, attr, taint, S).  Fires only when a token span exists (te > ts) or a text
 * run is pending (text == 1); in both cases ele_open and text are cleared.
 * The raw source span [ts, te) is captured unless the token is cdata, text,
 * procins or comment, or ts is null.  With a block given the token is yielded
 * through rb_yield_tokens, otherwise it is fed to rb_hpricot_token.
 *
 * NOTE(review): in the block branch raw_string is built but Qnil is what gets
 * passed to rb_yield_tokens -- looks unintended, confirm before changing.
 */
39
+ #define ELE(N) \
40
+ if (te > ts || text == 1) { \
41
+ char *raw = NULL; \
42
+ int rawlen = 0; \
43
+ ele_open = 0; text = 0; \
44
+ if (ts != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
45
+ raw = ts; rawlen = te - ts; \
46
+ } \
47
+ if (rb_block_given_p()) { \
48
+ VALUE raw_string = Qnil; \
49
+ if (raw != NULL) raw_string = rb_str_new(raw, rawlen); \
50
+ rb_yield_tokens(sym_##N, tag, attr, Qnil, taint); \
51
+ } else \
52
+ rb_hpricot_token(S, sym_##N, tag, attr, raw, rawlen, taint); \
53
+ }
54
+
/*
 * SET(N, E): assign to N a new Ruby string holding the bytes from mark_N up to
 * (not including) E.  N becomes the empty string when mark_N is NULL or equal
 * to E; when E lies before mark_N, N is left unchanged.
 * Expands to a bare if/else-if chain, so it must be used as a full statement.
 */
55
+ #define SET(N, E) \
56
+ if (mark_##N == NULL || E == mark_##N) \
57
+ N = rb_str_new2(""); \
58
+ else if (E > mark_##N) \
59
+ N = rb_str_new(mark_##N, E - mark_##N);
60
+
/*
 * CAT(N, E): append the bytes from mark_N up to E onto N.  When N is still nil
 * this behaves like SET(N, E); otherwise the bytes are concatenated in place.
 */
61
+ #define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
62
+
/*
 * SLIDE(N): rebase mark_N so it keeps the same offset from the start of buf as
 * it had from ts.  Used by hpricot_scan right after the bytes starting at ts
 * are moved to the front of buf.  Marks at or before ts are not touched.
 */
63
+ #define SLIDE(N) if ( mark_##N > ts ) mark_##N = buf + (mark_##N - ts);
64
+
/*
 * ATTR(K, V): store V under key K in the current attr hash, creating the hash
 * on first use.  Does nothing when K is nil.
 */
65
+ #define ATTR(K, V) \
66
+ if (!NIL_P(K)) { \
67
+ if (NIL_P(attr)) attr = rb_hash_new(); \
68
+ rb_hash_aset(attr, K, V); \
69
+ }
70
+
/*
 * TEXT_PASS(): begin a text run unless one is already in progress (text != 0).
 * If an element was open, ele_open is cleared and mark_tag is moved back to ts
 * (only when ts is non-null); otherwise mark_tag is set to the current position
 * p.  attr and tag are reset to nil and text is set to 1.
 * Not invoked in this file -- presumably used from hpricot_common.rl.
 */
71
+ #define TEXT_PASS() \
72
+ if (text == 0) \
73
+ { \
74
+ if (ele_open == 1) { \
75
+ ele_open = 0; \
76
+ if (ts > 0) { \
77
+ mark_tag = ts; \
78
+ } \
79
+ } else { \
80
+ mark_tag = p; \
81
+ } \
82
+ attr = Qnil; \
83
+ tag = Qnil; \
84
+ text = 1; \
85
+ }
86
+
/*
 * EBLK(N, T): append the bytes from mark_tag up to p - T + 1 onto tag, then
 * emit an N token via ELE.  Not invoked in this file -- presumably used from
 * hpricot_common.rl, with T being the length of the block terminator (TODO confirm).
 */
87
+ #define EBLK(N, T) CAT(tag, p - T + 1); ELE(N);
88
+
/*
 * Ragel machine definition for the scanner.  The actions below only record
 * marks (mark_tag / mark_akey / mark_aval) and build tag, akey, aval and attr
 * through the SET / CAT / ATTR / ELE macros; the grammar that triggers them is
 * included from hpricot_common.rl.  "write data nofinal" emits the generated
 * state tables.
 */
89
+ %%{
90
+ machine hpricot_scan;
91
+
92
+ action newEle {
93
+ if (text == 1) {
94
+ CAT(tag, p);
95
+ ELE(text);
96
+ text = 0;
97
+ }
98
+ attr = Qnil;
99
+ tag = Qnil;
100
+ mark_tag = NULL;
101
+ ele_open = 1;
102
+ }
103
+
104
+ action _tag { mark_tag = p; }
105
+ action _aval { mark_aval = p; }
106
+ action _akey { mark_akey = p; }
107
+ action tag { SET(tag, p); }
108
+ action tagc { SET(tag, p-1); }
109
+ action aval { SET(aval, p); }
110
+ action aunq {
111
+ if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
112
+ else { SET(aval, p); }
113
+ }
114
+ action akey { SET(akey, p); }
115
+ action xmlver { SET(aval, p); ATTR(ID2SYM(rb_intern("version")), aval); }
116
+ action xmlenc { SET(aval, p); ATTR(ID2SYM(rb_intern("encoding")), aval); }
117
+ action xmlsd { SET(aval, p); ATTR(ID2SYM(rb_intern("standalone")), aval); }
118
+ action pubid { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
119
+ action sysid { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }
120
+
121
+ action new_attr {
122
+ akey = Qnil;
123
+ aval = Qnil;
124
+ mark_akey = NULL;
125
+ mark_aval = NULL;
126
+ }
127
+
128
+ action save_attr {
129
+ ATTR(akey, aval);
130
+ }
131
+
132
+ include hpricot_common "hpricot_common.rl";
133
+
134
+ }%%
135
+
136
+ %% write data nofinal;
137
+
/*
 * Default size in bytes of the scan buffer, and the amount by which
 * hpricot_scan grows the buffer when a single token fills it.
 */
138
+ #define BUFSIZE 16384
139
+
140
+ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
141
+ {
142
+ VALUE ary;
143
+ if (sym == sym_text) {
144
+ raw = tag;
145
+ }
146
+ ary = rb_ary_new3(4, sym, tag, attr, raw);
147
+ if (taint) {
148
+ OBJ_TAINT(ary);
149
+ OBJ_TAINT(tag);
150
+ OBJ_TAINT(attr);
151
+ OBJ_TAINT(raw);
152
+ }
153
+ rb_yield(ary);
154
+ }
155
+
156
+ static void
157
+ rb_hpricot_add(VALUE focus, VALUE ele)
158
+ {
159
+ hpricot_ele *he, *he2;
160
+ Data_Get_Struct(focus, hpricot_ele, he);
161
+ Data_Get_Struct(ele, hpricot_ele, he2);
162
+ if (NIL_P(he->children))
163
+ he->children = rb_ary_new();
164
+ rb_ary_push(he->children, ele);
165
+ he2->parent = focus;
166
+ }
167
+
168
+ typedef struct {
169
+ VALUE doc;
170
+ VALUE focus;
171
+ VALUE last;
172
+ VALUE EC;
173
+ unsigned char xml, strict, fixup;
174
+ } hpricot_state;
175
+
176
+ static void
177
+ hpricot_ele_mark(hpricot_ele *he)
178
+ {
179
+ rb_gc_mark(he->tag);
180
+ rb_gc_mark(he->attr);
181
+ rb_gc_mark(he->etag);
182
+ rb_gc_mark(he->raw);
183
+ rb_gc_mark(he->parent);
184
+ rb_gc_mark(he->children);
185
+ }
186
+
187
+ static void
188
+ hpricot_ele_free(hpricot_ele *he)
189
+ {
190
+ free(he);
191
+ }
192
+
/*
 * H_PROP(prop): generates a Ruby-callable accessor pair for the hpricot_ele
 * struct member `prop`:
 *   hpricot_ele_set_<prop>(self, x)  stores x in the member and returns self;
 *   hpricot_ele_get_<prop>(self)     returns the member.
 */
193
+ #define H_PROP(prop) \
194
+ static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
195
+ hpricot_ele *he; \
196
+ Data_Get_Struct(self, hpricot_ele, he); \
197
+ he->prop = x; \
198
+ return self; \
199
+ } \
200
+ static VALUE hpricot_ele_get_##prop(VALUE self) { \
201
+ hpricot_ele *he; \
202
+ Data_Get_Struct(self, hpricot_ele, he); \
203
+ return he->prop; \
204
+ }
205
+
/*
 * H_ATTR(prop): generates a Ruby-callable accessor pair for the entry :prop of
 * the node's attr hash:
 *   hpricot_ele_set_<prop>(self, x)  stores x under :prop and returns self;
 *   hpricot_ele_get_<prop>(self)     returns the value stored under :prop.
 *
 * attr is nil for nodes created through hpricot_ele_alloc and for tokens parsed
 * without any attributes, so the hash is created on first write and a read of
 * a missing hash yields nil instead of passing nil to the hash API.
 */
#define H_ATTR(prop) \
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
    hpricot_ele *he; \
    Data_Get_Struct(self, hpricot_ele, he); \
    if (NIL_P(he->attr)) he->attr = rb_hash_new(); \
    rb_hash_aset(he->attr, ID2SYM(rb_intern("" # prop)), x); \
    return self; \
  } \
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
    hpricot_ele *he; \
    Data_Get_Struct(self, hpricot_ele, he); \
    if (NIL_P(he->attr)) return Qnil; \
    return rb_hash_aref(he->attr, ID2SYM(rb_intern("" # prop))); \
  }
218
+
/*
 * Accessor instantiations.  H_PROP entries read/write struct members directly;
 * H_ATTR entries read/write keys of the attr hash.  The generated functions
 * are bound to Ruby methods in Init_hpricot_scan.
 */
219
+ H_PROP(tag);
220
+ H_PROP(attr);
221
+ H_PROP(etag);
222
+ H_PROP(parent);
223
+ H_PROP(children);
224
+ H_ATTR(encoding);
225
+ H_ATTR(version);
226
+ H_ATTR(standalone);
227
+ H_ATTR(system_id);
228
+ H_ATTR(public_id);
229
+
230
+ static VALUE
231
+ hpricot_ele_get_raw(VALUE self, VALUE x) {
232
+ hpricot_ele *he;
233
+ Data_Get_Struct(self, hpricot_ele, he);
234
+ return he->raw;
235
+ }
236
+
237
+ static VALUE
238
+ hpricot_ele_clear_raw(VALUE self)
239
+ {
240
+ hpricot_ele *he;
241
+ Data_Get_Struct(self, hpricot_ele, he);
242
+ he->raw = Qnil;
243
+ return Qtrue;
244
+ }
245
+
/*
 * H_ELE(klass): allocate a node of class klass for the current token.
 *
 * Expands inside rb_hpricot_token and uses its locals: tag, attr, raw, rawlen,
 * sym, ele and S.  Declares a local `he`, fills it from tag/attr, copies the
 * raw source text only for emptytag, stag, etag and doctype tokens (and only
 * when raw is non-NULL), wraps the struct into `ele`, and records it as
 * S->last.  The expansion ends without a semicolon; the caller supplies it.
 */
246
+ #define H_ELE(klass) \
247
+ hpricot_ele *he = ALLOC(hpricot_ele); \
248
+ he->name = 0; \
249
+ he->tag = tag; \
250
+ he->attr = attr; \
251
+ he->raw = Qnil; \
252
+ he->etag = he->parent = he->children = Qnil; \
253
+ if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_etag || sym == sym_doctype)) { \
254
+ he->raw = rb_str_new(raw, rawlen); \
255
+ } \
256
+ ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he); \
257
+ S->last = ele
258
+
259
+ VALUE
260
+ hpricot_ele_alloc(VALUE klass)
261
+ {
262
+ VALUE ele;
263
+ hpricot_ele *he = ALLOC(hpricot_ele);
264
+ he->name = 0;
265
+ he->tag = he->attr = he->raw = Qnil;
266
+ he->etag = he->parent = he->children = Qnil;
267
+ ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he);
268
+ return ele;
269
+ }
270
+
271
+ //
272
+ // the swift, compact parser logic. most of the complicated stuff is done
273
+ // in the lexer. this step just pairs up the start and end tags.
274
+ //
275
+ VALUE
276
+ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw, int rawlen, int taint)
277
+ {
278
+ VALUE ele, ec = Qnil;
279
+
280
+ //
281
+ // in html mode, fix up start tags incorrectly formed as empty tags
282
+ //
283
+ if (!S->xml) {
284
+ if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
285
+ ec = rb_hash_aref(S->EC, tag);
286
+ if (NIL_P(ec)) {
287
+ tag = rb_funcall(tag, s_downcase, 0);
288
+ ec = rb_hash_aref(S->EC, tag);
289
+ }
290
+ if (sym == sym_emptytag) {
291
+ if (ec != sym_EMPTY)
292
+ sym = sym_stag;
293
+ } else if (sym == sym_stag) {
294
+ if (ec == sym_EMPTY)
295
+ sym = sym_emptytag;
296
+ }
297
+ }
298
+ }
299
+
300
+ if (sym == sym_emptytag || sym == sym_stag) {
301
+ H_ELE(cElem);
302
+ he->name = rb_str_hash(tag);
303
+ rb_hpricot_add(S->focus, ele);
304
+
305
+ //
306
+ // in the case of a start tag that should be empty, just
307
+ // skip the step that focuses the element. focusing moves
308
+ // us deeper into the document.
309
+ //
310
+ if (sym == sym_stag) {
311
+ if (S->xml || ec != sym_EMPTY) {
312
+ S->focus = ele;
313
+ S->last = Qnil;
314
+ }
315
+ }
316
+ } else if (sym == sym_etag) {
317
+ int name;
318
+ VALUE match = Qnil, e = S->focus;
319
+ if (S->strict) {
320
+ if (NIL_P(rb_hash_aref(S->EC, tag))) {
321
+ tag = rb_str_new2("div");
322
+ }
323
+ }
324
+
325
+ //
326
+ // another optimization will be to improve this very simple
327
+ // O(n) tag search, where n is the depth of the focused tag.
328
+ //
329
+ name = rb_str_hash(tag);
330
+ while (e != S->doc)
331
+ {
332
+ hpricot_ele *he;
333
+ Data_Get_Struct(e, hpricot_ele, he);
334
+
335
+ if (he->name == name)
336
+ {
337
+ match = e;
338
+ break;
339
+ }
340
+
341
+ e = he->parent;
342
+ }
343
+
344
+ if (NIL_P(match))
345
+ {
346
+ H_ELE(cBogusETag);
347
+ rb_hpricot_add(S->focus, ele);
348
+ }
349
+ else
350
+ {
351
+ H_ELE(cETag);
352
+ Data_Get_Struct(match, hpricot_ele, he);
353
+ he->etag = ele;
354
+ S->focus = he->parent;
355
+ S->last = Qnil;
356
+ }
357
+ } else if (sym == sym_cdata) {
358
+ H_ELE(cCData);
359
+ rb_hpricot_add(S->focus, ele);
360
+ } else if (sym == sym_comment) {
361
+ H_ELE(cComment);
362
+ rb_hpricot_add(S->focus, ele);
363
+ } else if (sym == sym_doctype) {
364
+ H_ELE(cDocType);
365
+ if (S->strict) {
366
+ rb_hash_aset(attr, ID2SYM(rb_intern("system_id")), rb_str_new2("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"));
367
+ rb_hash_aset(attr, ID2SYM(rb_intern("public_id")), rb_str_new2("-//W3C//DTD XHTML 1.0 Strict//EN"));
368
+ }
369
+ rb_hpricot_add(S->focus, ele);
370
+ } else if (sym == sym_procins) {
371
+ VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
372
+ tag = rb_reg_nth_match(1, match);
373
+ attr = rb_reg_nth_match(2, match);
374
+ H_ELE(cProcIns);
375
+ rb_hpricot_add(S->focus, ele);
376
+ } else if (sym == sym_text) {
377
+ // TODO: add raw_string as well?
378
+ if (!NIL_P(S->last) && RBASIC(S->last)->klass == cText) {
379
+ hpricot_ele *he;
380
+ Data_Get_Struct(S->last, hpricot_ele, he);
381
+ rb_str_append(he->tag, tag);
382
+ } else {
383
+ H_ELE(cText);
384
+ rb_hpricot_add(S->focus, ele);
385
+ }
386
+ } else if (sym == sym_xmldecl) {
387
+ H_ELE(cXMLDecl);
388
+ rb_hpricot_add(S->focus, ele);
389
+ }
390
+ }
391
+
392
+ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
393
+ {
394
+ int cs, act, have = 0, nread = 0, curline = 1, text = 0;
395
+ char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
396
+
397
+ hpricot_state *S = NULL;
398
+ VALUE port, opts;
399
+ VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
400
+ char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
401
+ int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
402
+
403
+ rb_scan_args(argc, argv, "11", &port, &opts);
404
+ taint = OBJ_TAINTED( port );
405
+ if ( !rb_respond_to( port, s_read ) )
406
+ {
407
+ if ( rb_respond_to( port, s_to_str ) )
408
+ {
409
+ port = rb_funcall( port, s_to_str, 0 );
410
+ StringValue(port);
411
+ }
412
+ else
413
+ {
414
+ rb_raise(rb_eArgError, "an Hpricot document must be built from an input source (a String or IO object.)");
415
+ }
416
+ }
417
+
418
+ if (TYPE(opts) != T_HASH)
419
+ opts = Qnil;
420
+
421
+ if (!rb_block_given_p())
422
+ {
423
+ S = ALLOC(hpricot_state);
424
+ hpricot_ele *he = ALLOC(hpricot_ele);
425
+ MEMZERO(he, hpricot_ele, 1);
426
+ he->tag = he->attr = he->etag = he->parent = he->children = Qnil;
427
+ S->doc = Data_Wrap_Struct(cDoc, hpricot_ele_mark, hpricot_ele_free, he);
428
+ rb_gc_register_address(&S->doc);
429
+ S->focus = S->doc;
430
+ S->last = Qnil;
431
+ S->xml = OPT(opts, xml);
432
+ S->strict = OPT(opts, xhtml_strict);
433
+ S->fixup = OPT(opts, fixup_tags);
434
+ if (S->strict) S->fixup = 1;
435
+
436
+ S->EC = rb_const_get(mHpricot, s_ElementContent);
437
+ }
438
+
439
+ buffer_size = BUFSIZE;
440
+ if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
441
+ bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
442
+ if (!NIL_P(bufsize)) {
443
+ buffer_size = NUM2INT(bufsize);
444
+ }
445
+ }
446
+ buf = ALLOC_N(char, buffer_size);
447
+
448
+ %% write init;
449
+
450
+ while ( !done ) {
451
+ VALUE str;
452
+ char *p, *pe;
453
+ int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
454
+
455
+ if ( space == 0 ) {
456
+ /* We've used up the entire buffer storing an already-parsed token
457
+ * prefix that must be preserved. Likely caused by super-long attributes.
458
+ * Increase buffer size and continue */
459
+ tokstart_diff = ts - buf;
460
+ tokend_diff = te - buf;
461
+ mark_tag_diff = mark_tag - buf;
462
+ mark_akey_diff = mark_akey - buf;
463
+ mark_aval_diff = mark_aval - buf;
464
+
465
+ buffer_size += BUFSIZE;
466
+ buf = REALLOC_N(buf, char, buffer_size);
467
+
468
+ space = buffer_size - have;
469
+
470
+ ts= buf + tokstart_diff;
471
+ te = buf + tokend_diff;
472
+ mark_tag = buf + mark_tag_diff;
473
+ mark_akey = buf + mark_akey_diff;
474
+ mark_aval = buf + mark_aval_diff;
475
+ }
476
+ p = buf + have;
477
+
478
+ if ( rb_respond_to( port, s_read ) )
479
+ {
480
+ str = rb_funcall( port, s_read, 1, INT2FIX(space) );
481
+ }
482
+ else
483
+ {
484
+ str = rb_str_substr( port, nread, space );
485
+ }
486
+
487
+ StringValue(str);
488
+ memcpy( p, RSTRING_PTR(str), RSTRING_LEN(str) );
489
+ len = RSTRING_LEN(str);
490
+ nread += len;
491
+
492
+ /* If this is the last buffer, tack on an EOF. */
493
+ if ( len < space ) {
494
+ p[len++] = 0;
495
+ done = 1;
496
+ }
497
+
498
+ pe = p + len;
499
+ %% write exec;
500
+
501
+ if ( cs == hpricot_scan_error ) {
502
+ free(buf);
503
+ if ( !NIL_P(tag) )
504
+ {
505
+ rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
506
+ }
507
+ else
508
+ {
509
+ rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
510
+ }
511
+ }
512
+
513
+ if ( done && ele_open )
514
+ {
515
+ ele_open = 0;
516
+ if (ts > 0) {
517
+ mark_tag = ts;
518
+ ts = 0;
519
+ text = 1;
520
+ }
521
+ }
522
+
523
+ if ( ts == 0 )
524
+ {
525
+ have = 0;
526
+ /* text nodes have no ts because each byte is parsed alone */
527
+ if ( mark_tag != NULL && text == 1 )
528
+ {
529
+ if (done)
530
+ {
531
+ if (mark_tag < p-1)
532
+ {
533
+ CAT(tag, p-1);
534
+ ELE(text);
535
+ }
536
+ }
537
+ else
538
+ {
539
+ CAT(tag, p);
540
+ }
541
+ }
542
+ mark_tag = buf;
543
+ }
544
+ else
545
+ {
546
+ have = pe - ts;
547
+ memmove( buf, ts, have );
548
+ SLIDE(tag);
549
+ SLIDE(akey);
550
+ SLIDE(aval);
551
+ te = buf + (te - ts);
552
+ ts = buf;
553
+ }
554
+ }
555
+ free(buf);
556
+
557
+ if (S != NULL)
558
+ {
559
+ VALUE doc = S->doc;
560
+ rb_gc_unregister_address(&S->doc);
561
+ free(S);
562
+ return doc;
563
+ }
564
+
565
+ return Qnil;
566
+ }
567
+
568
+ void Init_hpricot_scan()
569
+ {
570
+ mHpricot = rb_define_module("Hpricot");
571
+ rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
572
+ rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
573
+ rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
574
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
575
+
576
+ cDoc = rb_define_class_under(mHpricot, "Doc", rb_cObject);
577
+ rb_define_alloc_func(cDoc, hpricot_ele_alloc);
578
+ rb_define_method(cDoc, "children", hpricot_ele_get_children, 0);
579
+ rb_define_method(cDoc, "children=", hpricot_ele_set_children, 1);
580
+
581
+ cBaseEle = rb_define_class_under(mHpricot, "BaseEle", rb_cObject);
582
+ rb_define_alloc_func(cBaseEle, hpricot_ele_alloc);
583
+ rb_define_method(cBaseEle, "raw_string", hpricot_ele_get_raw, 0);
584
+ rb_define_method(cBaseEle, "clear_raw", hpricot_ele_clear_raw, 0);
585
+ rb_define_method(cBaseEle, "parent", hpricot_ele_get_parent, 0);
586
+ rb_define_method(cBaseEle, "parent=", hpricot_ele_set_parent, 1);
587
+ cCData = rb_define_class_under(mHpricot, "CData", cBaseEle);
588
+ rb_define_method(cCData, "content", hpricot_ele_get_tag, 0);
589
+ rb_define_method(cCData, "content=", hpricot_ele_set_tag, 1);
590
+ cComment = rb_define_class_under(mHpricot, "Comment", cBaseEle);
591
+ rb_define_method(cComment, "content", hpricot_ele_get_tag, 0);
592
+ rb_define_method(cComment, "content=", hpricot_ele_set_tag, 1);
593
+ cDocType = rb_define_class_under(mHpricot, "DocType", cBaseEle);
594
+ rb_define_method(cDocType, "target", hpricot_ele_get_tag, 0);
595
+ rb_define_method(cDocType, "target=", hpricot_ele_set_tag, 1);
596
+ rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
597
+ rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
598
+ rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
599
+ rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
600
+ cElem = rb_define_class_under(mHpricot, "Elem", cBaseEle);
601
+ rb_define_method(cElem, "raw_attributes", hpricot_ele_get_attr, 0);
602
+ rb_define_method(cElem, "raw_attributes=", hpricot_ele_set_attr, 1);
603
+ rb_define_method(cElem, "children", hpricot_ele_get_children, 0);
604
+ rb_define_method(cElem, "children=", hpricot_ele_set_children, 1);
605
+ rb_define_method(cElem, "etag", hpricot_ele_get_etag, 0);
606
+ rb_define_method(cElem, "etag=", hpricot_ele_set_etag, 1);
607
+ rb_define_method(cElem, "name", hpricot_ele_get_tag, 0);
608
+ rb_define_method(cElem, "name=", hpricot_ele_set_tag, 1);
609
+ cETag = rb_define_class_under(mHpricot, "ETag", cBaseEle);
610
+ rb_define_method(cETag, "name", hpricot_ele_get_tag, 0);
611
+ rb_define_method(cETag, "name=", hpricot_ele_set_tag, 1);
612
+ cBogusETag = rb_define_class_under(mHpricot, "BogusETag", cETag);
613
+ cText = rb_define_class_under(mHpricot, "Text", cBaseEle);
614
+ rb_define_method(cText, "content", hpricot_ele_get_tag, 0);
615
+ rb_define_method(cText, "content=", hpricot_ele_set_tag, 1);
616
+ cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", cBaseEle);
617
+ rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
618
+ rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
619
+ rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
620
+ rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
621
+ rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
622
+ rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
623
+ cProcIns = rb_define_class_under(mHpricot, "ProcIns", cBaseEle);
624
+ rb_define_method(cProcIns, "target", hpricot_ele_get_tag, 0);
625
+ rb_define_method(cProcIns, "target=", hpricot_ele_set_tag, 1);
626
+ rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
627
+ rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
628
+
629
+ s_ElementContent = rb_intern("ElementContent");
630
+ s_downcase = rb_intern("downcase");
631
+ s_new = rb_intern("new");
632
+ s_parent = rb_intern("parent");
633
+ s_read = rb_intern("read");
634
+ s_to_str = rb_intern("to_str");
635
+ iv_parent = rb_intern("parent");
636
+ sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
637
+ sym_doctype = ID2SYM(rb_intern("doctype"));
638
+ sym_procins = ID2SYM(rb_intern("procins"));
639
+ sym_stag = ID2SYM(rb_intern("stag"));
640
+ sym_etag = ID2SYM(rb_intern("etag"));
641
+ sym_emptytag = ID2SYM(rb_intern("emptytag"));
642
+ sym_comment = ID2SYM(rb_intern("comment"));
643
+ sym_cdata = ID2SYM(rb_intern("cdata"));
644
+ sym_text = ID2SYM(rb_intern("text"));
645
+ sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
646
+
647
+ rb_const_set(mHpricot, rb_intern("ProcInsParse"),
648
+ reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
649
+ }