hpricot 0.6.164 → 0.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -264,7 +264,11 @@ IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
264
264
  /* We've used up the entire buffer storing an already-parsed token
265
265
  * prefix that must be preserved. Likely caused by super-long attributes.
266
266
  * See ticket #13. */
267
- rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <" + tag.toString() + ">, starting on line "+curline+".");
267
+ buffer_size += BUFSIZE;
268
+ char[] new_buf = new char[buffer_size];
269
+ System.arraycopy(buf, 0, new_buf, 0, buf.length);
270
+ buf = new_buf;
271
+ space = buffer_size - have;
268
272
  }
269
273
 
270
274
  if (port.respondsTo("read")) {
@@ -14,21 +14,42 @@
14
14
  #define RSTRING_PTR(str) RSTRING(str)->ptr
15
15
  #endif
16
16
 
17
+ VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
18
+
17
19
  #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
18
20
 
19
21
  static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
20
- sym_cdata, sym_text;
21
- static VALUE rb_eHpricotParseError;
22
- static ID s_read, s_to_str;
22
+ sym_cdata, sym_text, sym_EMPTY, sym_CDATA;
23
+ static VALUE mHpricot, rb_eHpricotParseError;
24
+ static VALUE cBaseEle, cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cETag, cText,
25
+ cXMLDecl, cProcIns, symAllow, symDeny;
26
+ static ID s_ElementContent;
27
+ static ID s_downcase, s_new, s_parent, s_read, s_to_str;
28
+ static ID iv_parent;
29
+ static VALUE reProcInsParse;
30
+
31
+ typedef struct {
32
+ int name;
33
+ VALUE tag, attr, etag, raw, EC;
34
+ VALUE parent, children;
35
+ } hpricot_ele;
36
+
37
/*
 * OPT(opts, key): non-zero when `opts` is not nil and opts[:key] is truthy.
 * `key` is written as a bare identifier and is stringified into the Symbol name.
 */
#define OPT(opts, key) \
  (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern(#key)))))
23
38
 
24
39
  #define ELE(N) \
25
40
  if (te > ts || text == 1) { \
26
- VALUE raw_string = Qnil; \
41
+ char *raw = NULL; \
42
+ int rawlen = 0; \
27
43
  ele_open = 0; text = 0; \
28
44
  if (ts != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
29
- raw_string = rb_str_new(ts, te-ts); \
45
+ raw = ts; rawlen = te - ts; \
30
46
  } \
31
- rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
47
+ if (rb_block_given_p()) { \
48
+ VALUE raw_string = Qnil; \
49
+ if (raw != NULL) raw_string = rb_str_new(raw, rawlen); \
50
+ rb_yield_tokens(sym_##N, tag, attr, Qnil, taint); \
51
+ } else \
52
+ rb_hpricot_token(S, sym_##N, tag, attr, raw, rawlen, taint); \
32
53
  }
33
54
 
34
55
  #define SET(N, E) \
@@ -39,7 +60,7 @@ static ID s_read, s_to_str;
39
60
 
40
61
  #define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
41
62
 
42
- #define SLIDE(N) if ( mark_##N > ts ) mark_##N = buf + (mark_##N - ts);
63
+ #define SLIDE(N) if (mark_##N > ts) mark_##N = buf + (mark_##N - ts);
43
64
 
44
65
  #define ATTR(K, V) \
45
66
  if (!NIL_P(K)) { \
@@ -91,11 +112,11 @@ static ID s_read, s_to_str;
91
112
  else { SET(aval, p); }
92
113
  }
93
114
  action akey { SET(akey, p); }
94
- action xmlver { SET(aval, p); ATTR(rb_str_new2("version"), aval); }
95
- action xmlenc { SET(aval, p); ATTR(rb_str_new2("encoding"), aval); }
96
- action xmlsd { SET(aval, p); ATTR(rb_str_new2("standalone"), aval); }
97
- action pubid { SET(aval, p); ATTR(rb_str_new2("public_id"), aval); }
98
- action sysid { SET(aval, p); ATTR(rb_str_new2("system_id"), aval); }
115
+ action xmlver { SET(aval, p); ATTR(ID2SYM(rb_intern("version")), aval); }
116
+ action xmlenc { SET(aval, p); ATTR(ID2SYM(rb_intern("encoding")), aval); }
117
+ action xmlsd { SET(aval, p); ATTR(ID2SYM(rb_intern("standalone")), aval); }
118
+ action pubid { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
119
+ action sysid { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }
99
120
 
100
121
  action new_attr {
101
122
  akey = Qnil;
@@ -132,29 +153,350 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
132
153
  rb_yield(ary);
133
154
  }
134
155
 
135
- VALUE hpricot_scan(VALUE self, VALUE port)
156
+ /* rb_hash_lookup() is only in Ruby 1.8.7 */
157
+ static VALUE
158
+ our_rb_hash_lookup(VALUE hash, VALUE key)
159
+ {
160
+ VALUE val;
161
+
162
+ if (!st_lookup(RHASH(hash)->tbl, key, &val)) {
163
+ return Qnil; /* without Hash#default */
164
+ }
165
+
166
+ return val;
167
+ }
168
+
169
+ static void
170
+ rb_hpricot_add(VALUE focus, VALUE ele)
171
+ {
172
+ hpricot_ele *he, *he2;
173
+ Data_Get_Struct(focus, hpricot_ele, he);
174
+ Data_Get_Struct(ele, hpricot_ele, he2);
175
+ if (NIL_P(he->children))
176
+ he->children = rb_ary_new();
177
+ rb_ary_push(he->children, ele);
178
+ he2->parent = focus;
179
+ }
180
+
181
+ typedef struct {
182
+ VALUE doc;
183
+ VALUE focus;
184
+ VALUE last;
185
+ VALUE EC;
186
+ unsigned char xml, strict, fixup;
187
+ } hpricot_state;
188
+
189
+ static void
190
+ hpricot_ele_mark(hpricot_ele *he)
191
+ {
192
+ rb_gc_mark(he->tag);
193
+ rb_gc_mark(he->attr);
194
+ rb_gc_mark(he->etag);
195
+ rb_gc_mark(he->raw);
196
+ rb_gc_mark(he->parent);
197
+ rb_gc_mark(he->children);
198
+ }
199
+
200
+ static void
201
+ hpricot_ele_free(hpricot_ele *he)
202
+ {
203
+ free(he);
204
+ }
205
+
206
/*
 * H_PROP(prop): define the accessor pair for a VALUE field of hpricot_ele.
 *   hpricot_ele_get_<prop>(self)     returns the field.
 *   hpricot_ele_set_<prop>(self, x)  stores x in the field and returns self.
 */
#define H_PROP(prop) \
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
    hpricot_ele *node; \
    Data_Get_Struct(self, hpricot_ele, node); \
    node->prop = x; \
    return self; \
  } \
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
    hpricot_ele *node; \
    Data_Get_Struct(self, hpricot_ele, node); \
    return node->prop; \
  }
218
+
219
/*
 * H_ATTR(prop): define the accessor pair for an entry of the node's attr Hash,
 * keyed by the Symbol :<prop>.
 *   hpricot_ele_set_<prop>(self, x)  stores x under the key and returns self.
 *   hpricot_ele_get_<prop>(self)     returns the stored value, or nil.
 *
 * A node made by hpricot_ele_alloc() starts with attr == Qnil, so the setter
 * creates the Hash on demand and the getter answers nil instead of handing
 * Qnil to the Hash API.
 */
#define H_ATTR(prop) \
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
    hpricot_ele *he; \
    Data_Get_Struct(self, hpricot_ele, he); \
    if (NIL_P(he->attr)) \
      he->attr = rb_hash_new(); \
    rb_hash_aset(he->attr, ID2SYM(rb_intern("" # prop)), x); \
    return self; \
  } \
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
    hpricot_ele *he; \
    Data_Get_Struct(self, hpricot_ele, he); \
    if (NIL_P(he->attr)) \
      return Qnil; \
    return rb_hash_aref(he->attr, ID2SYM(rb_intern("" # prop))); \
  }
231
+
232
+ H_PROP(tag);
233
+ H_PROP(attr);
234
+ H_PROP(etag);
235
+ H_PROP(parent);
236
+ H_PROP(children);
237
+ H_ATTR(encoding);
238
+ H_ATTR(version);
239
+ H_ATTR(standalone);
240
+ H_ATTR(system_id);
241
+ H_ATTR(public_id);
242
+
243
+ static VALUE
244
+ hpricot_ele_get_raw(VALUE self, VALUE x) {
245
+ hpricot_ele *he;
246
+ Data_Get_Struct(self, hpricot_ele, he);
247
+ return he->raw;
248
+ }
249
+
250
+ static VALUE
251
+ hpricot_ele_clear_raw(VALUE self)
252
+ {
253
+ hpricot_ele *he;
254
+ Data_Get_Struct(self, hpricot_ele, he);
255
+ he->raw = Qnil;
256
+ return Qtrue;
257
+ }
258
+
259
/*
 * H_ELE(klass): allocate a node struct, fill it from the current token and
 * wrap it as an instance of `klass`.
 *
 * Declares `he` in the current scope and assigns `ele`; reads tag, attr, ec,
 * raw, rawlen, sym and S from the enclosing scope.  The raw source text is
 * kept only for start, empty and end tags and for doctypes.  The new object
 * also becomes S->last.  The expansion has no trailing semicolon; the caller
 * supplies it.
 */
#define H_ELE(klass) \
  hpricot_ele *he = ALLOC(hpricot_ele); \
  he->name = 0; \
  he->tag = tag; \
  he->attr = attr; \
  he->EC = ec; \
  he->etag = Qnil; \
  he->parent = Qnil; \
  he->children = Qnil; \
  he->raw = (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_etag || sym == sym_doctype)) \
    ? rb_str_new(raw, rawlen) \
    : Qnil; \
  ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he); \
  S->last = ele
272
+
273
+ VALUE
274
+ hpricot_ele_alloc(VALUE klass)
275
+ {
276
+ VALUE ele;
277
+ hpricot_ele *he = ALLOC(hpricot_ele);
278
+ he->name = 0;
279
+ he->tag = he->attr = he->raw = he->EC = Qnil;
280
+ he->etag = he->parent = he->children = Qnil;
281
+ ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he);
282
+ return ele;
283
+ }
284
+
285
+ //
286
+ // the swift, compact parser logic. most of the complicated stuff is done
287
+ // in the lexer. this step just pairs up the start and end tags.
288
+ //
289
+ void
290
+ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw, int rawlen, int taint)
291
+ {
292
+ VALUE ele, ec = Qnil;
293
+
294
+ //
295
+ // in html mode, fix up start tags incorrectly formed as empty tags
296
+ //
297
+ if (!S->xml) {
298
+ hpricot_ele *last;
299
+ Data_Get_Struct(S->focus, hpricot_ele, last);
300
+ if (last->EC == sym_CDATA &&
301
+ (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
302
+ !(sym == sym_etag && rb_str_hash(tag) == last->name))
303
+ {
304
+ sym = sym_text;
305
+ tag = rb_str_new(raw, rawlen);
306
+ }
307
+
308
+ if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
309
+ ec = rb_hash_aref(S->EC, tag);
310
+ if (NIL_P(ec)) {
311
+ tag = rb_funcall(tag, s_downcase, 0);
312
+ ec = rb_hash_aref(S->EC, tag);
313
+ }
314
+ if (sym == sym_emptytag) {
315
+ if (ec != sym_EMPTY)
316
+ sym = sym_stag;
317
+ } else if (sym == sym_stag) {
318
+ if (ec == sym_EMPTY)
319
+ sym = sym_emptytag;
320
+ }
321
+ }
322
+ }
323
+
324
+ if (sym == sym_emptytag || sym == sym_stag) {
325
+ H_ELE(cElem);
326
+ he->name = rb_str_hash(tag);
327
+
328
+ if (!S->xml) {
329
+ VALUE match = Qnil, e = S->focus;
330
+ while (e != S->doc)
331
+ {
332
+ hpricot_ele *hee;
333
+ Data_Get_Struct(e, hpricot_ele, hee);
334
+
335
+ if (TYPE(hee->EC) == T_HASH)
336
+ {
337
+ VALUE has = our_rb_hash_lookup(hee->EC, INT2NUM(he->name));
338
+ if (has != Qnil) {
339
+ if (has == Qtrue) {
340
+ if (match == Qnil)
341
+ match = e;
342
+ } else if (has == symAllow) {
343
+ match = S->focus;
344
+ } else if (has == symDeny) {
345
+ match = Qnil;
346
+ }
347
+ }
348
+ }
349
+
350
+ e = hee->parent;
351
+ }
352
+
353
+ if (match == Qnil)
354
+ match = S->focus;
355
+ S->focus = match;
356
+ }
357
+
358
+ rb_hpricot_add(S->focus, ele);
359
+
360
+ //
361
+ // in the case of a start tag that should be empty, just
362
+ // skip the step that focuses the element. focusing moves
363
+ // us deeper into the document.
364
+ //
365
+ if (sym == sym_stag) {
366
+ if (S->xml || ec != sym_EMPTY) {
367
+ S->focus = ele;
368
+ S->last = Qnil;
369
+ }
370
+ }
371
+ } else if (sym == sym_etag) {
372
+ int name;
373
+ VALUE match = Qnil, e = S->focus;
374
+ if (S->strict) {
375
+ if (NIL_P(rb_hash_aref(S->EC, tag))) {
376
+ tag = rb_str_new2("div");
377
+ }
378
+ }
379
+
380
+ //
381
+ // another optimization will be to improve this very simple
382
+ // O(n) tag search, where n is the depth of the focused tag.
383
+ //
384
+ // (see also: the search above for fixups)
385
+ //
386
+ name = rb_str_hash(tag);
387
+ while (e != S->doc)
388
+ {
389
+ hpricot_ele *he;
390
+ Data_Get_Struct(e, hpricot_ele, he);
391
+
392
+ if (he->name == name)
393
+ {
394
+ match = e;
395
+ break;
396
+ }
397
+
398
+ e = he->parent;
399
+ }
400
+
401
+ if (NIL_P(match))
402
+ {
403
+ H_ELE(cBogusETag);
404
+ rb_hpricot_add(S->focus, ele);
405
+ }
406
+ else
407
+ {
408
+ H_ELE(cETag);
409
+ Data_Get_Struct(match, hpricot_ele, he);
410
+ he->etag = ele;
411
+ S->focus = he->parent;
412
+ S->last = Qnil;
413
+ }
414
+ } else if (sym == sym_cdata) {
415
+ H_ELE(cCData);
416
+ rb_hpricot_add(S->focus, ele);
417
+ } else if (sym == sym_comment) {
418
+ H_ELE(cComment);
419
+ rb_hpricot_add(S->focus, ele);
420
+ } else if (sym == sym_doctype) {
421
+ H_ELE(cDocType);
422
+ if (S->strict) {
423
+ rb_hash_aset(attr, ID2SYM(rb_intern("system_id")), rb_str_new2("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"));
424
+ rb_hash_aset(attr, ID2SYM(rb_intern("public_id")), rb_str_new2("-//W3C//DTD XHTML 1.0 Strict//EN"));
425
+ }
426
+ rb_hpricot_add(S->focus, ele);
427
+ } else if (sym == sym_procins) {
428
+ VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
429
+ tag = rb_reg_nth_match(1, match);
430
+ attr = rb_reg_nth_match(2, match);
431
+ {
432
+ H_ELE(cProcIns);
433
+ rb_hpricot_add(S->focus, ele);
434
+ }
435
+ } else if (sym == sym_text) {
436
+ // TODO: add raw_string as well?
437
+ if (!NIL_P(S->last) && RBASIC(S->last)->klass == cText) {
438
+ hpricot_ele *he;
439
+ Data_Get_Struct(S->last, hpricot_ele, he);
440
+ rb_str_append(he->tag, tag);
441
+ } else {
442
+ H_ELE(cText);
443
+ rb_hpricot_add(S->focus, ele);
444
+ }
445
+ } else if (sym == sym_xmldecl) {
446
+ H_ELE(cXMLDecl);
447
+ rb_hpricot_add(S->focus, ele);
448
+ }
449
+ }
450
+
451
+ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
136
452
  {
137
- int cs, act, have = 0, nread = 0, curline = 1, text = 0;
453
+ int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
138
454
  char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
139
455
 
456
+ hpricot_state *S = NULL;
457
+ VALUE port, opts;
140
458
  VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
141
459
  char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
142
- int done = 0, ele_open = 0, buffer_size = 0;
460
+ int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
143
461
 
144
- int taint = OBJ_TAINTED( port );
145
- if ( !rb_respond_to( port, s_read ) )
462
+ rb_scan_args(argc, argv, "11", &port, &opts);
463
+ taint = OBJ_TAINTED(port);
464
+ io = rb_respond_to(port, s_read);
465
+ if (!io)
146
466
  {
147
- if ( rb_respond_to( port, s_to_str ) )
467
+ if (rb_respond_to(port, s_to_str))
148
468
  {
149
- port = rb_funcall( port, s_to_str, 0 );
469
+ port = rb_funcall(port, s_to_str, 0);
150
470
  StringValue(port);
151
471
  }
152
472
  else
153
473
  {
154
- rb_raise( rb_eArgError, "bad Hpricot argument, String or IO only please." );
474
+ rb_raise(rb_eArgError, "an Hpricot document must be built from an input source (a String or IO object.)");
155
475
  }
156
476
  }
157
477
 
478
+ if (TYPE(opts) != T_HASH)
479
+ opts = Qnil;
480
+
481
+ if (!rb_block_given_p())
482
+ {
483
+ hpricot_ele *he = ALLOC(hpricot_ele);
484
+ S = ALLOC(hpricot_state);
485
+ MEMZERO(he, hpricot_ele, 1);
486
+ he->tag = he->attr = he->etag = he->parent = he->children = Qnil;
487
+ S->doc = Data_Wrap_Struct(cDoc, hpricot_ele_mark, hpricot_ele_free, he);
488
+ rb_gc_register_address(&S->doc);
489
+ S->focus = S->doc;
490
+ S->last = Qnil;
491
+ S->xml = OPT(opts, xml);
492
+ S->strict = OPT(opts, xhtml_strict);
493
+ S->fixup = OPT(opts, fixup_tags);
494
+ if (S->strict) S->fixup = 1;
495
+ rb_ivar_set(S->doc, rb_intern("@options"), opts);
496
+
497
+ S->EC = rb_const_get(mHpricot, s_ElementContent);
498
+ }
499
+
158
500
  buffer_size = BUFSIZE;
159
501
  if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
160
502
  bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
@@ -162,38 +504,57 @@ VALUE hpricot_scan(VALUE self, VALUE port)
162
504
  buffer_size = NUM2INT(bufsize);
163
505
  }
164
506
  }
165
- buf = ALLOC_N(char, buffer_size);
507
+
508
+ if (io)
509
+ buf = ALLOC_N(char, buffer_size);
166
510
 
167
511
  %% write init;
168
512
 
169
- while ( !done ) {
513
+ while (!done) {
170
514
  VALUE str;
171
- char *p = buf + have, *pe;
172
- int len, space = buffer_size - have;
173
-
174
- if ( space == 0 ) {
175
- /* We've used up the entire buffer storing an already-parsed token
176
- * prefix that must be preserved. Likely caused by super-long attributes.
177
- * See ticket #13. */
178
- rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING_PTR(tag), curline);
179
- }
515
+ char *p, *pe;
516
+ int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
180
517
 
181
- if ( rb_respond_to( port, s_read ) )
518
+ if (io)
182
519
  {
183
- str = rb_funcall( port, s_read, 1, INT2FIX(space) );
520
+ if (space == 0) {
521
+ /* We've used up the entire buffer storing an already-parsed token
522
+ * prefix that must be preserved. Likely caused by super-long attributes.
523
+ * Increase buffer size and continue */
524
+ tokstart_diff = ts - buf;
525
+ tokend_diff = te - buf;
526
+ mark_tag_diff = mark_tag - buf;
527
+ mark_akey_diff = mark_akey - buf;
528
+ mark_aval_diff = mark_aval - buf;
529
+
530
+ buffer_size += BUFSIZE;
531
+ REALLOC_N(buf, char, buffer_size);
532
+
533
+ space = buffer_size - have;
534
+
535
+ ts = buf + tokstart_diff;
536
+ te = buf + tokend_diff;
537
+ mark_tag = buf + mark_tag_diff;
538
+ mark_akey = buf + mark_akey_diff;
539
+ mark_aval = buf + mark_aval_diff;
540
+ }
541
+ p = buf + have;
542
+
543
+ str = rb_funcall(port, s_read, 1, INT2FIX(space));
544
+ len = RSTRING_LEN(str);
545
+ memcpy(p, StringValuePtr(str), len);
184
546
  }
185
547
  else
186
548
  {
187
- str = rb_str_substr( port, nread, space );
549
+ p = RSTRING_PTR(port);
550
+ len = RSTRING_LEN(port) + 1;
551
+ done = 1;
188
552
  }
189
553
 
190
- StringValue(str);
191
- memcpy( p, RSTRING_PTR(str), RSTRING_LEN(str) );
192
- len = RSTRING_LEN(str);
193
554
  nread += len;
194
555
 
195
556
  /* If this is the last buffer, tack on an EOF. */
196
- if ( len < space ) {
557
+ if (io && len < space) {
197
558
  p[len++] = 0;
198
559
  done = 1;
199
560
  }
@@ -201,9 +562,10 @@ VALUE hpricot_scan(VALUE self, VALUE port)
201
562
  pe = p + len;
202
563
  %% write exec;
203
564
 
204
- if ( cs == hpricot_scan_error ) {
205
- free(buf);
206
- if ( !NIL_P(tag) )
565
+ if (cs == hpricot_scan_error) {
566
+ if (buf != NULL)
567
+ free(buf);
568
+ if (!NIL_P(tag))
207
569
  {
208
570
  rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
209
571
  }
@@ -213,7 +575,7 @@ VALUE hpricot_scan(VALUE self, VALUE port)
213
575
  }
214
576
  }
215
577
 
216
- if ( done && ele_open )
578
+ if (done && ele_open)
217
579
  {
218
580
  ele_open = 0;
219
581
  if (ts > 0) {
@@ -223,11 +585,11 @@ VALUE hpricot_scan(VALUE self, VALUE port)
223
585
  }
224
586
  }
225
587
 
226
- if ( ts == 0 )
588
+ if (ts == 0)
227
589
  {
228
590
  have = 0;
229
591
  /* text nodes have no ts because each byte is parsed alone */
230
- if ( mark_tag != NULL && text == 1 )
592
+ if (mark_tag != NULL && text == 1)
231
593
  {
232
594
  if (done)
233
595
  {
@@ -242,12 +604,15 @@ VALUE hpricot_scan(VALUE self, VALUE port)
242
604
  CAT(tag, p);
243
605
  }
244
606
  }
245
- mark_tag = buf;
607
+ if (io)
608
+ mark_tag = buf;
609
+ else
610
+ mark_tag = RSTRING_PTR(port);
246
611
  }
247
- else
612
+ else if (io)
248
613
  {
249
614
  have = pe - ts;
250
- memmove( buf, ts, have );
615
+ memmove(buf, ts, have);
251
616
  SLIDE(tag);
252
617
  SLIDE(akey);
253
618
  SLIDE(aval);
@@ -255,18 +620,91 @@ VALUE hpricot_scan(VALUE self, VALUE port)
255
620
  ts = buf;
256
621
  }
257
622
  }
258
- free(buf);
623
+
624
+ if (buf != NULL)
625
+ free(buf);
626
+
627
+ if (S != NULL)
628
+ {
629
+ VALUE doc = S->doc;
630
+ rb_gc_unregister_address(&S->doc);
631
+ free(S);
632
+ return doc;
633
+ }
634
+
635
+ return Qnil;
259
636
  }
260
637
 
261
638
  void Init_hpricot_scan()
262
639
  {
263
- VALUE mHpricot = rb_define_module("Hpricot");
640
+ mHpricot = rb_define_module("Hpricot");
264
641
  rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
265
- rb_define_singleton_method(mHpricot, "scan", hpricot_scan, 1);
642
+ rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
643
+ rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
266
644
  rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
267
645
 
646
+ cDoc = rb_define_class_under(mHpricot, "Doc", rb_cObject);
647
+ rb_define_alloc_func(cDoc, hpricot_ele_alloc);
648
+ rb_define_method(cDoc, "children", hpricot_ele_get_children, 0);
649
+ rb_define_method(cDoc, "children=", hpricot_ele_set_children, 1);
650
+
651
+ cBaseEle = rb_define_class_under(mHpricot, "BaseEle", rb_cObject);
652
+ rb_define_alloc_func(cBaseEle, hpricot_ele_alloc);
653
+ rb_define_method(cBaseEle, "raw_string", hpricot_ele_get_raw, 0);
654
+ rb_define_method(cBaseEle, "clear_raw", hpricot_ele_clear_raw, 0);
655
+ rb_define_method(cBaseEle, "parent", hpricot_ele_get_parent, 0);
656
+ rb_define_method(cBaseEle, "parent=", hpricot_ele_set_parent, 1);
657
+ cCData = rb_define_class_under(mHpricot, "CData", cBaseEle);
658
+ rb_define_method(cCData, "content", hpricot_ele_get_tag, 0);
659
+ rb_define_method(cCData, "content=", hpricot_ele_set_tag, 1);
660
+ cComment = rb_define_class_under(mHpricot, "Comment", cBaseEle);
661
+ rb_define_method(cComment, "content", hpricot_ele_get_tag, 0);
662
+ rb_define_method(cComment, "content=", hpricot_ele_set_tag, 1);
663
+ cDocType = rb_define_class_under(mHpricot, "DocType", cBaseEle);
664
+ rb_define_method(cDocType, "target", hpricot_ele_get_tag, 0);
665
+ rb_define_method(cDocType, "target=", hpricot_ele_set_tag, 1);
666
+ rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
667
+ rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
668
+ rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
669
+ rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
670
+ cElem = rb_define_class_under(mHpricot, "Elem", cBaseEle);
671
+ rb_define_method(cElem, "raw_attributes", hpricot_ele_get_attr, 0);
672
+ rb_define_method(cElem, "raw_attributes=", hpricot_ele_set_attr, 1);
673
+ rb_define_method(cElem, "children", hpricot_ele_get_children, 0);
674
+ rb_define_method(cElem, "children=", hpricot_ele_set_children, 1);
675
+ rb_define_method(cElem, "etag", hpricot_ele_get_etag, 0);
676
+ rb_define_method(cElem, "etag=", hpricot_ele_set_etag, 1);
677
+ rb_define_method(cElem, "name", hpricot_ele_get_tag, 0);
678
+ rb_define_method(cElem, "name=", hpricot_ele_set_tag, 1);
679
+ cETag = rb_define_class_under(mHpricot, "ETag", cBaseEle);
680
+ rb_define_method(cETag, "name", hpricot_ele_get_tag, 0);
681
+ rb_define_method(cETag, "name=", hpricot_ele_set_tag, 1);
682
+ cBogusETag = rb_define_class_under(mHpricot, "BogusETag", cETag);
683
+ cText = rb_define_class_under(mHpricot, "Text", cBaseEle);
684
+ rb_define_method(cText, "content", hpricot_ele_get_tag, 0);
685
+ rb_define_method(cText, "content=", hpricot_ele_set_tag, 1);
686
+ cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", cBaseEle);
687
+ rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
688
+ rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
689
+ rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
690
+ rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
691
+ rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
692
+ rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
693
+ cProcIns = rb_define_class_under(mHpricot, "ProcIns", cBaseEle);
694
+ rb_define_method(cProcIns, "target", hpricot_ele_get_tag, 0);
695
+ rb_define_method(cProcIns, "target=", hpricot_ele_set_tag, 1);
696
+ rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
697
+ rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
698
+
699
+ s_ElementContent = rb_intern("ElementContent");
700
+ symAllow = ID2SYM(rb_intern("allow"));
701
+ symDeny = ID2SYM(rb_intern("deny"));
702
+ s_downcase = rb_intern("downcase");
703
+ s_new = rb_intern("new");
704
+ s_parent = rb_intern("parent");
268
705
  s_read = rb_intern("read");
269
706
  s_to_str = rb_intern("to_str");
707
+ iv_parent = rb_intern("parent");
270
708
  sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
271
709
  sym_doctype = ID2SYM(rb_intern("doctype"));
272
710
  sym_procins = ID2SYM(rb_intern("procins"));
@@ -276,4 +714,9 @@ void Init_hpricot_scan()
276
714
  sym_comment = ID2SYM(rb_intern("comment"));
277
715
  sym_cdata = ID2SYM(rb_intern("cdata"));
278
716
  sym_text = ID2SYM(rb_intern("text"));
717
+ sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
718
+ sym_CDATA = ID2SYM(rb_intern("CDATA"));
719
+
720
+ rb_const_set(mHpricot, rb_intern("ProcInsParse"),
721
+ reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
279
722
  }