hpricot 0.6.164 → 0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -264,7 +264,11 @@ IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
264
264
  /* We've used up the entire buffer storing an already-parsed token
265
265
  * prefix that must be preserved. Likely caused by super-long attributes.
266
266
  * See ticket #13. */
267
- rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <" + tag.toString() + ">, starting on line "+curline+".");
267
+ buffer_size += BUFSIZE;
268
+ char[] new_buf = new char[buffer_size];
269
+ System.arraycopy(buf, 0, new_buf, 0, buf.length);
270
+ buf = new_buf;
271
+ space = buffer_size - have;
268
272
  }
269
273
 
270
274
  if (port.respondsTo("read")) {
@@ -14,21 +14,42 @@
14
14
  #define RSTRING_PTR(str) RSTRING(str)->ptr
15
15
  #endif
16
16
 
17
+ VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
18
+
17
19
  #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
18
20
 
19
21
  static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
20
- sym_cdata, sym_text;
21
- static VALUE rb_eHpricotParseError;
22
- static ID s_read, s_to_str;
22
+ sym_cdata, sym_text, sym_EMPTY, sym_CDATA;
23
+ static VALUE mHpricot, rb_eHpricotParseError;
24
+ static VALUE cBaseEle, cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cETag, cText,
25
+ cXMLDecl, cProcIns, symAllow, symDeny;
26
+ static ID s_ElementContent;
27
+ static ID s_downcase, s_new, s_parent, s_read, s_to_str;
28
+ static ID iv_parent;
29
+ static VALUE reProcInsParse;
30
+
31
+ typedef struct {
32
+ int name;
33
+ VALUE tag, attr, etag, raw, EC;
34
+ VALUE parent, children;
35
+ } hpricot_ele;
36
+
37
+ #define OPT(opts, key) (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("" # key)))))
23
38
 
24
39
  #define ELE(N) \
25
40
  if (te > ts || text == 1) { \
26
- VALUE raw_string = Qnil; \
41
+ char *raw = NULL; \
42
+ int rawlen = 0; \
27
43
  ele_open = 0; text = 0; \
28
44
  if (ts != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
29
- raw_string = rb_str_new(ts, te-ts); \
45
+ raw = ts; rawlen = te - ts; \
30
46
  } \
31
- rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
47
+ if (rb_block_given_p()) { \
48
+ VALUE raw_string = Qnil; \
49
+ if (raw != NULL) raw_string = rb_str_new(raw, rawlen); \
50
+ rb_yield_tokens(sym_##N, tag, attr, Qnil, taint); \
51
+ } else \
52
+ rb_hpricot_token(S, sym_##N, tag, attr, raw, rawlen, taint); \
32
53
  }
33
54
 
34
55
  #define SET(N, E) \
@@ -39,7 +60,7 @@ static ID s_read, s_to_str;
39
60
 
40
61
  #define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
41
62
 
42
- #define SLIDE(N) if ( mark_##N > ts ) mark_##N = buf + (mark_##N - ts);
63
+ #define SLIDE(N) if (mark_##N > ts) mark_##N = buf + (mark_##N - ts);
43
64
 
44
65
  #define ATTR(K, V) \
45
66
  if (!NIL_P(K)) { \
@@ -91,11 +112,11 @@ static ID s_read, s_to_str;
91
112
  else { SET(aval, p); }
92
113
  }
93
114
  action akey { SET(akey, p); }
94
- action xmlver { SET(aval, p); ATTR(rb_str_new2("version"), aval); }
95
- action xmlenc { SET(aval, p); ATTR(rb_str_new2("encoding"), aval); }
96
- action xmlsd { SET(aval, p); ATTR(rb_str_new2("standalone"), aval); }
97
- action pubid { SET(aval, p); ATTR(rb_str_new2("public_id"), aval); }
98
- action sysid { SET(aval, p); ATTR(rb_str_new2("system_id"), aval); }
115
+ action xmlver { SET(aval, p); ATTR(ID2SYM(rb_intern("version")), aval); }
116
+ action xmlenc { SET(aval, p); ATTR(ID2SYM(rb_intern("encoding")), aval); }
117
+ action xmlsd { SET(aval, p); ATTR(ID2SYM(rb_intern("standalone")), aval); }
118
+ action pubid { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
119
+ action sysid { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }
99
120
 
100
121
  action new_attr {
101
122
  akey = Qnil;
@@ -132,29 +153,350 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
132
153
  rb_yield(ary);
133
154
  }
134
155
 
135
- VALUE hpricot_scan(VALUE self, VALUE port)
156
+ /* rb_hash_lookup() is only in Ruby 1.8.7 */
157
+ static VALUE
158
+ our_rb_hash_lookup(VALUE hash, VALUE key)
159
+ {
160
+ VALUE val;
161
+
162
+ if (!st_lookup(RHASH(hash)->tbl, key, &val)) {
163
+ return Qnil; /* without Hash#default */
164
+ }
165
+
166
+ return val;
167
+ }
168
+
169
+ static void
170
+ rb_hpricot_add(VALUE focus, VALUE ele)
171
+ {
172
+ hpricot_ele *he, *he2;
173
+ Data_Get_Struct(focus, hpricot_ele, he);
174
+ Data_Get_Struct(ele, hpricot_ele, he2);
175
+ if (NIL_P(he->children))
176
+ he->children = rb_ary_new();
177
+ rb_ary_push(he->children, ele);
178
+ he2->parent = focus;
179
+ }
180
+
181
+ typedef struct {
182
+ VALUE doc;
183
+ VALUE focus;
184
+ VALUE last;
185
+ VALUE EC;
186
+ unsigned char xml, strict, fixup;
187
+ } hpricot_state;
188
+
189
+ static void
190
+ hpricot_ele_mark(hpricot_ele *he)
191
+ {
192
+ rb_gc_mark(he->tag);
193
+ rb_gc_mark(he->attr);
194
+ rb_gc_mark(he->etag);
195
+ rb_gc_mark(he->raw);
196
+ rb_gc_mark(he->parent);
197
+ rb_gc_mark(he->children);
198
+ }
199
+
200
+ static void
201
+ hpricot_ele_free(hpricot_ele *he)
202
+ {
203
+ free(he);
204
+ }
205
+
206
+ #define H_PROP(prop) \
207
+ static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
208
+ hpricot_ele *he; \
209
+ Data_Get_Struct(self, hpricot_ele, he); \
210
+ he->prop = x; \
211
+ return self; \
212
+ } \
213
+ static VALUE hpricot_ele_get_##prop(VALUE self) { \
214
+ hpricot_ele *he; \
215
+ Data_Get_Struct(self, hpricot_ele, he); \
216
+ return he->prop; \
217
+ }
218
+
219
+ #define H_ATTR(prop) \
220
+ static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
221
+ hpricot_ele *he; \
222
+ Data_Get_Struct(self, hpricot_ele, he); \
223
+ rb_hash_aset(he->attr, ID2SYM(rb_intern("" # prop)), x); \
224
+ return self; \
225
+ } \
226
+ static VALUE hpricot_ele_get_##prop(VALUE self) { \
227
+ hpricot_ele *he; \
228
+ Data_Get_Struct(self, hpricot_ele, he); \
229
+ return rb_hash_aref(he->attr, ID2SYM(rb_intern("" # prop))); \
230
+ }
231
+
232
+ H_PROP(tag);
233
+ H_PROP(attr);
234
+ H_PROP(etag);
235
+ H_PROP(parent);
236
+ H_PROP(children);
237
+ H_ATTR(encoding);
238
+ H_ATTR(version);
239
+ H_ATTR(standalone);
240
+ H_ATTR(system_id);
241
+ H_ATTR(public_id);
242
+
243
+ static VALUE
244
+ hpricot_ele_get_raw(VALUE self, VALUE x) {
245
+ hpricot_ele *he;
246
+ Data_Get_Struct(self, hpricot_ele, he);
247
+ return he->raw;
248
+ }
249
+
250
+ static VALUE
251
+ hpricot_ele_clear_raw(VALUE self)
252
+ {
253
+ hpricot_ele *he;
254
+ Data_Get_Struct(self, hpricot_ele, he);
255
+ he->raw = Qnil;
256
+ return Qtrue;
257
+ }
258
+
259
+ #define H_ELE(klass) \
260
+ hpricot_ele *he = ALLOC(hpricot_ele); \
261
+ he->name = 0; \
262
+ he->tag = tag; \
263
+ he->attr = attr; \
264
+ he->raw = Qnil; \
265
+ he->EC = ec; \
266
+ he->etag = he->parent = he->children = Qnil; \
267
+ if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_etag || sym == sym_doctype)) { \
268
+ he->raw = rb_str_new(raw, rawlen); \
269
+ } \
270
+ ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he); \
271
+ S->last = ele
272
+
273
+ VALUE
274
+ hpricot_ele_alloc(VALUE klass)
275
+ {
276
+ VALUE ele;
277
+ hpricot_ele *he = ALLOC(hpricot_ele);
278
+ he->name = 0;
279
+ he->tag = he->attr = he->raw = he->EC = Qnil;
280
+ he->etag = he->parent = he->children = Qnil;
281
+ ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he);
282
+ return ele;
283
+ }
284
+
285
+ //
286
+ // the swift, compact parser logic. most of the complicated stuff is done
287
+ // in the lexer. this step just pairs up the start and end tags.
288
+ //
289
+ void
290
+ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw, int rawlen, int taint)
291
+ {
292
+ VALUE ele, ec = Qnil;
293
+
294
+ //
295
+ // in html mode, fix up start tags incorrectly formed as empty tags
296
+ //
297
+ if (!S->xml) {
298
+ hpricot_ele *last;
299
+ Data_Get_Struct(S->focus, hpricot_ele, last);
300
+ if (last->EC == sym_CDATA &&
301
+ (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
302
+ !(sym == sym_etag && rb_str_hash(tag) == last->name))
303
+ {
304
+ sym = sym_text;
305
+ tag = rb_str_new(raw, rawlen);
306
+ }
307
+
308
+ if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
309
+ ec = rb_hash_aref(S->EC, tag);
310
+ if (NIL_P(ec)) {
311
+ tag = rb_funcall(tag, s_downcase, 0);
312
+ ec = rb_hash_aref(S->EC, tag);
313
+ }
314
+ if (sym == sym_emptytag) {
315
+ if (ec != sym_EMPTY)
316
+ sym = sym_stag;
317
+ } else if (sym == sym_stag) {
318
+ if (ec == sym_EMPTY)
319
+ sym = sym_emptytag;
320
+ }
321
+ }
322
+ }
323
+
324
+ if (sym == sym_emptytag || sym == sym_stag) {
325
+ H_ELE(cElem);
326
+ he->name = rb_str_hash(tag);
327
+
328
+ if (!S->xml) {
329
+ VALUE match = Qnil, e = S->focus;
330
+ while (e != S->doc)
331
+ {
332
+ hpricot_ele *hee;
333
+ Data_Get_Struct(e, hpricot_ele, hee);
334
+
335
+ if (TYPE(hee->EC) == T_HASH)
336
+ {
337
+ VALUE has = our_rb_hash_lookup(hee->EC, INT2NUM(he->name));
338
+ if (has != Qnil) {
339
+ if (has == Qtrue) {
340
+ if (match == Qnil)
341
+ match = e;
342
+ } else if (has == symAllow) {
343
+ match = S->focus;
344
+ } else if (has == symDeny) {
345
+ match = Qnil;
346
+ }
347
+ }
348
+ }
349
+
350
+ e = hee->parent;
351
+ }
352
+
353
+ if (match == Qnil)
354
+ match = S->focus;
355
+ S->focus = match;
356
+ }
357
+
358
+ rb_hpricot_add(S->focus, ele);
359
+
360
+ //
361
+ // in the case of a start tag that should be empty, just
362
+ // skip the step that focuses the element. focusing moves
363
+ // us deeper into the document.
364
+ //
365
+ if (sym == sym_stag) {
366
+ if (S->xml || ec != sym_EMPTY) {
367
+ S->focus = ele;
368
+ S->last = Qnil;
369
+ }
370
+ }
371
+ } else if (sym == sym_etag) {
372
+ int name;
373
+ VALUE match = Qnil, e = S->focus;
374
+ if (S->strict) {
375
+ if (NIL_P(rb_hash_aref(S->EC, tag))) {
376
+ tag = rb_str_new2("div");
377
+ }
378
+ }
379
+
380
+ //
381
+ // another optimization will be to improve this very simple
382
+ // O(n) tag search, where n is the depth of the focused tag.
383
+ //
384
+ // (see also: the search above for fixups)
385
+ //
386
+ name = rb_str_hash(tag);
387
+ while (e != S->doc)
388
+ {
389
+ hpricot_ele *he;
390
+ Data_Get_Struct(e, hpricot_ele, he);
391
+
392
+ if (he->name == name)
393
+ {
394
+ match = e;
395
+ break;
396
+ }
397
+
398
+ e = he->parent;
399
+ }
400
+
401
+ if (NIL_P(match))
402
+ {
403
+ H_ELE(cBogusETag);
404
+ rb_hpricot_add(S->focus, ele);
405
+ }
406
+ else
407
+ {
408
+ H_ELE(cETag);
409
+ Data_Get_Struct(match, hpricot_ele, he);
410
+ he->etag = ele;
411
+ S->focus = he->parent;
412
+ S->last = Qnil;
413
+ }
414
+ } else if (sym == sym_cdata) {
415
+ H_ELE(cCData);
416
+ rb_hpricot_add(S->focus, ele);
417
+ } else if (sym == sym_comment) {
418
+ H_ELE(cComment);
419
+ rb_hpricot_add(S->focus, ele);
420
+ } else if (sym == sym_doctype) {
421
+ H_ELE(cDocType);
422
+ if (S->strict) {
423
+ rb_hash_aset(attr, ID2SYM(rb_intern("system_id")), rb_str_new2("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"));
424
+ rb_hash_aset(attr, ID2SYM(rb_intern("public_id")), rb_str_new2("-//W3C//DTD XHTML 1.0 Strict//EN"));
425
+ }
426
+ rb_hpricot_add(S->focus, ele);
427
+ } else if (sym == sym_procins) {
428
+ VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
429
+ tag = rb_reg_nth_match(1, match);
430
+ attr = rb_reg_nth_match(2, match);
431
+ {
432
+ H_ELE(cProcIns);
433
+ rb_hpricot_add(S->focus, ele);
434
+ }
435
+ } else if (sym == sym_text) {
436
+ // TODO: add raw_string as well?
437
+ if (!NIL_P(S->last) && RBASIC(S->last)->klass == cText) {
438
+ hpricot_ele *he;
439
+ Data_Get_Struct(S->last, hpricot_ele, he);
440
+ rb_str_append(he->tag, tag);
441
+ } else {
442
+ H_ELE(cText);
443
+ rb_hpricot_add(S->focus, ele);
444
+ }
445
+ } else if (sym == sym_xmldecl) {
446
+ H_ELE(cXMLDecl);
447
+ rb_hpricot_add(S->focus, ele);
448
+ }
449
+ }
450
+
451
+ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
136
452
  {
137
- int cs, act, have = 0, nread = 0, curline = 1, text = 0;
453
+ int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
138
454
  char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
139
455
 
456
+ hpricot_state *S = NULL;
457
+ VALUE port, opts;
140
458
  VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
141
459
  char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
142
- int done = 0, ele_open = 0, buffer_size = 0;
460
+ int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
143
461
 
144
- int taint = OBJ_TAINTED( port );
145
- if ( !rb_respond_to( port, s_read ) )
462
+ rb_scan_args(argc, argv, "11", &port, &opts);
463
+ taint = OBJ_TAINTED(port);
464
+ io = rb_respond_to(port, s_read);
465
+ if (!io)
146
466
  {
147
- if ( rb_respond_to( port, s_to_str ) )
467
+ if (rb_respond_to(port, s_to_str))
148
468
  {
149
- port = rb_funcall( port, s_to_str, 0 );
469
+ port = rb_funcall(port, s_to_str, 0);
150
470
  StringValue(port);
151
471
  }
152
472
  else
153
473
  {
154
- rb_raise( rb_eArgError, "bad Hpricot argument, String or IO only please." );
474
+ rb_raise(rb_eArgError, "an Hpricot document must be built from an input source (a String or IO object.)");
155
475
  }
156
476
  }
157
477
 
478
+ if (TYPE(opts) != T_HASH)
479
+ opts = Qnil;
480
+
481
+ if (!rb_block_given_p())
482
+ {
483
+ hpricot_ele *he = ALLOC(hpricot_ele);
484
+ S = ALLOC(hpricot_state);
485
+ MEMZERO(he, hpricot_ele, 1);
486
+ he->tag = he->attr = he->etag = he->parent = he->children = Qnil;
487
+ S->doc = Data_Wrap_Struct(cDoc, hpricot_ele_mark, hpricot_ele_free, he);
488
+ rb_gc_register_address(&S->doc);
489
+ S->focus = S->doc;
490
+ S->last = Qnil;
491
+ S->xml = OPT(opts, xml);
492
+ S->strict = OPT(opts, xhtml_strict);
493
+ S->fixup = OPT(opts, fixup_tags);
494
+ if (S->strict) S->fixup = 1;
495
+ rb_ivar_set(S->doc, rb_intern("@options"), opts);
496
+
497
+ S->EC = rb_const_get(mHpricot, s_ElementContent);
498
+ }
499
+
158
500
  buffer_size = BUFSIZE;
159
501
  if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
160
502
  bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
@@ -162,38 +504,57 @@ VALUE hpricot_scan(VALUE self, VALUE port)
162
504
  buffer_size = NUM2INT(bufsize);
163
505
  }
164
506
  }
165
- buf = ALLOC_N(char, buffer_size);
507
+
508
+ if (io)
509
+ buf = ALLOC_N(char, buffer_size);
166
510
 
167
511
  %% write init;
168
512
 
169
- while ( !done ) {
513
+ while (!done) {
170
514
  VALUE str;
171
- char *p = buf + have, *pe;
172
- int len, space = buffer_size - have;
173
-
174
- if ( space == 0 ) {
175
- /* We've used up the entire buffer storing an already-parsed token
176
- * prefix that must be preserved. Likely caused by super-long attributes.
177
- * See ticket #13. */
178
- rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING_PTR(tag), curline);
179
- }
515
+ char *p, *pe;
516
+ int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
180
517
 
181
- if ( rb_respond_to( port, s_read ) )
518
+ if (io)
182
519
  {
183
- str = rb_funcall( port, s_read, 1, INT2FIX(space) );
520
+ if (space == 0) {
521
+ /* We've used up the entire buffer storing an already-parsed token
522
+ * prefix that must be preserved. Likely caused by super-long attributes.
523
+ * Increase buffer size and continue */
524
+ tokstart_diff = ts - buf;
525
+ tokend_diff = te - buf;
526
+ mark_tag_diff = mark_tag - buf;
527
+ mark_akey_diff = mark_akey - buf;
528
+ mark_aval_diff = mark_aval - buf;
529
+
530
+ buffer_size += BUFSIZE;
531
+ REALLOC_N(buf, char, buffer_size);
532
+
533
+ space = buffer_size - have;
534
+
535
+ ts = buf + tokstart_diff;
536
+ te = buf + tokend_diff;
537
+ mark_tag = buf + mark_tag_diff;
538
+ mark_akey = buf + mark_akey_diff;
539
+ mark_aval = buf + mark_aval_diff;
540
+ }
541
+ p = buf + have;
542
+
543
+ str = rb_funcall(port, s_read, 1, INT2FIX(space));
544
+ len = RSTRING_LEN(str);
545
+ memcpy(p, StringValuePtr(str), len);
184
546
  }
185
547
  else
186
548
  {
187
- str = rb_str_substr( port, nread, space );
549
+ p = RSTRING_PTR(port);
550
+ len = RSTRING_LEN(port) + 1;
551
+ done = 1;
188
552
  }
189
553
 
190
- StringValue(str);
191
- memcpy( p, RSTRING_PTR(str), RSTRING_LEN(str) );
192
- len = RSTRING_LEN(str);
193
554
  nread += len;
194
555
 
195
556
  /* If this is the last buffer, tack on an EOF. */
196
- if ( len < space ) {
557
+ if (io && len < space) {
197
558
  p[len++] = 0;
198
559
  done = 1;
199
560
  }
@@ -201,9 +562,10 @@ VALUE hpricot_scan(VALUE self, VALUE port)
201
562
  pe = p + len;
202
563
  %% write exec;
203
564
 
204
- if ( cs == hpricot_scan_error ) {
205
- free(buf);
206
- if ( !NIL_P(tag) )
565
+ if (cs == hpricot_scan_error) {
566
+ if (buf != NULL)
567
+ free(buf);
568
+ if (!NIL_P(tag))
207
569
  {
208
570
  rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
209
571
  }
@@ -213,7 +575,7 @@ VALUE hpricot_scan(VALUE self, VALUE port)
213
575
  }
214
576
  }
215
577
 
216
- if ( done && ele_open )
578
+ if (done && ele_open)
217
579
  {
218
580
  ele_open = 0;
219
581
  if (ts > 0) {
@@ -223,11 +585,11 @@ VALUE hpricot_scan(VALUE self, VALUE port)
223
585
  }
224
586
  }
225
587
 
226
- if ( ts == 0 )
588
+ if (ts == 0)
227
589
  {
228
590
  have = 0;
229
591
  /* text nodes have no ts because each byte is parsed alone */
230
- if ( mark_tag != NULL && text == 1 )
592
+ if (mark_tag != NULL && text == 1)
231
593
  {
232
594
  if (done)
233
595
  {
@@ -242,12 +604,15 @@ VALUE hpricot_scan(VALUE self, VALUE port)
242
604
  CAT(tag, p);
243
605
  }
244
606
  }
245
- mark_tag = buf;
607
+ if (io)
608
+ mark_tag = buf;
609
+ else
610
+ mark_tag = RSTRING_PTR(port);
246
611
  }
247
- else
612
+ else if (io)
248
613
  {
249
614
  have = pe - ts;
250
- memmove( buf, ts, have );
615
+ memmove(buf, ts, have);
251
616
  SLIDE(tag);
252
617
  SLIDE(akey);
253
618
  SLIDE(aval);
@@ -255,18 +620,91 @@ VALUE hpricot_scan(VALUE self, VALUE port)
255
620
  ts = buf;
256
621
  }
257
622
  }
258
- free(buf);
623
+
624
+ if (buf != NULL)
625
+ free(buf);
626
+
627
+ if (S != NULL)
628
+ {
629
+ VALUE doc = S->doc;
630
+ rb_gc_unregister_address(&S->doc);
631
+ free(S);
632
+ return doc;
633
+ }
634
+
635
+ return Qnil;
259
636
  }
260
637
 
261
638
  void Init_hpricot_scan()
262
639
  {
263
- VALUE mHpricot = rb_define_module("Hpricot");
640
+ mHpricot = rb_define_module("Hpricot");
264
641
  rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
265
- rb_define_singleton_method(mHpricot, "scan", hpricot_scan, 1);
642
+ rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
643
+ rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
266
644
  rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
267
645
 
646
+ cDoc = rb_define_class_under(mHpricot, "Doc", rb_cObject);
647
+ rb_define_alloc_func(cDoc, hpricot_ele_alloc);
648
+ rb_define_method(cDoc, "children", hpricot_ele_get_children, 0);
649
+ rb_define_method(cDoc, "children=", hpricot_ele_set_children, 1);
650
+
651
+ cBaseEle = rb_define_class_under(mHpricot, "BaseEle", rb_cObject);
652
+ rb_define_alloc_func(cBaseEle, hpricot_ele_alloc);
653
+ rb_define_method(cBaseEle, "raw_string", hpricot_ele_get_raw, 0);
654
+ rb_define_method(cBaseEle, "clear_raw", hpricot_ele_clear_raw, 0);
655
+ rb_define_method(cBaseEle, "parent", hpricot_ele_get_parent, 0);
656
+ rb_define_method(cBaseEle, "parent=", hpricot_ele_set_parent, 1);
657
+ cCData = rb_define_class_under(mHpricot, "CData", cBaseEle);
658
+ rb_define_method(cCData, "content", hpricot_ele_get_tag, 0);
659
+ rb_define_method(cCData, "content=", hpricot_ele_set_tag, 1);
660
+ cComment = rb_define_class_under(mHpricot, "Comment", cBaseEle);
661
+ rb_define_method(cComment, "content", hpricot_ele_get_tag, 0);
662
+ rb_define_method(cComment, "content=", hpricot_ele_set_tag, 1);
663
+ cDocType = rb_define_class_under(mHpricot, "DocType", cBaseEle);
664
+ rb_define_method(cDocType, "target", hpricot_ele_get_tag, 0);
665
+ rb_define_method(cDocType, "target=", hpricot_ele_set_tag, 1);
666
+ rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
667
+ rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
668
+ rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
669
+ rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
670
+ cElem = rb_define_class_under(mHpricot, "Elem", cBaseEle);
671
+ rb_define_method(cElem, "raw_attributes", hpricot_ele_get_attr, 0);
672
+ rb_define_method(cElem, "raw_attributes=", hpricot_ele_set_attr, 1);
673
+ rb_define_method(cElem, "children", hpricot_ele_get_children, 0);
674
+ rb_define_method(cElem, "children=", hpricot_ele_set_children, 1);
675
+ rb_define_method(cElem, "etag", hpricot_ele_get_etag, 0);
676
+ rb_define_method(cElem, "etag=", hpricot_ele_set_etag, 1);
677
+ rb_define_method(cElem, "name", hpricot_ele_get_tag, 0);
678
+ rb_define_method(cElem, "name=", hpricot_ele_set_tag, 1);
679
+ cETag = rb_define_class_under(mHpricot, "ETag", cBaseEle);
680
+ rb_define_method(cETag, "name", hpricot_ele_get_tag, 0);
681
+ rb_define_method(cETag, "name=", hpricot_ele_set_tag, 1);
682
+ cBogusETag = rb_define_class_under(mHpricot, "BogusETag", cETag);
683
+ cText = rb_define_class_under(mHpricot, "Text", cBaseEle);
684
+ rb_define_method(cText, "content", hpricot_ele_get_tag, 0);
685
+ rb_define_method(cText, "content=", hpricot_ele_set_tag, 1);
686
+ cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", cBaseEle);
687
+ rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
688
+ rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
689
+ rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
690
+ rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
691
+ rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
692
+ rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
693
+ cProcIns = rb_define_class_under(mHpricot, "ProcIns", cBaseEle);
694
+ rb_define_method(cProcIns, "target", hpricot_ele_get_tag, 0);
695
+ rb_define_method(cProcIns, "target=", hpricot_ele_set_tag, 1);
696
+ rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
697
+ rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
698
+
699
+ s_ElementContent = rb_intern("ElementContent");
700
+ symAllow = ID2SYM(rb_intern("allow"));
701
+ symDeny = ID2SYM(rb_intern("deny"));
702
+ s_downcase = rb_intern("downcase");
703
+ s_new = rb_intern("new");
704
+ s_parent = rb_intern("parent");
268
705
  s_read = rb_intern("read");
269
706
  s_to_str = rb_intern("to_str");
707
+ iv_parent = rb_intern("parent");
270
708
  sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
271
709
  sym_doctype = ID2SYM(rb_intern("doctype"));
272
710
  sym_procins = ID2SYM(rb_intern("procins"));
@@ -276,4 +714,9 @@ void Init_hpricot_scan()
276
714
  sym_comment = ID2SYM(rb_intern("comment"));
277
715
  sym_cdata = ID2SYM(rb_intern("cdata"));
278
716
  sym_text = ID2SYM(rb_intern("text"));
717
+ sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
718
+ sym_CDATA = ID2SYM(rb_intern("CDATA"));
719
+
720
+ rb_const_set(mHpricot, rb_intern("ProcInsParse"),
721
+ reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
279
722
  }