why-hpricot 0.6.210 → 0.7.229

Sign up to get free protection for your applications and to get access to all the features.
@@ -19,20 +19,26 @@ VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
19
19
  #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
20
20
 
21
21
  static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
22
- sym_cdata, sym_text, sym_EMPTY, sym_CDATA;
22
+ sym_cdata, sym_name, sym_parent, sym_raw_attributes, sym_raw_string, sym_tagno,
23
+ sym_allowed, sym_text, sym_children, sym_EMPTY, sym_CDATA;
23
24
  static VALUE mHpricot, rb_eHpricotParseError;
24
- static VALUE cBaseEle, cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cETag, cText,
25
+ static VALUE cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cText,
25
26
  cXMLDecl, cProcIns, symAllow, symDeny;
26
27
  static ID s_ElementContent;
27
28
  static ID s_downcase, s_new, s_parent, s_read, s_to_str;
28
- static ID iv_parent;
29
29
  static VALUE reProcInsParse;
30
30
 
31
- typedef struct {
32
- int name;
33
- VALUE tag, attr, etag, raw, EC;
34
- VALUE parent, children;
35
- } hpricot_ele;
31
+ #define H_ELE_TAG 0
32
+ #define H_ELE_PARENT 1
33
+ #define H_ELE_ATTR 2
34
+ #define H_ELE_ETAG 3
35
+ #define H_ELE_RAW 4
36
+ #define H_ELE_EC 5
37
+ #define H_ELE_HASH 6
38
+ #define H_ELE_CHILDREN 7
39
+
40
+ #define H_ELE_GET(ele, idx) RSTRUCT_PTR(ele)[idx]
41
+ #define H_ELE_SET(ele, idx, val) RSTRUCT_PTR(ele)[idx] = val
36
42
 
37
43
  #define OPT(opts, key) (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("" # key)))))
38
44
 
@@ -60,7 +66,7 @@ typedef struct {
60
66
 
61
67
  #define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
62
68
 
63
- #define SLIDE(N) if ( mark_##N > ts ) mark_##N = buf + (mark_##N - ts);
69
+ #define SLIDE(N) if (mark_##N > ts) mark_##N = buf + (mark_##N - ts);
64
70
 
65
71
  #define ATTR(K, V) \
66
72
  if (!NIL_P(K)) { \
@@ -107,7 +113,7 @@ typedef struct {
107
113
  action tag { SET(tag, p); }
108
114
  action tagc { SET(tag, p-1); }
109
115
  action aval { SET(aval, p); }
110
- action aunq {
116
+ action aunq {
111
117
  if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
112
118
  else { SET(aval, p); }
113
119
  }
@@ -118,14 +124,16 @@ typedef struct {
118
124
  action pubid { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
119
125
  action sysid { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }
120
126
 
121
- action new_attr {
127
+ action new_attr {
122
128
  akey = Qnil;
123
129
  aval = Qnil;
124
130
  mark_akey = NULL;
125
131
  mark_aval = NULL;
126
132
  }
127
133
 
128
- action save_attr {
134
+ action save_attr {
135
+ if (!S->xml)
136
+ akey = rb_funcall(akey, s_downcase, 0);
129
137
  ATTR(akey, aval);
130
138
  }
131
139
 
@@ -144,7 +152,7 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
144
152
  raw = tag;
145
153
  }
146
154
  ary = rb_ary_new3(4, sym, tag, attr, raw);
147
- if (taint) {
155
+ if (taint) {
148
156
  OBJ_TAINT(ary);
149
157
  OBJ_TAINT(tag);
150
158
  OBJ_TAINT(attr);
@@ -153,16 +161,30 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
153
161
  rb_yield(ary);
154
162
  }
155
163
 
164
+ #ifndef RHASH_TBL
165
+ /* rb_hash_lookup() is only in Ruby 1.8.7 */
166
+ static VALUE
167
+ our_rb_hash_lookup(VALUE hash, VALUE key)
168
+ {
169
+ VALUE val;
170
+
171
+ if (!st_lookup(RHASH(hash)->tbl, key, &val)) {
172
+ return Qnil; /* without Hash#default */
173
+ }
174
+
175
+ return val;
176
+ }
177
+ #define rb_hash_lookup our_rb_hash_lookup
178
+ #endif
179
+
156
180
  static void
157
181
  rb_hpricot_add(VALUE focus, VALUE ele)
158
182
  {
159
- hpricot_ele *he, *he2;
160
- Data_Get_Struct(focus, hpricot_ele, he);
161
- Data_Get_Struct(ele, hpricot_ele, he2);
162
- if (NIL_P(he->children))
163
- he->children = rb_ary_new();
164
- rb_ary_push(he->children, ele);
165
- he2->parent = focus;
183
+ VALUE children = H_ELE_GET(focus, H_ELE_CHILDREN);
184
+ if (NIL_P(children))
185
+ H_ELE_SET(focus, H_ELE_CHILDREN, (children = rb_ary_new2(1)));
186
+ rb_ary_push(children, ele);
187
+ H_ELE_SET(ele, H_ELE_PARENT, focus);
166
188
  }
167
189
 
168
190
  typedef struct {
@@ -173,102 +195,70 @@ typedef struct {
173
195
  unsigned char xml, strict, fixup;
174
196
  } hpricot_state;
175
197
 
176
- static void
177
- hpricot_ele_mark(hpricot_ele *he)
178
- {
179
- rb_gc_mark(he->tag);
180
- rb_gc_mark(he->attr);
181
- rb_gc_mark(he->etag);
182
- rb_gc_mark(he->raw);
183
- rb_gc_mark(he->parent);
184
- rb_gc_mark(he->children);
185
- }
186
-
187
- static void
188
- hpricot_ele_free(hpricot_ele *he)
189
- {
190
- free(he);
191
- }
192
-
193
- #define H_PROP(prop) \
198
+ #define H_PROP(prop, idx) \
194
199
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
195
- hpricot_ele *he; \
196
- Data_Get_Struct(self, hpricot_ele, he); \
197
- he->prop = x; \
200
+ H_ELE_SET(self, idx, x); \
198
201
  return self; \
199
202
  } \
203
+ static VALUE hpricot_ele_clear_##prop(VALUE self) { \
204
+ H_ELE_SET(self, idx, Qnil); \
205
+ return Qtrue; \
206
+ } \
200
207
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
201
- hpricot_ele *he; \
202
- Data_Get_Struct(self, hpricot_ele, he); \
203
- return he->prop; \
208
+ return H_ELE_GET(self, idx); \
204
209
  }
205
210
 
206
211
  #define H_ATTR(prop) \
207
212
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
208
- hpricot_ele *he; \
209
- Data_Get_Struct(self, hpricot_ele, he); \
210
- rb_hash_aset(he->attr, ID2SYM(rb_intern("" # prop)), x); \
213
+ rb_hash_aset(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop)), x); \
211
214
  return self; \
212
215
  } \
213
216
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
214
- hpricot_ele *he; \
215
- Data_Get_Struct(self, hpricot_ele, he); \
216
- return rb_hash_aref(he->attr, ID2SYM(rb_intern("" # prop))); \
217
+ return rb_hash_aref(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop))); \
217
218
  }
218
219
 
219
- H_PROP(tag);
220
- H_PROP(attr);
221
- H_PROP(etag);
222
- H_PROP(parent);
223
- H_PROP(children);
220
+ H_PROP(name, H_ELE_TAG);
221
+ H_PROP(raw, H_ELE_RAW);
222
+ H_PROP(parent, H_ELE_PARENT);
223
+ H_PROP(attr, H_ELE_ATTR);
224
+ H_PROP(etag, H_ELE_ETAG);
225
+ H_PROP(children, H_ELE_CHILDREN);
226
+ H_ATTR(target);
224
227
  H_ATTR(encoding);
225
228
  H_ATTR(version);
226
229
  H_ATTR(standalone);
227
230
  H_ATTR(system_id);
228
231
  H_ATTR(public_id);
229
232
 
230
- static VALUE
231
- hpricot_ele_get_raw(VALUE self, VALUE x) {
232
- hpricot_ele *he;
233
- Data_Get_Struct(self, hpricot_ele, he);
234
- return he->raw;
235
- }
236
-
237
- static VALUE
238
- hpricot_ele_clear_raw(VALUE self)
239
- {
240
- hpricot_ele *he;
241
- Data_Get_Struct(self, hpricot_ele, he);
242
- he->raw = Qnil;
243
- return Qtrue;
244
- }
245
-
246
233
  #define H_ELE(klass) \
247
- hpricot_ele *he = ALLOC(hpricot_ele); \
248
- he->name = 0; \
249
- he->tag = tag; \
250
- he->attr = attr; \
251
- he->raw = Qnil; \
252
- he->EC = ec; \
253
- he->etag = he->parent = he->children = Qnil; \
254
- if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_etag || sym == sym_doctype)) { \
255
- he->raw = rb_str_new(raw, rawlen); \
234
+ ele = rb_obj_alloc(klass); \
235
+ if (klass == cElem) { \
236
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
237
+ H_ELE_SET(ele, H_ELE_ATTR, attr); \
238
+ H_ELE_SET(ele, H_ELE_EC, ec); \
239
+ if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_doctype)) { \
240
+ H_ELE_SET(ele, H_ELE_RAW, rb_str_new(raw, rawlen)); \
241
+ } \
242
+ } else if (klass == cDocType || klass == cProcIns || klass == cXMLDecl || klass == cBogusETag) { \
243
+ if (klass == cBogusETag) { \
244
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
245
+ if (raw != NULL) \
246
+ H_ELE_SET(ele, H_ELE_ATTR, rb_str_new(raw, rawlen)); \
247
+ } else { \
248
+ if (klass == cDocType) \
249
+ ATTR(ID2SYM(rb_intern("target")), tag); \
250
+ H_ELE_SET(ele, H_ELE_ATTR, attr); \
251
+ if (klass != cProcIns) { \
252
+ tag = Qnil; \
253
+ if (raw != NULL) tag = rb_str_new(raw, rawlen); \
254
+ } \
255
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
256
+ } \
257
+ } else { \
258
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
256
259
  } \
257
- ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he); \
258
260
  S->last = ele
259
261
 
260
- VALUE
261
- hpricot_ele_alloc(VALUE klass)
262
- {
263
- VALUE ele;
264
- hpricot_ele *he = ALLOC(hpricot_ele);
265
- he->name = 0;
266
- he->tag = he->attr = he->raw = he->EC = Qnil;
267
- he->etag = he->parent = he->children = Qnil;
268
- ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he);
269
- return ele;
270
- }
271
-
272
262
  //
273
263
  // the swift, compact parser logic. most of the complicated stuff is done
274
264
  // in the lexer. this step just pairs up the start and end tags.
@@ -282,22 +272,23 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
282
272
  // in html mode, fix up start tags incorrectly formed as empty tags
283
273
  //
284
274
  if (!S->xml) {
285
- hpricot_ele *last;
286
- Data_Get_Struct(S->focus, hpricot_ele, last);
287
- if (last->EC == sym_CDATA &&
288
- (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
289
- !(sym == sym_etag && rb_str_hash(tag) == last->name))
290
- {
291
- sym = sym_text;
292
- tag = rb_str_new(raw, rawlen);
293
- }
294
-
295
275
  if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
296
276
  ec = rb_hash_aref(S->EC, tag);
297
277
  if (NIL_P(ec)) {
298
278
  tag = rb_funcall(tag, s_downcase, 0);
299
279
  ec = rb_hash_aref(S->EC, tag);
300
280
  }
281
+ }
282
+
283
+ if (H_ELE_GET(S->focus, H_ELE_EC) == sym_CDATA &&
284
+ (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
285
+ !(sym == sym_etag && INT2FIX(rb_str_hash(tag)) == H_ELE_GET(S->focus, H_ELE_HASH)))
286
+ {
287
+ sym = sym_text;
288
+ tag = rb_str_new(raw, rawlen);
289
+ }
290
+
291
+ if (!NIL_P(ec)) {
301
292
  if (sym == sym_emptytag) {
302
293
  if (ec != sym_EMPTY)
303
294
  sym = sym_stag;
@@ -309,19 +300,19 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
309
300
  }
310
301
 
311
302
  if (sym == sym_emptytag || sym == sym_stag) {
303
+ VALUE name = INT2FIX(rb_str_hash(tag));
312
304
  H_ELE(cElem);
313
- he->name = rb_str_hash(tag);
305
+ H_ELE_SET(ele, H_ELE_HASH, name);
314
306
 
315
307
  if (!S->xml) {
316
308
  VALUE match = Qnil, e = S->focus;
317
309
  while (e != S->doc)
318
310
  {
319
- hpricot_ele *hee;
320
- Data_Get_Struct(e, hpricot_ele, hee);
311
+ VALUE hEC = H_ELE_GET(e, H_ELE_EC);
321
312
 
322
- if (TYPE(hee->EC) == T_HASH)
313
+ if (TYPE(hEC) == T_HASH)
323
314
  {
324
- VALUE has = rb_hash_lookup(hee->EC, INT2NUM(he->name));
315
+ VALUE has = rb_hash_lookup(hEC, name);
325
316
  if (has != Qnil) {
326
317
  if (has == Qtrue) {
327
318
  if (match == Qnil)
@@ -334,7 +325,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
334
325
  }
335
326
  }
336
327
 
337
- e = hee->parent;
328
+ e = H_ELE_GET(e, H_ELE_PARENT);
338
329
  }
339
330
 
340
331
  if (match == Qnil)
@@ -356,8 +347,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
356
347
  }
357
348
  }
358
349
  } else if (sym == sym_etag) {
359
- int name;
360
- VALUE match = Qnil, e = S->focus;
350
+ VALUE name, match = Qnil, e = S->focus;
361
351
  if (S->strict) {
362
352
  if (NIL_P(rb_hash_aref(S->EC, tag))) {
363
353
  tag = rb_str_new2("div");
@@ -370,19 +360,16 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
370
360
  //
371
361
  // (see also: the search above for fixups)
372
362
  //
373
- name = rb_str_hash(tag);
363
+ name = INT2FIX(rb_str_hash(tag));
374
364
  while (e != S->doc)
375
365
  {
376
- hpricot_ele *he;
377
- Data_Get_Struct(e, hpricot_ele, he);
378
-
379
- if (he->name == name)
366
+ if (H_ELE_GET(e, H_ELE_HASH) == name)
380
367
  {
381
368
  match = e;
382
369
  break;
383
370
  }
384
371
 
385
- e = he->parent;
372
+ e = H_ELE_GET(e, H_ELE_PARENT);
386
373
  }
387
374
 
388
375
  if (NIL_P(match))
@@ -392,10 +379,11 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
392
379
  }
393
380
  else
394
381
  {
395
- H_ELE(cETag);
396
- Data_Get_Struct(match, hpricot_ele, he);
397
- he->etag = ele;
398
- S->focus = he->parent;
382
+ VALUE ele = Qnil;
383
+ if (raw != NULL)
384
+ ele = rb_str_new(raw, rawlen);
385
+ H_ELE_SET(match, H_ELE_ETAG, ele);
386
+ S->focus = H_ELE_GET(match, H_ELE_PARENT);
399
387
  S->last = Qnil;
400
388
  }
401
389
  } else if (sym == sym_cdata) {
@@ -415,14 +403,14 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
415
403
  VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
416
404
  tag = rb_reg_nth_match(1, match);
417
405
  attr = rb_reg_nth_match(2, match);
418
- H_ELE(cProcIns);
419
- rb_hpricot_add(S->focus, ele);
406
+ {
407
+ H_ELE(cProcIns);
408
+ rb_hpricot_add(S->focus, ele);
409
+ }
420
410
  } else if (sym == sym_text) {
421
411
  // TODO: add raw_string as well?
422
412
  if (!NIL_P(S->last) && RBASIC(S->last)->klass == cText) {
423
- hpricot_ele *he;
424
- Data_Get_Struct(S->last, hpricot_ele, he);
425
- rb_str_append(he->tag, tag);
413
+ rb_str_append(H_ELE_GET(S->last, H_ELE_TAG), tag);
426
414
  } else {
427
415
  H_ELE(cText);
428
416
  rb_hpricot_add(S->focus, ele);
@@ -435,7 +423,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
435
423
 
436
424
  VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
437
425
  {
438
- int cs, act, have = 0, nread = 0, curline = 1, text = 0;
426
+ int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
439
427
  char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
440
428
 
441
429
  hpricot_state *S = NULL;
@@ -445,12 +433,13 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
445
433
  int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
446
434
 
447
435
  rb_scan_args(argc, argv, "11", &port, &opts);
448
- taint = OBJ_TAINTED( port );
449
- if ( !rb_respond_to( port, s_read ) )
436
+ taint = OBJ_TAINTED(port);
437
+ io = rb_respond_to(port, s_read);
438
+ if (!io)
450
439
  {
451
- if ( rb_respond_to( port, s_to_str ) )
440
+ if (rb_respond_to(port, s_to_str))
452
441
  {
453
- port = rb_funcall( port, s_to_str, 0 );
442
+ port = rb_funcall(port, s_to_str, 0);
454
443
  StringValue(port);
455
444
  }
456
445
  else
@@ -465,10 +454,7 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
465
454
  if (!rb_block_given_p())
466
455
  {
467
456
  S = ALLOC(hpricot_state);
468
- hpricot_ele *he = ALLOC(hpricot_ele);
469
- MEMZERO(he, hpricot_ele, 1);
470
- he->tag = he->attr = he->etag = he->parent = he->children = Qnil;
471
- S->doc = Data_Wrap_Struct(cDoc, hpricot_ele_mark, hpricot_ele_free, he);
457
+ S->doc = rb_obj_alloc(cDoc);
472
458
  rb_gc_register_address(&S->doc);
473
459
  S->focus = S->doc;
474
460
  S->last = Qnil;
@@ -488,65 +474,68 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
488
474
  buffer_size = NUM2INT(bufsize);
489
475
  }
490
476
  }
491
- buf = ALLOC_N(char, buffer_size);
477
+
478
+ if (io)
479
+ buf = ALLOC_N(char, buffer_size);
492
480
 
493
481
  %% write init;
494
-
495
- while ( !done ) {
482
+
483
+ while (!done) {
496
484
  VALUE str;
497
485
  char *p, *pe;
498
486
  int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
499
487
 
500
- if ( space == 0 ) {
501
- /* We've used up the entire buffer storing an already-parsed token
502
- * prefix that must be preserved. Likely caused by super-long attributes.
503
- * Increase buffer size and continue */
504
- tokstart_diff = ts - buf;
505
- tokend_diff = te - buf;
506
- mark_tag_diff = mark_tag - buf;
507
- mark_akey_diff = mark_akey - buf;
508
- mark_aval_diff = mark_aval - buf;
509
-
510
- buffer_size += BUFSIZE;
511
- REALLOC_N(buf, char, buffer_size);
512
-
513
- space = buffer_size - have;
514
-
515
- ts= buf + tokstart_diff;
516
- te = buf + tokend_diff;
517
- mark_tag = buf + mark_tag_diff;
518
- mark_akey = buf + mark_akey_diff;
519
- mark_aval = buf + mark_aval_diff;
520
- }
521
- p = buf + have;
522
-
523
- if ( rb_respond_to( port, s_read ) )
488
+ if (io)
524
489
  {
490
+ if (space == 0) {
491
+ /* We've used up the entire buffer storing an already-parsed token
492
+ * prefix that must be preserved. Likely caused by super-long attributes.
493
+ * Increase buffer size and continue */
494
+ tokstart_diff = ts - buf;
495
+ tokend_diff = te - buf;
496
+ mark_tag_diff = mark_tag - buf;
497
+ mark_akey_diff = mark_akey - buf;
498
+ mark_aval_diff = mark_aval - buf;
499
+
500
+ buffer_size += BUFSIZE;
501
+ REALLOC_N(buf, char, buffer_size);
502
+
503
+ space = buffer_size - have;
504
+
505
+ ts = buf + tokstart_diff;
506
+ te = buf + tokend_diff;
507
+ mark_tag = buf + mark_tag_diff;
508
+ mark_akey = buf + mark_akey_diff;
509
+ mark_aval = buf + mark_aval_diff;
510
+ }
511
+ p = buf + have;
512
+
525
513
  str = rb_funcall(port, s_read, 1, INT2FIX(space));
526
514
  len = RSTRING_LEN(str);
527
515
  memcpy(p, StringValuePtr(str), len);
528
516
  }
529
517
  else
530
518
  {
531
- len = RSTRING_LEN(port) - nread;
532
- if (len > space) len = space;
533
- memcpy(p, StringValuePtr(port) + nread, len);
519
+ p = RSTRING_PTR(port);
520
+ len = RSTRING_LEN(port) + 1;
521
+ done = 1;
534
522
  }
535
523
 
536
524
  nread += len;
537
525
 
538
526
  /* If this is the last buffer, tack on an EOF. */
539
- if ( len < space ) {
527
+ if (io && len < space) {
540
528
  p[len++] = 0;
541
529
  done = 1;
542
530
  }
543
531
 
544
532
  pe = p + len;
545
533
  %% write exec;
546
-
547
- if ( cs == hpricot_scan_error ) {
548
- free(buf);
549
- if ( !NIL_P(tag) )
534
+
535
+ if (cs == hpricot_scan_error) {
536
+ if (buf != NULL)
537
+ free(buf);
538
+ if (!NIL_P(tag))
550
539
  {
551
540
  rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
552
541
  }
@@ -555,8 +544,8 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
555
544
  rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
556
545
  }
557
546
  }
558
-
559
- if ( done && ele_open )
547
+
548
+ if (done && ele_open)
560
549
  {
561
550
  ele_open = 0;
562
551
  if (ts > 0) {
@@ -566,11 +555,11 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
566
555
  }
567
556
  }
568
557
 
569
- if ( ts == 0 )
558
+ if (ts == 0)
570
559
  {
571
560
  have = 0;
572
561
  /* text nodes have no ts because each byte is parsed alone */
573
- if ( mark_tag != NULL && text == 1 )
562
+ if (mark_tag != NULL && text == 1)
574
563
  {
575
564
  if (done)
576
565
  {
@@ -585,12 +574,15 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
585
574
  CAT(tag, p);
586
575
  }
587
576
  }
588
- mark_tag = buf;
577
+ if (io)
578
+ mark_tag = buf;
579
+ else
580
+ mark_tag = RSTRING_PTR(port);
589
581
  }
590
- else
582
+ else if (io)
591
583
  {
592
584
  have = pe - ts;
593
- memmove( buf, ts, have );
585
+ memmove(buf, ts, have);
594
586
  SLIDE(tag);
595
587
  SLIDE(akey);
596
588
  SLIDE(aval);
@@ -598,7 +590,9 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
598
590
  ts = buf;
599
591
  }
600
592
  }
601
- free(buf);
593
+
594
+ if (buf != NULL)
595
+ free(buf);
602
596
 
603
597
  if (S != NULL)
604
598
  {
@@ -611,66 +605,103 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
611
605
  return Qnil;
612
606
  }
613
607
 
614
- void Init_hpricot_scan()
608
+ static VALUE
609
+ alloc_hpricot_struct(VALUE klass)
615
610
  {
616
- mHpricot = rb_define_module("Hpricot");
617
- rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
618
- rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
619
- rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
620
- rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
611
+ VALUE size;
612
+ long n;
613
+ NEWOBJ(st, struct RStruct);
614
+ OBJSETUP(st, klass, T_STRUCT);
615
+
616
+ size = rb_struct_iv_get(klass, "__size__");
617
+ n = FIX2LONG(size);
618
+
619
+ #ifndef RSTRUCT_EMBED_LEN_MAX
620
+ st->ptr = ALLOC_N(VALUE, n);
621
+ rb_mem_clear(st->ptr, n);
622
+ st->len = n;
623
+ #else
624
+ if (0 < n && n <= RSTRUCT_EMBED_LEN_MAX) {
625
+ RBASIC(st)->flags &= ~RSTRUCT_EMBED_LEN_MASK;
626
+ RBASIC(st)->flags |= n << RSTRUCT_EMBED_LEN_SHIFT;
627
+ rb_mem_clear(st->as.ary, n);
628
+ } else {
629
+ st->as.heap.ptr = ALLOC_N(VALUE, n);
630
+ rb_mem_clear(st->as.heap.ptr, n);
631
+ st->as.heap.len = n;
632
+ }
633
+ #endif
621
634
 
622
- cDoc = rb_define_class_under(mHpricot, "Doc", rb_cObject);
623
- rb_define_alloc_func(cDoc, hpricot_ele_alloc);
624
- rb_define_method(cDoc, "children", hpricot_ele_get_children, 0);
625
- rb_define_method(cDoc, "children=", hpricot_ele_set_children, 1);
626
-
627
- cBaseEle = rb_define_class_under(mHpricot, "BaseEle", rb_cObject);
628
- rb_define_alloc_func(cBaseEle, hpricot_ele_alloc);
629
- rb_define_method(cBaseEle, "raw_string", hpricot_ele_get_raw, 0);
630
- rb_define_method(cBaseEle, "clear_raw", hpricot_ele_clear_raw, 0);
631
- rb_define_method(cBaseEle, "parent", hpricot_ele_get_parent, 0);
632
- rb_define_method(cBaseEle, "parent=", hpricot_ele_set_parent, 1);
633
- cCData = rb_define_class_under(mHpricot, "CData", cBaseEle);
634
- rb_define_method(cCData, "content", hpricot_ele_get_tag, 0);
635
- rb_define_method(cCData, "content=", hpricot_ele_set_tag, 1);
636
- cComment = rb_define_class_under(mHpricot, "Comment", cBaseEle);
637
- rb_define_method(cComment, "content", hpricot_ele_get_tag, 0);
638
- rb_define_method(cComment, "content=", hpricot_ele_set_tag, 1);
639
- cDocType = rb_define_class_under(mHpricot, "DocType", cBaseEle);
640
- rb_define_method(cDocType, "target", hpricot_ele_get_tag, 0);
641
- rb_define_method(cDocType, "target=", hpricot_ele_set_tag, 1);
642
- rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
643
- rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
644
- rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
645
- rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
646
- cElem = rb_define_class_under(mHpricot, "Elem", cBaseEle);
647
- rb_define_method(cElem, "raw_attributes", hpricot_ele_get_attr, 0);
648
- rb_define_method(cElem, "raw_attributes=", hpricot_ele_set_attr, 1);
649
- rb_define_method(cElem, "children", hpricot_ele_get_children, 0);
650
- rb_define_method(cElem, "children=", hpricot_ele_set_children, 1);
651
- rb_define_method(cElem, "etag", hpricot_ele_get_etag, 0);
652
- rb_define_method(cElem, "etag=", hpricot_ele_set_etag, 1);
653
- rb_define_method(cElem, "name", hpricot_ele_get_tag, 0);
654
- rb_define_method(cElem, "name=", hpricot_ele_set_tag, 1);
655
- cETag = rb_define_class_under(mHpricot, "ETag", cBaseEle);
656
- rb_define_method(cETag, "name", hpricot_ele_get_tag, 0);
657
- rb_define_method(cETag, "name=", hpricot_ele_set_tag, 1);
658
- cBogusETag = rb_define_class_under(mHpricot, "BogusETag", cETag);
659
- cText = rb_define_class_under(mHpricot, "Text", cBaseEle);
660
- rb_define_method(cText, "content", hpricot_ele_get_tag, 0);
661
- rb_define_method(cText, "content=", hpricot_ele_set_tag, 1);
662
- cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", cBaseEle);
663
- rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
664
- rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
665
- rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
666
- rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
667
- rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
668
- rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
669
- cProcIns = rb_define_class_under(mHpricot, "ProcIns", cBaseEle);
670
- rb_define_method(cProcIns, "target", hpricot_ele_get_tag, 0);
671
- rb_define_method(cProcIns, "target=", hpricot_ele_set_tag, 1);
672
- rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
673
- rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
635
+ return (VALUE)st;
636
+ }
637
+
638
+ static VALUE hpricot_struct_ref0(VALUE obj) {return H_ELE_GET(obj, 0);}
639
+ static VALUE hpricot_struct_ref1(VALUE obj) {return H_ELE_GET(obj, 1);}
640
+ static VALUE hpricot_struct_ref2(VALUE obj) {return H_ELE_GET(obj, 2);}
641
+ static VALUE hpricot_struct_ref3(VALUE obj) {return H_ELE_GET(obj, 3);}
642
+ static VALUE hpricot_struct_ref4(VALUE obj) {return H_ELE_GET(obj, 4);}
643
+ static VALUE hpricot_struct_ref5(VALUE obj) {return H_ELE_GET(obj, 5);}
644
+ static VALUE hpricot_struct_ref6(VALUE obj) {return H_ELE_GET(obj, 6);}
645
+ static VALUE hpricot_struct_ref7(VALUE obj) {return H_ELE_GET(obj, 7);}
646
+ static VALUE hpricot_struct_ref8(VALUE obj) {return H_ELE_GET(obj, 8);}
647
+ static VALUE hpricot_struct_ref9(VALUE obj) {return H_ELE_GET(obj, 9);}
648
+
649
+ static VALUE (*ref_func[10])() = {
650
+ hpricot_struct_ref0,
651
+ hpricot_struct_ref1,
652
+ hpricot_struct_ref2,
653
+ hpricot_struct_ref3,
654
+ hpricot_struct_ref4,
655
+ hpricot_struct_ref5,
656
+ hpricot_struct_ref6,
657
+ hpricot_struct_ref7,
658
+ hpricot_struct_ref8,
659
+ hpricot_struct_ref9,
660
+ };
661
+
662
+ static VALUE hpricot_struct_set0(VALUE obj, VALUE val) {return H_ELE_SET(obj, 0, val);}
663
+ static VALUE hpricot_struct_set1(VALUE obj, VALUE val) {return H_ELE_SET(obj, 1, val);}
664
+ static VALUE hpricot_struct_set2(VALUE obj, VALUE val) {return H_ELE_SET(obj, 2, val);}
665
+ static VALUE hpricot_struct_set3(VALUE obj, VALUE val) {return H_ELE_SET(obj, 3, val);}
666
+ static VALUE hpricot_struct_set4(VALUE obj, VALUE val) {return H_ELE_SET(obj, 4, val);}
667
+ static VALUE hpricot_struct_set5(VALUE obj, VALUE val) {return H_ELE_SET(obj, 5, val);}
668
+ static VALUE hpricot_struct_set6(VALUE obj, VALUE val) {return H_ELE_SET(obj, 6, val);}
669
+ static VALUE hpricot_struct_set7(VALUE obj, VALUE val) {return H_ELE_SET(obj, 7, val);}
670
+ static VALUE hpricot_struct_set8(VALUE obj, VALUE val) {return H_ELE_SET(obj, 8, val);}
671
+ static VALUE hpricot_struct_set9(VALUE obj, VALUE val) {return H_ELE_SET(obj, 9, val);}
672
+
673
+ static VALUE (*set_func[10])() = {
674
+ hpricot_struct_set0,
675
+ hpricot_struct_set1,
676
+ hpricot_struct_set2,
677
+ hpricot_struct_set3,
678
+ hpricot_struct_set4,
679
+ hpricot_struct_set5,
680
+ hpricot_struct_set6,
681
+ hpricot_struct_set7,
682
+ hpricot_struct_set8,
683
+ hpricot_struct_set9,
684
+ };
685
+
686
+ static VALUE
687
+ make_hpricot_struct(VALUE members)
688
+ {
689
+ int i = 0;
690
+ VALUE klass = rb_class_new(rb_cObject);
691
+ rb_iv_set(klass, "__size__", INT2NUM(RARRAY_LEN(members)));
692
+ rb_define_alloc_func(klass, alloc_hpricot_struct);
693
+ rb_define_singleton_method(klass, "new", rb_class_new_instance, -1);
694
+ for (i = 0; i < RARRAY_LEN(members); i++) {
695
+ ID id = SYM2ID(RARRAY_PTR(members)[i]);
696
+ rb_define_method_id(klass, id, ref_func[i], 0);
697
+ rb_define_method_id(klass, rb_id_attrset(id), set_func[i], 1);
698
+ }
699
+ return klass;
700
+ }
701
+
702
+ void Init_hpricot_scan()
703
+ {
704
+ VALUE structElem, structAttr, structBasic;
674
705
 
675
706
  s_ElementContent = rb_intern("ElementContent");
676
707
  symAllow = ID2SYM(rb_intern("allow"));
@@ -680,19 +711,78 @@ void Init_hpricot_scan()
680
711
  s_parent = rb_intern("parent");
681
712
  s_read = rb_intern("read");
682
713
  s_to_str = rb_intern("to_str");
683
- iv_parent = rb_intern("parent");
684
714
  sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
685
715
  sym_doctype = ID2SYM(rb_intern("doctype"));
686
716
  sym_procins = ID2SYM(rb_intern("procins"));
687
717
  sym_stag = ID2SYM(rb_intern("stag"));
688
718
  sym_etag = ID2SYM(rb_intern("etag"));
689
719
  sym_emptytag = ID2SYM(rb_intern("emptytag"));
720
+ sym_allowed = ID2SYM(rb_intern("allowed"));
721
+ sym_children = ID2SYM(rb_intern("children"));
690
722
  sym_comment = ID2SYM(rb_intern("comment"));
691
723
  sym_cdata = ID2SYM(rb_intern("cdata"));
724
+ sym_name = ID2SYM(rb_intern("name"));
725
+ sym_parent = ID2SYM(rb_intern("parent"));
726
+ sym_raw_attributes = ID2SYM(rb_intern("raw_attributes"));
727
+ sym_raw_string = ID2SYM(rb_intern("raw_string"));
728
+ sym_tagno = ID2SYM(rb_intern("tagno"));
692
729
  sym_text = ID2SYM(rb_intern("text"));
693
730
  sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
694
731
  sym_CDATA = ID2SYM(rb_intern("CDATA"));
695
732
 
733
+ mHpricot = rb_define_module("Hpricot");
734
+ rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
735
+ rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
736
+ rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
737
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
738
+
739
+ structElem = make_hpricot_struct(rb_ary_new3(8, sym_name, sym_parent,
740
+ sym_raw_attributes, sym_etag, sym_raw_string, sym_allowed,
741
+ sym_tagno, sym_children));
742
+ structAttr = make_hpricot_struct(rb_ary_new3(3, sym_name, sym_parent, sym_raw_attributes));
743
+ structBasic = make_hpricot_struct(rb_ary_new3(2, sym_name, sym_parent));
744
+
745
+ cDoc = rb_define_class_under(mHpricot, "Doc", structElem);
746
+ cCData = rb_define_class_under(mHpricot, "CData", structBasic);
747
+ rb_define_method(cCData, "content", hpricot_ele_get_name, 0);
748
+ rb_define_method(cCData, "content=", hpricot_ele_set_name, 1);
749
+ cComment = rb_define_class_under(mHpricot, "Comment", structBasic);
750
+ rb_define_method(cComment, "content", hpricot_ele_get_name, 0);
751
+ rb_define_method(cComment, "content=", hpricot_ele_set_name, 1);
752
+ cDocType = rb_define_class_under(mHpricot, "DocType", structAttr);
753
+ rb_define_method(cDocType, "raw_string", hpricot_ele_get_name, 0);
754
+ rb_define_method(cDocType, "clear_raw", hpricot_ele_clear_name, 0);
755
+ rb_define_method(cDocType, "target", hpricot_ele_get_target, 0);
756
+ rb_define_method(cDocType, "target=", hpricot_ele_set_target, 1);
757
+ rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
758
+ rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
759
+ rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
760
+ rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
761
+ cElem = rb_define_class_under(mHpricot, "Elem", structElem);
762
+ rb_define_method(cElem, "clear_raw", hpricot_ele_clear_raw, 0);
763
+ cBogusETag = rb_define_class_under(mHpricot, "BogusETag", structAttr);
764
+ rb_define_method(cBogusETag, "raw_string", hpricot_ele_get_attr, 0);
765
+ rb_define_method(cBogusETag, "clear_raw", hpricot_ele_clear_attr, 0);
766
+ cText = rb_define_class_under(mHpricot, "Text", structBasic);
767
+ rb_define_method(cText, "raw_string", hpricot_ele_get_name, 0);
768
+ rb_define_method(cText, "clear_raw", hpricot_ele_clear_name, 0);
769
+ rb_define_method(cText, "content", hpricot_ele_get_name, 0);
770
+ rb_define_method(cText, "content=", hpricot_ele_set_name, 1);
771
+ cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", structAttr);
772
+ rb_define_method(cXMLDecl, "raw_string", hpricot_ele_get_name, 0);
773
+ rb_define_method(cXMLDecl, "clear_raw", hpricot_ele_clear_name, 0);
774
+ rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
775
+ rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
776
+ rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
777
+ rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
778
+ rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
779
+ rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
780
+ cProcIns = rb_define_class_under(mHpricot, "ProcIns", structAttr);
781
+ rb_define_method(cProcIns, "target", hpricot_ele_get_name, 0);
782
+ rb_define_method(cProcIns, "target=", hpricot_ele_set_name, 1);
783
+ rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
784
+ rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
785
+
696
786
  rb_const_set(mHpricot, rb_intern("ProcInsParse"),
697
787
  reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
698
788
  }