why-hpricot 0.6.210 → 0.7.229

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,20 +19,26 @@ VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
19
19
  #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
20
20
 
21
21
  static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
22
- sym_cdata, sym_text, sym_EMPTY, sym_CDATA;
22
+ sym_cdata, sym_name, sym_parent, sym_raw_attributes, sym_raw_string, sym_tagno,
23
+ sym_allowed, sym_text, sym_children, sym_EMPTY, sym_CDATA;
23
24
  static VALUE mHpricot, rb_eHpricotParseError;
24
- static VALUE cBaseEle, cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cETag, cText,
25
+ static VALUE cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cText,
25
26
  cXMLDecl, cProcIns, symAllow, symDeny;
26
27
  static ID s_ElementContent;
27
28
  static ID s_downcase, s_new, s_parent, s_read, s_to_str;
28
- static ID iv_parent;
29
29
  static VALUE reProcInsParse;
30
30
 
31
- typedef struct {
32
- int name;
33
- VALUE tag, attr, etag, raw, EC;
34
- VALUE parent, children;
35
- } hpricot_ele;
31
+ #define H_ELE_TAG 0
32
+ #define H_ELE_PARENT 1
33
+ #define H_ELE_ATTR 2
34
+ #define H_ELE_ETAG 3
35
+ #define H_ELE_RAW 4
36
+ #define H_ELE_EC 5
37
+ #define H_ELE_HASH 6
38
+ #define H_ELE_CHILDREN 7
39
+
40
+ #define H_ELE_GET(ele, idx) RSTRUCT_PTR(ele)[idx]
41
+ #define H_ELE_SET(ele, idx, val) RSTRUCT_PTR(ele)[idx] = val
36
42
 
37
43
  #define OPT(opts, key) (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("" # key)))))
38
44
 
@@ -60,7 +66,7 @@ typedef struct {
60
66
 
61
67
  #define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
62
68
 
63
- #define SLIDE(N) if ( mark_##N > ts ) mark_##N = buf + (mark_##N - ts);
69
+ #define SLIDE(N) if (mark_##N > ts) mark_##N = buf + (mark_##N - ts);
64
70
 
65
71
  #define ATTR(K, V) \
66
72
  if (!NIL_P(K)) { \
@@ -107,7 +113,7 @@ typedef struct {
107
113
  action tag { SET(tag, p); }
108
114
  action tagc { SET(tag, p-1); }
109
115
  action aval { SET(aval, p); }
110
- action aunq {
116
+ action aunq {
111
117
  if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
112
118
  else { SET(aval, p); }
113
119
  }
@@ -118,14 +124,16 @@ typedef struct {
118
124
  action pubid { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
119
125
  action sysid { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }
120
126
 
121
- action new_attr {
127
+ action new_attr {
122
128
  akey = Qnil;
123
129
  aval = Qnil;
124
130
  mark_akey = NULL;
125
131
  mark_aval = NULL;
126
132
  }
127
133
 
128
- action save_attr {
134
+ action save_attr {
135
+ if (!S->xml)
136
+ akey = rb_funcall(akey, s_downcase, 0);
129
137
  ATTR(akey, aval);
130
138
  }
131
139
 
@@ -144,7 +152,7 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
144
152
  raw = tag;
145
153
  }
146
154
  ary = rb_ary_new3(4, sym, tag, attr, raw);
147
- if (taint) {
155
+ if (taint) {
148
156
  OBJ_TAINT(ary);
149
157
  OBJ_TAINT(tag);
150
158
  OBJ_TAINT(attr);
@@ -153,16 +161,30 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
153
161
  rb_yield(ary);
154
162
  }
155
163
 
164
+ #ifndef RHASH_TBL
165
+ /* rb_hash_lookup() is only in Ruby 1.8.7 */
166
+ static VALUE
167
+ our_rb_hash_lookup(VALUE hash, VALUE key)
168
+ {
169
+ VALUE val;
170
+
171
+ if (!st_lookup(RHASH(hash)->tbl, key, &val)) {
172
+ return Qnil; /* without Hash#default */
173
+ }
174
+
175
+ return val;
176
+ }
177
+ #define rb_hash_lookup our_rb_hash_lookup
178
+ #endif
179
+
156
180
  static void
157
181
  rb_hpricot_add(VALUE focus, VALUE ele)
158
182
  {
159
- hpricot_ele *he, *he2;
160
- Data_Get_Struct(focus, hpricot_ele, he);
161
- Data_Get_Struct(ele, hpricot_ele, he2);
162
- if (NIL_P(he->children))
163
- he->children = rb_ary_new();
164
- rb_ary_push(he->children, ele);
165
- he2->parent = focus;
183
+ VALUE children = H_ELE_GET(focus, H_ELE_CHILDREN);
184
+ if (NIL_P(children))
185
+ H_ELE_SET(focus, H_ELE_CHILDREN, (children = rb_ary_new2(1)));
186
+ rb_ary_push(children, ele);
187
+ H_ELE_SET(ele, H_ELE_PARENT, focus);
166
188
  }
167
189
 
168
190
  typedef struct {
@@ -173,102 +195,70 @@ typedef struct {
173
195
  unsigned char xml, strict, fixup;
174
196
  } hpricot_state;
175
197
 
176
- static void
177
- hpricot_ele_mark(hpricot_ele *he)
178
- {
179
- rb_gc_mark(he->tag);
180
- rb_gc_mark(he->attr);
181
- rb_gc_mark(he->etag);
182
- rb_gc_mark(he->raw);
183
- rb_gc_mark(he->parent);
184
- rb_gc_mark(he->children);
185
- }
186
-
187
- static void
188
- hpricot_ele_free(hpricot_ele *he)
189
- {
190
- free(he);
191
- }
192
-
193
- #define H_PROP(prop) \
198
+ #define H_PROP(prop, idx) \
194
199
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
195
- hpricot_ele *he; \
196
- Data_Get_Struct(self, hpricot_ele, he); \
197
- he->prop = x; \
200
+ H_ELE_SET(self, idx, x); \
198
201
  return self; \
199
202
  } \
203
+ static VALUE hpricot_ele_clear_##prop(VALUE self) { \
204
+ H_ELE_SET(self, idx, Qnil); \
205
+ return Qtrue; \
206
+ } \
200
207
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
201
- hpricot_ele *he; \
202
- Data_Get_Struct(self, hpricot_ele, he); \
203
- return he->prop; \
208
+ return H_ELE_GET(self, idx); \
204
209
  }
205
210
 
206
211
  #define H_ATTR(prop) \
207
212
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
208
- hpricot_ele *he; \
209
- Data_Get_Struct(self, hpricot_ele, he); \
210
- rb_hash_aset(he->attr, ID2SYM(rb_intern("" # prop)), x); \
213
+ rb_hash_aset(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop)), x); \
211
214
  return self; \
212
215
  } \
213
216
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
214
- hpricot_ele *he; \
215
- Data_Get_Struct(self, hpricot_ele, he); \
216
- return rb_hash_aref(he->attr, ID2SYM(rb_intern("" # prop))); \
217
+ return rb_hash_aref(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop))); \
217
218
  }
218
219
 
219
- H_PROP(tag);
220
- H_PROP(attr);
221
- H_PROP(etag);
222
- H_PROP(parent);
223
- H_PROP(children);
220
+ H_PROP(name, H_ELE_TAG);
221
+ H_PROP(raw, H_ELE_RAW);
222
+ H_PROP(parent, H_ELE_PARENT);
223
+ H_PROP(attr, H_ELE_ATTR);
224
+ H_PROP(etag, H_ELE_ETAG);
225
+ H_PROP(children, H_ELE_CHILDREN);
226
+ H_ATTR(target);
224
227
  H_ATTR(encoding);
225
228
  H_ATTR(version);
226
229
  H_ATTR(standalone);
227
230
  H_ATTR(system_id);
228
231
  H_ATTR(public_id);
229
232
 
230
- static VALUE
231
- hpricot_ele_get_raw(VALUE self, VALUE x) {
232
- hpricot_ele *he;
233
- Data_Get_Struct(self, hpricot_ele, he);
234
- return he->raw;
235
- }
236
-
237
- static VALUE
238
- hpricot_ele_clear_raw(VALUE self)
239
- {
240
- hpricot_ele *he;
241
- Data_Get_Struct(self, hpricot_ele, he);
242
- he->raw = Qnil;
243
- return Qtrue;
244
- }
245
-
246
233
  #define H_ELE(klass) \
247
- hpricot_ele *he = ALLOC(hpricot_ele); \
248
- he->name = 0; \
249
- he->tag = tag; \
250
- he->attr = attr; \
251
- he->raw = Qnil; \
252
- he->EC = ec; \
253
- he->etag = he->parent = he->children = Qnil; \
254
- if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_etag || sym == sym_doctype)) { \
255
- he->raw = rb_str_new(raw, rawlen); \
234
+ ele = rb_obj_alloc(klass); \
235
+ if (klass == cElem) { \
236
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
237
+ H_ELE_SET(ele, H_ELE_ATTR, attr); \
238
+ H_ELE_SET(ele, H_ELE_EC, ec); \
239
+ if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_doctype)) { \
240
+ H_ELE_SET(ele, H_ELE_RAW, rb_str_new(raw, rawlen)); \
241
+ } \
242
+ } else if (klass == cDocType || klass == cProcIns || klass == cXMLDecl || klass == cBogusETag) { \
243
+ if (klass == cBogusETag) { \
244
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
245
+ if (raw != NULL) \
246
+ H_ELE_SET(ele, H_ELE_ATTR, rb_str_new(raw, rawlen)); \
247
+ } else { \
248
+ if (klass == cDocType) \
249
+ ATTR(ID2SYM(rb_intern("target")), tag); \
250
+ H_ELE_SET(ele, H_ELE_ATTR, attr); \
251
+ if (klass != cProcIns) { \
252
+ tag = Qnil; \
253
+ if (raw != NULL) tag = rb_str_new(raw, rawlen); \
254
+ } \
255
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
256
+ } \
257
+ } else { \
258
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
256
259
  } \
257
- ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he); \
258
260
  S->last = ele
259
261
 
260
- VALUE
261
- hpricot_ele_alloc(VALUE klass)
262
- {
263
- VALUE ele;
264
- hpricot_ele *he = ALLOC(hpricot_ele);
265
- he->name = 0;
266
- he->tag = he->attr = he->raw = he->EC = Qnil;
267
- he->etag = he->parent = he->children = Qnil;
268
- ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he);
269
- return ele;
270
- }
271
-
272
262
  //
273
263
  // the swift, compact parser logic. most of the complicated stuff is done
274
264
  // in the lexer. this step just pairs up the start and end tags.
@@ -282,22 +272,23 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
282
272
  // in html mode, fix up start tags incorrectly formed as empty tags
283
273
  //
284
274
  if (!S->xml) {
285
- hpricot_ele *last;
286
- Data_Get_Struct(S->focus, hpricot_ele, last);
287
- if (last->EC == sym_CDATA &&
288
- (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
289
- !(sym == sym_etag && rb_str_hash(tag) == last->name))
290
- {
291
- sym = sym_text;
292
- tag = rb_str_new(raw, rawlen);
293
- }
294
-
295
275
  if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
296
276
  ec = rb_hash_aref(S->EC, tag);
297
277
  if (NIL_P(ec)) {
298
278
  tag = rb_funcall(tag, s_downcase, 0);
299
279
  ec = rb_hash_aref(S->EC, tag);
300
280
  }
281
+ }
282
+
283
+ if (H_ELE_GET(S->focus, H_ELE_EC) == sym_CDATA &&
284
+ (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
285
+ !(sym == sym_etag && INT2FIX(rb_str_hash(tag)) == H_ELE_GET(S->focus, H_ELE_HASH)))
286
+ {
287
+ sym = sym_text;
288
+ tag = rb_str_new(raw, rawlen);
289
+ }
290
+
291
+ if (!NIL_P(ec)) {
301
292
  if (sym == sym_emptytag) {
302
293
  if (ec != sym_EMPTY)
303
294
  sym = sym_stag;
@@ -309,19 +300,19 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
309
300
  }
310
301
 
311
302
  if (sym == sym_emptytag || sym == sym_stag) {
303
+ VALUE name = INT2FIX(rb_str_hash(tag));
312
304
  H_ELE(cElem);
313
- he->name = rb_str_hash(tag);
305
+ H_ELE_SET(ele, H_ELE_HASH, name);
314
306
 
315
307
  if (!S->xml) {
316
308
  VALUE match = Qnil, e = S->focus;
317
309
  while (e != S->doc)
318
310
  {
319
- hpricot_ele *hee;
320
- Data_Get_Struct(e, hpricot_ele, hee);
311
+ VALUE hEC = H_ELE_GET(e, H_ELE_EC);
321
312
 
322
- if (TYPE(hee->EC) == T_HASH)
313
+ if (TYPE(hEC) == T_HASH)
323
314
  {
324
- VALUE has = rb_hash_lookup(hee->EC, INT2NUM(he->name));
315
+ VALUE has = rb_hash_lookup(hEC, name);
325
316
  if (has != Qnil) {
326
317
  if (has == Qtrue) {
327
318
  if (match == Qnil)
@@ -334,7 +325,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
334
325
  }
335
326
  }
336
327
 
337
- e = hee->parent;
328
+ e = H_ELE_GET(e, H_ELE_PARENT);
338
329
  }
339
330
 
340
331
  if (match == Qnil)
@@ -356,8 +347,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
356
347
  }
357
348
  }
358
349
  } else if (sym == sym_etag) {
359
- int name;
360
- VALUE match = Qnil, e = S->focus;
350
+ VALUE name, match = Qnil, e = S->focus;
361
351
  if (S->strict) {
362
352
  if (NIL_P(rb_hash_aref(S->EC, tag))) {
363
353
  tag = rb_str_new2("div");
@@ -370,19 +360,16 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
370
360
  //
371
361
  // (see also: the search above for fixups)
372
362
  //
373
- name = rb_str_hash(tag);
363
+ name = INT2FIX(rb_str_hash(tag));
374
364
  while (e != S->doc)
375
365
  {
376
- hpricot_ele *he;
377
- Data_Get_Struct(e, hpricot_ele, he);
378
-
379
- if (he->name == name)
366
+ if (H_ELE_GET(e, H_ELE_HASH) == name)
380
367
  {
381
368
  match = e;
382
369
  break;
383
370
  }
384
371
 
385
- e = he->parent;
372
+ e = H_ELE_GET(e, H_ELE_PARENT);
386
373
  }
387
374
 
388
375
  if (NIL_P(match))
@@ -392,10 +379,11 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
392
379
  }
393
380
  else
394
381
  {
395
- H_ELE(cETag);
396
- Data_Get_Struct(match, hpricot_ele, he);
397
- he->etag = ele;
398
- S->focus = he->parent;
382
+ VALUE ele = Qnil;
383
+ if (raw != NULL)
384
+ ele = rb_str_new(raw, rawlen);
385
+ H_ELE_SET(match, H_ELE_ETAG, ele);
386
+ S->focus = H_ELE_GET(match, H_ELE_PARENT);
399
387
  S->last = Qnil;
400
388
  }
401
389
  } else if (sym == sym_cdata) {
@@ -415,14 +403,14 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
415
403
  VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
416
404
  tag = rb_reg_nth_match(1, match);
417
405
  attr = rb_reg_nth_match(2, match);
418
- H_ELE(cProcIns);
419
- rb_hpricot_add(S->focus, ele);
406
+ {
407
+ H_ELE(cProcIns);
408
+ rb_hpricot_add(S->focus, ele);
409
+ }
420
410
  } else if (sym == sym_text) {
421
411
  // TODO: add raw_string as well?
422
412
  if (!NIL_P(S->last) && RBASIC(S->last)->klass == cText) {
423
- hpricot_ele *he;
424
- Data_Get_Struct(S->last, hpricot_ele, he);
425
- rb_str_append(he->tag, tag);
413
+ rb_str_append(H_ELE_GET(S->last, H_ELE_TAG), tag);
426
414
  } else {
427
415
  H_ELE(cText);
428
416
  rb_hpricot_add(S->focus, ele);
@@ -435,7 +423,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
435
423
 
436
424
  VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
437
425
  {
438
- int cs, act, have = 0, nread = 0, curline = 1, text = 0;
426
+ int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
439
427
  char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
440
428
 
441
429
  hpricot_state *S = NULL;
@@ -445,12 +433,13 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
445
433
  int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
446
434
 
447
435
  rb_scan_args(argc, argv, "11", &port, &opts);
448
- taint = OBJ_TAINTED( port );
449
- if ( !rb_respond_to( port, s_read ) )
436
+ taint = OBJ_TAINTED(port);
437
+ io = rb_respond_to(port, s_read);
438
+ if (!io)
450
439
  {
451
- if ( rb_respond_to( port, s_to_str ) )
440
+ if (rb_respond_to(port, s_to_str))
452
441
  {
453
- port = rb_funcall( port, s_to_str, 0 );
442
+ port = rb_funcall(port, s_to_str, 0);
454
443
  StringValue(port);
455
444
  }
456
445
  else
@@ -465,10 +454,7 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
465
454
  if (!rb_block_given_p())
466
455
  {
467
456
  S = ALLOC(hpricot_state);
468
- hpricot_ele *he = ALLOC(hpricot_ele);
469
- MEMZERO(he, hpricot_ele, 1);
470
- he->tag = he->attr = he->etag = he->parent = he->children = Qnil;
471
- S->doc = Data_Wrap_Struct(cDoc, hpricot_ele_mark, hpricot_ele_free, he);
457
+ S->doc = rb_obj_alloc(cDoc);
472
458
  rb_gc_register_address(&S->doc);
473
459
  S->focus = S->doc;
474
460
  S->last = Qnil;
@@ -488,65 +474,68 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
488
474
  buffer_size = NUM2INT(bufsize);
489
475
  }
490
476
  }
491
- buf = ALLOC_N(char, buffer_size);
477
+
478
+ if (io)
479
+ buf = ALLOC_N(char, buffer_size);
492
480
 
493
481
  %% write init;
494
-
495
- while ( !done ) {
482
+
483
+ while (!done) {
496
484
  VALUE str;
497
485
  char *p, *pe;
498
486
  int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
499
487
 
500
- if ( space == 0 ) {
501
- /* We've used up the entire buffer storing an already-parsed token
502
- * prefix that must be preserved. Likely caused by super-long attributes.
503
- * Increase buffer size and continue */
504
- tokstart_diff = ts - buf;
505
- tokend_diff = te - buf;
506
- mark_tag_diff = mark_tag - buf;
507
- mark_akey_diff = mark_akey - buf;
508
- mark_aval_diff = mark_aval - buf;
509
-
510
- buffer_size += BUFSIZE;
511
- REALLOC_N(buf, char, buffer_size);
512
-
513
- space = buffer_size - have;
514
-
515
- ts= buf + tokstart_diff;
516
- te = buf + tokend_diff;
517
- mark_tag = buf + mark_tag_diff;
518
- mark_akey = buf + mark_akey_diff;
519
- mark_aval = buf + mark_aval_diff;
520
- }
521
- p = buf + have;
522
-
523
- if ( rb_respond_to( port, s_read ) )
488
+ if (io)
524
489
  {
490
+ if (space == 0) {
491
+ /* We've used up the entire buffer storing an already-parsed token
492
+ * prefix that must be preserved. Likely caused by super-long attributes.
493
+ * Increase buffer size and continue */
494
+ tokstart_diff = ts - buf;
495
+ tokend_diff = te - buf;
496
+ mark_tag_diff = mark_tag - buf;
497
+ mark_akey_diff = mark_akey - buf;
498
+ mark_aval_diff = mark_aval - buf;
499
+
500
+ buffer_size += BUFSIZE;
501
+ REALLOC_N(buf, char, buffer_size);
502
+
503
+ space = buffer_size - have;
504
+
505
+ ts = buf + tokstart_diff;
506
+ te = buf + tokend_diff;
507
+ mark_tag = buf + mark_tag_diff;
508
+ mark_akey = buf + mark_akey_diff;
509
+ mark_aval = buf + mark_aval_diff;
510
+ }
511
+ p = buf + have;
512
+
525
513
  str = rb_funcall(port, s_read, 1, INT2FIX(space));
526
514
  len = RSTRING_LEN(str);
527
515
  memcpy(p, StringValuePtr(str), len);
528
516
  }
529
517
  else
530
518
  {
531
- len = RSTRING_LEN(port) - nread;
532
- if (len > space) len = space;
533
- memcpy(p, StringValuePtr(port) + nread, len);
519
+ p = RSTRING_PTR(port);
520
+ len = RSTRING_LEN(port) + 1;
521
+ done = 1;
534
522
  }
535
523
 
536
524
  nread += len;
537
525
 
538
526
  /* If this is the last buffer, tack on an EOF. */
539
- if ( len < space ) {
527
+ if (io && len < space) {
540
528
  p[len++] = 0;
541
529
  done = 1;
542
530
  }
543
531
 
544
532
  pe = p + len;
545
533
  %% write exec;
546
-
547
- if ( cs == hpricot_scan_error ) {
548
- free(buf);
549
- if ( !NIL_P(tag) )
534
+
535
+ if (cs == hpricot_scan_error) {
536
+ if (buf != NULL)
537
+ free(buf);
538
+ if (!NIL_P(tag))
550
539
  {
551
540
  rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
552
541
  }
@@ -555,8 +544,8 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
555
544
  rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
556
545
  }
557
546
  }
558
-
559
- if ( done && ele_open )
547
+
548
+ if (done && ele_open)
560
549
  {
561
550
  ele_open = 0;
562
551
  if (ts > 0) {
@@ -566,11 +555,11 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
566
555
  }
567
556
  }
568
557
 
569
- if ( ts == 0 )
558
+ if (ts == 0)
570
559
  {
571
560
  have = 0;
572
561
  /* text nodes have no ts because each byte is parsed alone */
573
- if ( mark_tag != NULL && text == 1 )
562
+ if (mark_tag != NULL && text == 1)
574
563
  {
575
564
  if (done)
576
565
  {
@@ -585,12 +574,15 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
585
574
  CAT(tag, p);
586
575
  }
587
576
  }
588
- mark_tag = buf;
577
+ if (io)
578
+ mark_tag = buf;
579
+ else
580
+ mark_tag = RSTRING_PTR(port);
589
581
  }
590
- else
582
+ else if (io)
591
583
  {
592
584
  have = pe - ts;
593
- memmove( buf, ts, have );
585
+ memmove(buf, ts, have);
594
586
  SLIDE(tag);
595
587
  SLIDE(akey);
596
588
  SLIDE(aval);
@@ -598,7 +590,9 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
598
590
  ts = buf;
599
591
  }
600
592
  }
601
- free(buf);
593
+
594
+ if (buf != NULL)
595
+ free(buf);
602
596
 
603
597
  if (S != NULL)
604
598
  {
@@ -611,66 +605,103 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
611
605
  return Qnil;
612
606
  }
613
607
 
614
- void Init_hpricot_scan()
608
+ static VALUE
609
+ alloc_hpricot_struct(VALUE klass)
615
610
  {
616
- mHpricot = rb_define_module("Hpricot");
617
- rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
618
- rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
619
- rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
620
- rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
611
+ VALUE size;
612
+ long n;
613
+ NEWOBJ(st, struct RStruct);
614
+ OBJSETUP(st, klass, T_STRUCT);
615
+
616
+ size = rb_struct_iv_get(klass, "__size__");
617
+ n = FIX2LONG(size);
618
+
619
+ #ifndef RSTRUCT_EMBED_LEN_MAX
620
+ st->ptr = ALLOC_N(VALUE, n);
621
+ rb_mem_clear(st->ptr, n);
622
+ st->len = n;
623
+ #else
624
+ if (0 < n && n <= RSTRUCT_EMBED_LEN_MAX) {
625
+ RBASIC(st)->flags &= ~RSTRUCT_EMBED_LEN_MASK;
626
+ RBASIC(st)->flags |= n << RSTRUCT_EMBED_LEN_SHIFT;
627
+ rb_mem_clear(st->as.ary, n);
628
+ } else {
629
+ st->as.heap.ptr = ALLOC_N(VALUE, n);
630
+ rb_mem_clear(st->as.heap.ptr, n);
631
+ st->as.heap.len = n;
632
+ }
633
+ #endif
621
634
 
622
- cDoc = rb_define_class_under(mHpricot, "Doc", rb_cObject);
623
- rb_define_alloc_func(cDoc, hpricot_ele_alloc);
624
- rb_define_method(cDoc, "children", hpricot_ele_get_children, 0);
625
- rb_define_method(cDoc, "children=", hpricot_ele_set_children, 1);
626
-
627
- cBaseEle = rb_define_class_under(mHpricot, "BaseEle", rb_cObject);
628
- rb_define_alloc_func(cBaseEle, hpricot_ele_alloc);
629
- rb_define_method(cBaseEle, "raw_string", hpricot_ele_get_raw, 0);
630
- rb_define_method(cBaseEle, "clear_raw", hpricot_ele_clear_raw, 0);
631
- rb_define_method(cBaseEle, "parent", hpricot_ele_get_parent, 0);
632
- rb_define_method(cBaseEle, "parent=", hpricot_ele_set_parent, 1);
633
- cCData = rb_define_class_under(mHpricot, "CData", cBaseEle);
634
- rb_define_method(cCData, "content", hpricot_ele_get_tag, 0);
635
- rb_define_method(cCData, "content=", hpricot_ele_set_tag, 1);
636
- cComment = rb_define_class_under(mHpricot, "Comment", cBaseEle);
637
- rb_define_method(cComment, "content", hpricot_ele_get_tag, 0);
638
- rb_define_method(cComment, "content=", hpricot_ele_set_tag, 1);
639
- cDocType = rb_define_class_under(mHpricot, "DocType", cBaseEle);
640
- rb_define_method(cDocType, "target", hpricot_ele_get_tag, 0);
641
- rb_define_method(cDocType, "target=", hpricot_ele_set_tag, 1);
642
- rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
643
- rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
644
- rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
645
- rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
646
- cElem = rb_define_class_under(mHpricot, "Elem", cBaseEle);
647
- rb_define_method(cElem, "raw_attributes", hpricot_ele_get_attr, 0);
648
- rb_define_method(cElem, "raw_attributes=", hpricot_ele_set_attr, 1);
649
- rb_define_method(cElem, "children", hpricot_ele_get_children, 0);
650
- rb_define_method(cElem, "children=", hpricot_ele_set_children, 1);
651
- rb_define_method(cElem, "etag", hpricot_ele_get_etag, 0);
652
- rb_define_method(cElem, "etag=", hpricot_ele_set_etag, 1);
653
- rb_define_method(cElem, "name", hpricot_ele_get_tag, 0);
654
- rb_define_method(cElem, "name=", hpricot_ele_set_tag, 1);
655
- cETag = rb_define_class_under(mHpricot, "ETag", cBaseEle);
656
- rb_define_method(cETag, "name", hpricot_ele_get_tag, 0);
657
- rb_define_method(cETag, "name=", hpricot_ele_set_tag, 1);
658
- cBogusETag = rb_define_class_under(mHpricot, "BogusETag", cETag);
659
- cText = rb_define_class_under(mHpricot, "Text", cBaseEle);
660
- rb_define_method(cText, "content", hpricot_ele_get_tag, 0);
661
- rb_define_method(cText, "content=", hpricot_ele_set_tag, 1);
662
- cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", cBaseEle);
663
- rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
664
- rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
665
- rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
666
- rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
667
- rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
668
- rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
669
- cProcIns = rb_define_class_under(mHpricot, "ProcIns", cBaseEle);
670
- rb_define_method(cProcIns, "target", hpricot_ele_get_tag, 0);
671
- rb_define_method(cProcIns, "target=", hpricot_ele_set_tag, 1);
672
- rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
673
- rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
635
+ return (VALUE)st;
636
+ }
637
+
638
+ static VALUE hpricot_struct_ref0(VALUE obj) {return H_ELE_GET(obj, 0);}
639
+ static VALUE hpricot_struct_ref1(VALUE obj) {return H_ELE_GET(obj, 1);}
640
+ static VALUE hpricot_struct_ref2(VALUE obj) {return H_ELE_GET(obj, 2);}
641
+ static VALUE hpricot_struct_ref3(VALUE obj) {return H_ELE_GET(obj, 3);}
642
+ static VALUE hpricot_struct_ref4(VALUE obj) {return H_ELE_GET(obj, 4);}
643
+ static VALUE hpricot_struct_ref5(VALUE obj) {return H_ELE_GET(obj, 5);}
644
+ static VALUE hpricot_struct_ref6(VALUE obj) {return H_ELE_GET(obj, 6);}
645
+ static VALUE hpricot_struct_ref7(VALUE obj) {return H_ELE_GET(obj, 7);}
646
+ static VALUE hpricot_struct_ref8(VALUE obj) {return H_ELE_GET(obj, 8);}
647
+ static VALUE hpricot_struct_ref9(VALUE obj) {return H_ELE_GET(obj, 9);}
648
+
649
+ static VALUE (*ref_func[10])() = {
650
+ hpricot_struct_ref0,
651
+ hpricot_struct_ref1,
652
+ hpricot_struct_ref2,
653
+ hpricot_struct_ref3,
654
+ hpricot_struct_ref4,
655
+ hpricot_struct_ref5,
656
+ hpricot_struct_ref6,
657
+ hpricot_struct_ref7,
658
+ hpricot_struct_ref8,
659
+ hpricot_struct_ref9,
660
+ };
661
+
662
+ static VALUE hpricot_struct_set0(VALUE obj, VALUE val) {return H_ELE_SET(obj, 0, val);}
663
+ static VALUE hpricot_struct_set1(VALUE obj, VALUE val) {return H_ELE_SET(obj, 1, val);}
664
+ static VALUE hpricot_struct_set2(VALUE obj, VALUE val) {return H_ELE_SET(obj, 2, val);}
665
+ static VALUE hpricot_struct_set3(VALUE obj, VALUE val) {return H_ELE_SET(obj, 3, val);}
666
+ static VALUE hpricot_struct_set4(VALUE obj, VALUE val) {return H_ELE_SET(obj, 4, val);}
667
+ static VALUE hpricot_struct_set5(VALUE obj, VALUE val) {return H_ELE_SET(obj, 5, val);}
668
+ static VALUE hpricot_struct_set6(VALUE obj, VALUE val) {return H_ELE_SET(obj, 6, val);}
669
+ static VALUE hpricot_struct_set7(VALUE obj, VALUE val) {return H_ELE_SET(obj, 7, val);}
670
+ static VALUE hpricot_struct_set8(VALUE obj, VALUE val) {return H_ELE_SET(obj, 8, val);}
671
+ static VALUE hpricot_struct_set9(VALUE obj, VALUE val) {return H_ELE_SET(obj, 9, val);}
672
+
673
+ static VALUE (*set_func[10])() = {
674
+ hpricot_struct_set0,
675
+ hpricot_struct_set1,
676
+ hpricot_struct_set2,
677
+ hpricot_struct_set3,
678
+ hpricot_struct_set4,
679
+ hpricot_struct_set5,
680
+ hpricot_struct_set6,
681
+ hpricot_struct_set7,
682
+ hpricot_struct_set8,
683
+ hpricot_struct_set9,
684
+ };
685
+
686
+ static VALUE
687
+ make_hpricot_struct(VALUE members)
688
+ {
689
+ int i = 0;
690
+ VALUE klass = rb_class_new(rb_cObject);
691
+ rb_iv_set(klass, "__size__", INT2NUM(RARRAY_LEN(members)));
692
+ rb_define_alloc_func(klass, alloc_hpricot_struct);
693
+ rb_define_singleton_method(klass, "new", rb_class_new_instance, -1);
694
+ for (i = 0; i < RARRAY_LEN(members); i++) {
695
+ ID id = SYM2ID(RARRAY_PTR(members)[i]);
696
+ rb_define_method_id(klass, id, ref_func[i], 0);
697
+ rb_define_method_id(klass, rb_id_attrset(id), set_func[i], 1);
698
+ }
699
+ return klass;
700
+ }
701
+
702
+ void Init_hpricot_scan()
703
+ {
704
+ VALUE structElem, structAttr, structBasic;
674
705
 
675
706
  s_ElementContent = rb_intern("ElementContent");
676
707
  symAllow = ID2SYM(rb_intern("allow"));
@@ -680,19 +711,78 @@ void Init_hpricot_scan()
680
711
  s_parent = rb_intern("parent");
681
712
  s_read = rb_intern("read");
682
713
  s_to_str = rb_intern("to_str");
683
- iv_parent = rb_intern("parent");
684
714
  sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
685
715
  sym_doctype = ID2SYM(rb_intern("doctype"));
686
716
  sym_procins = ID2SYM(rb_intern("procins"));
687
717
  sym_stag = ID2SYM(rb_intern("stag"));
688
718
  sym_etag = ID2SYM(rb_intern("etag"));
689
719
  sym_emptytag = ID2SYM(rb_intern("emptytag"));
720
+ sym_allowed = ID2SYM(rb_intern("allowed"));
721
+ sym_children = ID2SYM(rb_intern("children"));
690
722
  sym_comment = ID2SYM(rb_intern("comment"));
691
723
  sym_cdata = ID2SYM(rb_intern("cdata"));
724
+ sym_name = ID2SYM(rb_intern("name"));
725
+ sym_parent = ID2SYM(rb_intern("parent"));
726
+ sym_raw_attributes = ID2SYM(rb_intern("raw_attributes"));
727
+ sym_raw_string = ID2SYM(rb_intern("raw_string"));
728
+ sym_tagno = ID2SYM(rb_intern("tagno"));
692
729
  sym_text = ID2SYM(rb_intern("text"));
693
730
  sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
694
731
  sym_CDATA = ID2SYM(rb_intern("CDATA"));
695
732
 
733
+ mHpricot = rb_define_module("Hpricot");
734
+ rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
735
+ rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
736
+ rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
737
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
738
+
739
+ structElem = make_hpricot_struct(rb_ary_new3(8, sym_name, sym_parent,
740
+ sym_raw_attributes, sym_etag, sym_raw_string, sym_allowed,
741
+ sym_tagno, sym_children));
742
+ structAttr = make_hpricot_struct(rb_ary_new3(3, sym_name, sym_parent, sym_raw_attributes));
743
+ structBasic = make_hpricot_struct(rb_ary_new3(2, sym_name, sym_parent));
744
+
745
+ cDoc = rb_define_class_under(mHpricot, "Doc", structElem);
746
+ cCData = rb_define_class_under(mHpricot, "CData", structBasic);
747
+ rb_define_method(cCData, "content", hpricot_ele_get_name, 0);
748
+ rb_define_method(cCData, "content=", hpricot_ele_set_name, 1);
749
+ cComment = rb_define_class_under(mHpricot, "Comment", structBasic);
750
+ rb_define_method(cComment, "content", hpricot_ele_get_name, 0);
751
+ rb_define_method(cComment, "content=", hpricot_ele_set_name, 1);
752
+ cDocType = rb_define_class_under(mHpricot, "DocType", structAttr);
753
+ rb_define_method(cDocType, "raw_string", hpricot_ele_get_name, 0);
754
+ rb_define_method(cDocType, "clear_raw", hpricot_ele_clear_name, 0);
755
+ rb_define_method(cDocType, "target", hpricot_ele_get_target, 0);
756
+ rb_define_method(cDocType, "target=", hpricot_ele_set_target, 1);
757
+ rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
758
+ rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
759
+ rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
760
+ rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
761
+ cElem = rb_define_class_under(mHpricot, "Elem", structElem);
762
+ rb_define_method(cElem, "clear_raw", hpricot_ele_clear_raw, 0);
763
+ cBogusETag = rb_define_class_under(mHpricot, "BogusETag", structAttr);
764
+ rb_define_method(cBogusETag, "raw_string", hpricot_ele_get_attr, 0);
765
+ rb_define_method(cBogusETag, "clear_raw", hpricot_ele_clear_attr, 0);
766
+ cText = rb_define_class_under(mHpricot, "Text", structBasic);
767
+ rb_define_method(cText, "raw_string", hpricot_ele_get_name, 0);
768
+ rb_define_method(cText, "clear_raw", hpricot_ele_clear_name, 0);
769
+ rb_define_method(cText, "content", hpricot_ele_get_name, 0);
770
+ rb_define_method(cText, "content=", hpricot_ele_set_name, 1);
771
+ cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", structAttr);
772
+ rb_define_method(cXMLDecl, "raw_string", hpricot_ele_get_name, 0);
773
+ rb_define_method(cXMLDecl, "clear_raw", hpricot_ele_clear_name, 0);
774
+ rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
775
+ rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
776
+ rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
777
+ rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
778
+ rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
779
+ rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
780
+ cProcIns = rb_define_class_under(mHpricot, "ProcIns", structAttr);
781
+ rb_define_method(cProcIns, "target", hpricot_ele_get_name, 0);
782
+ rb_define_method(cProcIns, "target=", hpricot_ele_set_name, 1);
783
+ rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
784
+ rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
785
+
696
786
  rb_const_set(mHpricot, rb_intern("ProcInsParse"),
697
787
  reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
698
788
  }