adamh-hpricot 0.6.211 → 0.7.229

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,20 +19,26 @@ VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
19
19
  #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
20
20
 
21
21
  static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
22
- sym_cdata, sym_text, sym_EMPTY, sym_CDATA;
22
+ sym_cdata, sym_name, sym_parent, sym_raw_attributes, sym_raw_string, sym_tagno,
23
+ sym_allowed, sym_text, sym_children, sym_EMPTY, sym_CDATA;
23
24
  static VALUE mHpricot, rb_eHpricotParseError;
24
- static VALUE cBaseEle, cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cETag, cText,
25
+ static VALUE cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cText,
25
26
  cXMLDecl, cProcIns, symAllow, symDeny;
26
27
  static ID s_ElementContent;
27
28
  static ID s_downcase, s_new, s_parent, s_read, s_to_str;
28
- static ID iv_parent;
29
29
  static VALUE reProcInsParse;
30
30
 
31
- typedef struct {
32
- int name;
33
- VALUE tag, attr, etag, raw, EC;
34
- VALUE parent, children;
35
- } hpricot_ele;
31
+ #define H_ELE_TAG 0
32
+ #define H_ELE_PARENT 1
33
+ #define H_ELE_ATTR 2
34
+ #define H_ELE_ETAG 3
35
+ #define H_ELE_RAW 4
36
+ #define H_ELE_EC 5
37
+ #define H_ELE_HASH 6
38
+ #define H_ELE_CHILDREN 7
39
+
40
+ #define H_ELE_GET(ele, idx) RSTRUCT_PTR(ele)[idx]
41
+ #define H_ELE_SET(ele, idx, val) RSTRUCT_PTR(ele)[idx] = val
36
42
 
37
43
  #define OPT(opts, key) (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("" # key)))))
38
44
 
@@ -60,7 +66,7 @@ typedef struct {
60
66
 
61
67
  #define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
62
68
 
63
- #define SLIDE(N) if ( mark_##N > ts ) mark_##N = buf + (mark_##N - ts);
69
+ #define SLIDE(N) if (mark_##N > ts) mark_##N = buf + (mark_##N - ts);
64
70
 
65
71
  #define ATTR(K, V) \
66
72
  if (!NIL_P(K)) { \
@@ -107,7 +113,7 @@ typedef struct {
107
113
  action tag { SET(tag, p); }
108
114
  action tagc { SET(tag, p-1); }
109
115
  action aval { SET(aval, p); }
110
- action aunq {
116
+ action aunq {
111
117
  if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
112
118
  else { SET(aval, p); }
113
119
  }
@@ -118,14 +124,16 @@ typedef struct {
118
124
  action pubid { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
119
125
  action sysid { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }
120
126
 
121
- action new_attr {
127
+ action new_attr {
122
128
  akey = Qnil;
123
129
  aval = Qnil;
124
130
  mark_akey = NULL;
125
131
  mark_aval = NULL;
126
132
  }
127
133
 
128
- action save_attr {
134
+ action save_attr {
135
+ if (!S->xml)
136
+ akey = rb_funcall(akey, s_downcase, 0);
129
137
  ATTR(akey, aval);
130
138
  }
131
139
 
@@ -144,7 +152,7 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
144
152
  raw = tag;
145
153
  }
146
154
  ary = rb_ary_new3(4, sym, tag, attr, raw);
147
- if (taint) {
155
+ if (taint) {
148
156
  OBJ_TAINT(ary);
149
157
  OBJ_TAINT(tag);
150
158
  OBJ_TAINT(attr);
@@ -153,6 +161,7 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
153
161
  rb_yield(ary);
154
162
  }
155
163
 
164
+ #ifndef RHASH_TBL
156
165
  /* rb_hash_lookup() is only in Ruby 1.8.7 */
157
166
  static VALUE
158
167
  our_rb_hash_lookup(VALUE hash, VALUE key)
@@ -165,17 +174,17 @@ our_rb_hash_lookup(VALUE hash, VALUE key)
165
174
 
166
175
  return val;
167
176
  }
177
+ #define rb_hash_lookup our_rb_hash_lookup
178
+ #endif
168
179
 
169
180
  static void
170
181
  rb_hpricot_add(VALUE focus, VALUE ele)
171
182
  {
172
- hpricot_ele *he, *he2;
173
- Data_Get_Struct(focus, hpricot_ele, he);
174
- Data_Get_Struct(ele, hpricot_ele, he2);
175
- if (NIL_P(he->children))
176
- he->children = rb_ary_new();
177
- rb_ary_push(he->children, ele);
178
- he2->parent = focus;
183
+ VALUE children = H_ELE_GET(focus, H_ELE_CHILDREN);
184
+ if (NIL_P(children))
185
+ H_ELE_SET(focus, H_ELE_CHILDREN, (children = rb_ary_new2(1)));
186
+ rb_ary_push(children, ele);
187
+ H_ELE_SET(ele, H_ELE_PARENT, focus);
179
188
  }
180
189
 
181
190
  typedef struct {
@@ -186,102 +195,70 @@ typedef struct {
186
195
  unsigned char xml, strict, fixup;
187
196
  } hpricot_state;
188
197
 
189
- static void
190
- hpricot_ele_mark(hpricot_ele *he)
191
- {
192
- rb_gc_mark(he->tag);
193
- rb_gc_mark(he->attr);
194
- rb_gc_mark(he->etag);
195
- rb_gc_mark(he->raw);
196
- rb_gc_mark(he->parent);
197
- rb_gc_mark(he->children);
198
- }
199
-
200
- static void
201
- hpricot_ele_free(hpricot_ele *he)
202
- {
203
- free(he);
204
- }
205
-
206
- #define H_PROP(prop) \
198
+ #define H_PROP(prop, idx) \
207
199
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
208
- hpricot_ele *he; \
209
- Data_Get_Struct(self, hpricot_ele, he); \
210
- he->prop = x; \
200
+ H_ELE_SET(self, idx, x); \
211
201
  return self; \
212
202
  } \
203
+ static VALUE hpricot_ele_clear_##prop(VALUE self) { \
204
+ H_ELE_SET(self, idx, Qnil); \
205
+ return Qtrue; \
206
+ } \
213
207
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
214
- hpricot_ele *he; \
215
- Data_Get_Struct(self, hpricot_ele, he); \
216
- return he->prop; \
208
+ return H_ELE_GET(self, idx); \
217
209
  }
218
210
 
219
211
  #define H_ATTR(prop) \
220
212
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
221
- hpricot_ele *he; \
222
- Data_Get_Struct(self, hpricot_ele, he); \
223
- rb_hash_aset(he->attr, ID2SYM(rb_intern("" # prop)), x); \
213
+ rb_hash_aset(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop)), x); \
224
214
  return self; \
225
215
  } \
226
216
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
227
- hpricot_ele *he; \
228
- Data_Get_Struct(self, hpricot_ele, he); \
229
- return rb_hash_aref(he->attr, ID2SYM(rb_intern("" # prop))); \
217
+ return rb_hash_aref(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop))); \
230
218
  }
231
219
 
232
- H_PROP(tag);
233
- H_PROP(attr);
234
- H_PROP(etag);
235
- H_PROP(parent);
236
- H_PROP(children);
220
+ H_PROP(name, H_ELE_TAG);
221
+ H_PROP(raw, H_ELE_RAW);
222
+ H_PROP(parent, H_ELE_PARENT);
223
+ H_PROP(attr, H_ELE_ATTR);
224
+ H_PROP(etag, H_ELE_ETAG);
225
+ H_PROP(children, H_ELE_CHILDREN);
226
+ H_ATTR(target);
237
227
  H_ATTR(encoding);
238
228
  H_ATTR(version);
239
229
  H_ATTR(standalone);
240
230
  H_ATTR(system_id);
241
231
  H_ATTR(public_id);
242
232
 
243
- static VALUE
244
- hpricot_ele_get_raw(VALUE self, VALUE x) {
245
- hpricot_ele *he;
246
- Data_Get_Struct(self, hpricot_ele, he);
247
- return he->raw;
248
- }
249
-
250
- static VALUE
251
- hpricot_ele_clear_raw(VALUE self)
252
- {
253
- hpricot_ele *he;
254
- Data_Get_Struct(self, hpricot_ele, he);
255
- he->raw = Qnil;
256
- return Qtrue;
257
- }
258
-
259
233
  #define H_ELE(klass) \
260
- hpricot_ele *he = ALLOC(hpricot_ele); \
261
- he->name = 0; \
262
- he->tag = tag; \
263
- he->attr = attr; \
264
- he->raw = Qnil; \
265
- he->EC = ec; \
266
- he->etag = he->parent = he->children = Qnil; \
267
- if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_etag || sym == sym_doctype)) { \
268
- he->raw = rb_str_new(raw, rawlen); \
234
+ ele = rb_obj_alloc(klass); \
235
+ if (klass == cElem) { \
236
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
237
+ H_ELE_SET(ele, H_ELE_ATTR, attr); \
238
+ H_ELE_SET(ele, H_ELE_EC, ec); \
239
+ if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_doctype)) { \
240
+ H_ELE_SET(ele, H_ELE_RAW, rb_str_new(raw, rawlen)); \
241
+ } \
242
+ } else if (klass == cDocType || klass == cProcIns || klass == cXMLDecl || klass == cBogusETag) { \
243
+ if (klass == cBogusETag) { \
244
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
245
+ if (raw != NULL) \
246
+ H_ELE_SET(ele, H_ELE_ATTR, rb_str_new(raw, rawlen)); \
247
+ } else { \
248
+ if (klass == cDocType) \
249
+ ATTR(ID2SYM(rb_intern("target")), tag); \
250
+ H_ELE_SET(ele, H_ELE_ATTR, attr); \
251
+ if (klass != cProcIns) { \
252
+ tag = Qnil; \
253
+ if (raw != NULL) tag = rb_str_new(raw, rawlen); \
254
+ } \
255
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
256
+ } \
257
+ } else { \
258
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
269
259
  } \
270
- ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he); \
271
260
  S->last = ele
272
261
 
273
- VALUE
274
- hpricot_ele_alloc(VALUE klass)
275
- {
276
- VALUE ele;
277
- hpricot_ele *he = ALLOC(hpricot_ele);
278
- he->name = 0;
279
- he->tag = he->attr = he->raw = he->EC = Qnil;
280
- he->etag = he->parent = he->children = Qnil;
281
- ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he);
282
- return ele;
283
- }
284
-
285
262
  //
286
263
  // the swift, compact parser logic. most of the complicated stuff is done
287
264
  // in the lexer. this step just pairs up the start and end tags.
@@ -295,22 +272,23 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
295
272
  // in html mode, fix up start tags incorrectly formed as empty tags
296
273
  //
297
274
  if (!S->xml) {
298
- hpricot_ele *last;
299
- Data_Get_Struct(S->focus, hpricot_ele, last);
300
- if (last->EC == sym_CDATA &&
301
- (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
302
- !(sym == sym_etag && rb_str_hash(tag) == last->name))
303
- {
304
- sym = sym_text;
305
- tag = rb_str_new(raw, rawlen);
306
- }
307
-
308
275
  if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
309
276
  ec = rb_hash_aref(S->EC, tag);
310
277
  if (NIL_P(ec)) {
311
278
  tag = rb_funcall(tag, s_downcase, 0);
312
279
  ec = rb_hash_aref(S->EC, tag);
313
280
  }
281
+ }
282
+
283
+ if (H_ELE_GET(S->focus, H_ELE_EC) == sym_CDATA &&
284
+ (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
285
+ !(sym == sym_etag && INT2FIX(rb_str_hash(tag)) == H_ELE_GET(S->focus, H_ELE_HASH)))
286
+ {
287
+ sym = sym_text;
288
+ tag = rb_str_new(raw, rawlen);
289
+ }
290
+
291
+ if (!NIL_P(ec)) {
314
292
  if (sym == sym_emptytag) {
315
293
  if (ec != sym_EMPTY)
316
294
  sym = sym_stag;
@@ -322,19 +300,19 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
322
300
  }
323
301
 
324
302
  if (sym == sym_emptytag || sym == sym_stag) {
303
+ VALUE name = INT2FIX(rb_str_hash(tag));
325
304
  H_ELE(cElem);
326
- he->name = rb_str_hash(tag);
305
+ H_ELE_SET(ele, H_ELE_HASH, name);
327
306
 
328
307
  if (!S->xml) {
329
308
  VALUE match = Qnil, e = S->focus;
330
309
  while (e != S->doc)
331
310
  {
332
- hpricot_ele *hee;
333
- Data_Get_Struct(e, hpricot_ele, hee);
311
+ VALUE hEC = H_ELE_GET(e, H_ELE_EC);
334
312
 
335
- if (TYPE(hee->EC) == T_HASH)
313
+ if (TYPE(hEC) == T_HASH)
336
314
  {
337
- VALUE has = our_rb_hash_lookup(hee->EC, INT2NUM(he->name));
315
+ VALUE has = rb_hash_lookup(hEC, name);
338
316
  if (has != Qnil) {
339
317
  if (has == Qtrue) {
340
318
  if (match == Qnil)
@@ -347,7 +325,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
347
325
  }
348
326
  }
349
327
 
350
- e = hee->parent;
328
+ e = H_ELE_GET(e, H_ELE_PARENT);
351
329
  }
352
330
 
353
331
  if (match == Qnil)
@@ -369,8 +347,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
369
347
  }
370
348
  }
371
349
  } else if (sym == sym_etag) {
372
- int name;
373
- VALUE match = Qnil, e = S->focus;
350
+ VALUE name, match = Qnil, e = S->focus;
374
351
  if (S->strict) {
375
352
  if (NIL_P(rb_hash_aref(S->EC, tag))) {
376
353
  tag = rb_str_new2("div");
@@ -383,19 +360,16 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
383
360
  //
384
361
  // (see also: the search above for fixups)
385
362
  //
386
- name = rb_str_hash(tag);
363
+ name = INT2FIX(rb_str_hash(tag));
387
364
  while (e != S->doc)
388
365
  {
389
- hpricot_ele *he;
390
- Data_Get_Struct(e, hpricot_ele, he);
391
-
392
- if (he->name == name)
366
+ if (H_ELE_GET(e, H_ELE_HASH) == name)
393
367
  {
394
368
  match = e;
395
369
  break;
396
370
  }
397
371
 
398
- e = he->parent;
372
+ e = H_ELE_GET(e, H_ELE_PARENT);
399
373
  }
400
374
 
401
375
  if (NIL_P(match))
@@ -405,10 +379,11 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
405
379
  }
406
380
  else
407
381
  {
408
- H_ELE(cETag);
409
- Data_Get_Struct(match, hpricot_ele, he);
410
- he->etag = ele;
411
- S->focus = he->parent;
382
+ VALUE ele = Qnil;
383
+ if (raw != NULL)
384
+ ele = rb_str_new(raw, rawlen);
385
+ H_ELE_SET(match, H_ELE_ETAG, ele);
386
+ S->focus = H_ELE_GET(match, H_ELE_PARENT);
412
387
  S->last = Qnil;
413
388
  }
414
389
  } else if (sym == sym_cdata) {
@@ -429,15 +404,13 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
429
404
  tag = rb_reg_nth_match(1, match);
430
405
  attr = rb_reg_nth_match(2, match);
431
406
  {
432
- H_ELE(cProcIns);
433
- rb_hpricot_add(S->focus, ele);
407
+ H_ELE(cProcIns);
408
+ rb_hpricot_add(S->focus, ele);
434
409
  }
435
410
  } else if (sym == sym_text) {
436
411
  // TODO: add raw_string as well?
437
412
  if (!NIL_P(S->last) && RBASIC(S->last)->klass == cText) {
438
- hpricot_ele *he;
439
- Data_Get_Struct(S->last, hpricot_ele, he);
440
- rb_str_append(he->tag, tag);
413
+ rb_str_append(H_ELE_GET(S->last, H_ELE_TAG), tag);
441
414
  } else {
442
415
  H_ELE(cText);
443
416
  rb_hpricot_add(S->focus, ele);
@@ -450,7 +423,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
450
423
 
451
424
  VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
452
425
  {
453
- int cs, act, have = 0, nread = 0, curline = 1, text = 0;
426
+ int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
454
427
  char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
455
428
 
456
429
  hpricot_state *S = NULL;
@@ -460,12 +433,13 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
460
433
  int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
461
434
 
462
435
  rb_scan_args(argc, argv, "11", &port, &opts);
463
- taint = OBJ_TAINTED( port );
464
- if ( !rb_respond_to( port, s_read ) )
436
+ taint = OBJ_TAINTED(port);
437
+ io = rb_respond_to(port, s_read);
438
+ if (!io)
465
439
  {
466
- if ( rb_respond_to( port, s_to_str ) )
440
+ if (rb_respond_to(port, s_to_str))
467
441
  {
468
- port = rb_funcall( port, s_to_str, 0 );
442
+ port = rb_funcall(port, s_to_str, 0);
469
443
  StringValue(port);
470
444
  }
471
445
  else
@@ -479,11 +453,8 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
479
453
 
480
454
  if (!rb_block_given_p())
481
455
  {
482
- hpricot_ele *he = ALLOC(hpricot_ele);
483
456
  S = ALLOC(hpricot_state);
484
- MEMZERO(he, hpricot_ele, 1);
485
- he->tag = he->attr = he->etag = he->parent = he->children = Qnil;
486
- S->doc = Data_Wrap_Struct(cDoc, hpricot_ele_mark, hpricot_ele_free, he);
457
+ S->doc = rb_obj_alloc(cDoc);
487
458
  rb_gc_register_address(&S->doc);
488
459
  S->focus = S->doc;
489
460
  S->last = Qnil;
@@ -503,65 +474,68 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
503
474
  buffer_size = NUM2INT(bufsize);
504
475
  }
505
476
  }
506
- buf = ALLOC_N(char, buffer_size);
477
+
478
+ if (io)
479
+ buf = ALLOC_N(char, buffer_size);
507
480
 
508
481
  %% write init;
509
-
510
- while ( !done ) {
482
+
483
+ while (!done) {
511
484
  VALUE str;
512
485
  char *p, *pe;
513
486
  int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
514
487
 
515
- if ( space == 0 ) {
516
- /* We've used up the entire buffer storing an already-parsed token
517
- * prefix that must be preserved. Likely caused by super-long attributes.
518
- * Increase buffer size and continue */
519
- tokstart_diff = ts - buf;
520
- tokend_diff = te - buf;
521
- mark_tag_diff = mark_tag - buf;
522
- mark_akey_diff = mark_akey - buf;
523
- mark_aval_diff = mark_aval - buf;
524
-
525
- buffer_size += BUFSIZE;
526
- REALLOC_N(buf, char, buffer_size);
527
-
528
- space = buffer_size - have;
529
-
530
- ts= buf + tokstart_diff;
531
- te = buf + tokend_diff;
532
- mark_tag = buf + mark_tag_diff;
533
- mark_akey = buf + mark_akey_diff;
534
- mark_aval = buf + mark_aval_diff;
535
- }
536
- p = buf + have;
537
-
538
- if ( rb_respond_to( port, s_read ) )
488
+ if (io)
539
489
  {
490
+ if (space == 0) {
491
+ /* We've used up the entire buffer storing an already-parsed token
492
+ * prefix that must be preserved. Likely caused by super-long attributes.
493
+ * Increase buffer size and continue */
494
+ tokstart_diff = ts - buf;
495
+ tokend_diff = te - buf;
496
+ mark_tag_diff = mark_tag - buf;
497
+ mark_akey_diff = mark_akey - buf;
498
+ mark_aval_diff = mark_aval - buf;
499
+
500
+ buffer_size += BUFSIZE;
501
+ REALLOC_N(buf, char, buffer_size);
502
+
503
+ space = buffer_size - have;
504
+
505
+ ts = buf + tokstart_diff;
506
+ te = buf + tokend_diff;
507
+ mark_tag = buf + mark_tag_diff;
508
+ mark_akey = buf + mark_akey_diff;
509
+ mark_aval = buf + mark_aval_diff;
510
+ }
511
+ p = buf + have;
512
+
540
513
  str = rb_funcall(port, s_read, 1, INT2FIX(space));
541
514
  len = RSTRING_LEN(str);
542
515
  memcpy(p, StringValuePtr(str), len);
543
516
  }
544
517
  else
545
518
  {
546
- len = RSTRING_LEN(port) - nread;
547
- if (len > space) len = space;
548
- memcpy(p, StringValuePtr(port) + nread, len);
519
+ p = RSTRING_PTR(port);
520
+ len = RSTRING_LEN(port) + 1;
521
+ done = 1;
549
522
  }
550
523
 
551
524
  nread += len;
552
525
 
553
526
  /* If this is the last buffer, tack on an EOF. */
554
- if ( len < space ) {
527
+ if (io && len < space) {
555
528
  p[len++] = 0;
556
529
  done = 1;
557
530
  }
558
531
 
559
532
  pe = p + len;
560
533
  %% write exec;
561
-
562
- if ( cs == hpricot_scan_error ) {
563
- free(buf);
564
- if ( !NIL_P(tag) )
534
+
535
+ if (cs == hpricot_scan_error) {
536
+ if (buf != NULL)
537
+ free(buf);
538
+ if (!NIL_P(tag))
565
539
  {
566
540
  rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
567
541
  }
@@ -570,8 +544,8 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
570
544
  rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
571
545
  }
572
546
  }
573
-
574
- if ( done && ele_open )
547
+
548
+ if (done && ele_open)
575
549
  {
576
550
  ele_open = 0;
577
551
  if (ts > 0) {
@@ -581,11 +555,11 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
581
555
  }
582
556
  }
583
557
 
584
- if ( ts == 0 )
558
+ if (ts == 0)
585
559
  {
586
560
  have = 0;
587
561
  /* text nodes have no ts because each byte is parsed alone */
588
- if ( mark_tag != NULL && text == 1 )
562
+ if (mark_tag != NULL && text == 1)
589
563
  {
590
564
  if (done)
591
565
  {
@@ -600,12 +574,15 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
600
574
  CAT(tag, p);
601
575
  }
602
576
  }
603
- mark_tag = buf;
577
+ if (io)
578
+ mark_tag = buf;
579
+ else
580
+ mark_tag = RSTRING_PTR(port);
604
581
  }
605
- else
582
+ else if (io)
606
583
  {
607
584
  have = pe - ts;
608
- memmove( buf, ts, have );
585
+ memmove(buf, ts, have);
609
586
  SLIDE(tag);
610
587
  SLIDE(akey);
611
588
  SLIDE(aval);
@@ -613,7 +590,9 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
613
590
  ts = buf;
614
591
  }
615
592
  }
616
- free(buf);
593
+
594
+ if (buf != NULL)
595
+ free(buf);
617
596
 
618
597
  if (S != NULL)
619
598
  {
@@ -626,66 +605,103 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
626
605
  return Qnil;
627
606
  }
628
607
 
629
- void Init_hpricot_scan()
608
+ static VALUE
609
+ alloc_hpricot_struct(VALUE klass)
630
610
  {
631
- mHpricot = rb_define_module("Hpricot");
632
- rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
633
- rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
634
- rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
635
- rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
611
+ VALUE size;
612
+ long n;
613
+ NEWOBJ(st, struct RStruct);
614
+ OBJSETUP(st, klass, T_STRUCT);
615
+
616
+ size = rb_struct_iv_get(klass, "__size__");
617
+ n = FIX2LONG(size);
618
+
619
+ #ifndef RSTRUCT_EMBED_LEN_MAX
620
+ st->ptr = ALLOC_N(VALUE, n);
621
+ rb_mem_clear(st->ptr, n);
622
+ st->len = n;
623
+ #else
624
+ if (0 < n && n <= RSTRUCT_EMBED_LEN_MAX) {
625
+ RBASIC(st)->flags &= ~RSTRUCT_EMBED_LEN_MASK;
626
+ RBASIC(st)->flags |= n << RSTRUCT_EMBED_LEN_SHIFT;
627
+ rb_mem_clear(st->as.ary, n);
628
+ } else {
629
+ st->as.heap.ptr = ALLOC_N(VALUE, n);
630
+ rb_mem_clear(st->as.heap.ptr, n);
631
+ st->as.heap.len = n;
632
+ }
633
+ #endif
636
634
 
637
- cDoc = rb_define_class_under(mHpricot, "Doc", rb_cObject);
638
- rb_define_alloc_func(cDoc, hpricot_ele_alloc);
639
- rb_define_method(cDoc, "children", hpricot_ele_get_children, 0);
640
- rb_define_method(cDoc, "children=", hpricot_ele_set_children, 1);
641
-
642
- cBaseEle = rb_define_class_under(mHpricot, "BaseEle", rb_cObject);
643
- rb_define_alloc_func(cBaseEle, hpricot_ele_alloc);
644
- rb_define_method(cBaseEle, "raw_string", hpricot_ele_get_raw, 0);
645
- rb_define_method(cBaseEle, "clear_raw", hpricot_ele_clear_raw, 0);
646
- rb_define_method(cBaseEle, "parent", hpricot_ele_get_parent, 0);
647
- rb_define_method(cBaseEle, "parent=", hpricot_ele_set_parent, 1);
648
- cCData = rb_define_class_under(mHpricot, "CData", cBaseEle);
649
- rb_define_method(cCData, "content", hpricot_ele_get_tag, 0);
650
- rb_define_method(cCData, "content=", hpricot_ele_set_tag, 1);
651
- cComment = rb_define_class_under(mHpricot, "Comment", cBaseEle);
652
- rb_define_method(cComment, "content", hpricot_ele_get_tag, 0);
653
- rb_define_method(cComment, "content=", hpricot_ele_set_tag, 1);
654
- cDocType = rb_define_class_under(mHpricot, "DocType", cBaseEle);
655
- rb_define_method(cDocType, "target", hpricot_ele_get_tag, 0);
656
- rb_define_method(cDocType, "target=", hpricot_ele_set_tag, 1);
657
- rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
658
- rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
659
- rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
660
- rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
661
- cElem = rb_define_class_under(mHpricot, "Elem", cBaseEle);
662
- rb_define_method(cElem, "raw_attributes", hpricot_ele_get_attr, 0);
663
- rb_define_method(cElem, "raw_attributes=", hpricot_ele_set_attr, 1);
664
- rb_define_method(cElem, "children", hpricot_ele_get_children, 0);
665
- rb_define_method(cElem, "children=", hpricot_ele_set_children, 1);
666
- rb_define_method(cElem, "etag", hpricot_ele_get_etag, 0);
667
- rb_define_method(cElem, "etag=", hpricot_ele_set_etag, 1);
668
- rb_define_method(cElem, "name", hpricot_ele_get_tag, 0);
669
- rb_define_method(cElem, "name=", hpricot_ele_set_tag, 1);
670
- cETag = rb_define_class_under(mHpricot, "ETag", cBaseEle);
671
- rb_define_method(cETag, "name", hpricot_ele_get_tag, 0);
672
- rb_define_method(cETag, "name=", hpricot_ele_set_tag, 1);
673
- cBogusETag = rb_define_class_under(mHpricot, "BogusETag", cETag);
674
- cText = rb_define_class_under(mHpricot, "Text", cBaseEle);
675
- rb_define_method(cText, "content", hpricot_ele_get_tag, 0);
676
- rb_define_method(cText, "content=", hpricot_ele_set_tag, 1);
677
- cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", cBaseEle);
678
- rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
679
- rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
680
- rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
681
- rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
682
- rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
683
- rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
684
- cProcIns = rb_define_class_under(mHpricot, "ProcIns", cBaseEle);
685
- rb_define_method(cProcIns, "target", hpricot_ele_get_tag, 0);
686
- rb_define_method(cProcIns, "target=", hpricot_ele_set_tag, 1);
687
- rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
688
- rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
635
+ return (VALUE)st;
636
+ }
637
+
638
+ static VALUE hpricot_struct_ref0(VALUE obj) {return H_ELE_GET(obj, 0);}
639
+ static VALUE hpricot_struct_ref1(VALUE obj) {return H_ELE_GET(obj, 1);}
640
+ static VALUE hpricot_struct_ref2(VALUE obj) {return H_ELE_GET(obj, 2);}
641
+ static VALUE hpricot_struct_ref3(VALUE obj) {return H_ELE_GET(obj, 3);}
642
+ static VALUE hpricot_struct_ref4(VALUE obj) {return H_ELE_GET(obj, 4);}
643
+ static VALUE hpricot_struct_ref5(VALUE obj) {return H_ELE_GET(obj, 5);}
644
+ static VALUE hpricot_struct_ref6(VALUE obj) {return H_ELE_GET(obj, 6);}
645
+ static VALUE hpricot_struct_ref7(VALUE obj) {return H_ELE_GET(obj, 7);}
646
+ static VALUE hpricot_struct_ref8(VALUE obj) {return H_ELE_GET(obj, 8);}
647
+ static VALUE hpricot_struct_ref9(VALUE obj) {return H_ELE_GET(obj, 9);}
648
+
649
+ static VALUE (*ref_func[10])() = {
650
+ hpricot_struct_ref0,
651
+ hpricot_struct_ref1,
652
+ hpricot_struct_ref2,
653
+ hpricot_struct_ref3,
654
+ hpricot_struct_ref4,
655
+ hpricot_struct_ref5,
656
+ hpricot_struct_ref6,
657
+ hpricot_struct_ref7,
658
+ hpricot_struct_ref8,
659
+ hpricot_struct_ref9,
660
+ };
661
+
662
+ static VALUE hpricot_struct_set0(VALUE obj, VALUE val) {return H_ELE_SET(obj, 0, val);}
663
+ static VALUE hpricot_struct_set1(VALUE obj, VALUE val) {return H_ELE_SET(obj, 1, val);}
664
+ static VALUE hpricot_struct_set2(VALUE obj, VALUE val) {return H_ELE_SET(obj, 2, val);}
665
+ static VALUE hpricot_struct_set3(VALUE obj, VALUE val) {return H_ELE_SET(obj, 3, val);}
666
+ static VALUE hpricot_struct_set4(VALUE obj, VALUE val) {return H_ELE_SET(obj, 4, val);}
667
+ static VALUE hpricot_struct_set5(VALUE obj, VALUE val) {return H_ELE_SET(obj, 5, val);}
668
+ static VALUE hpricot_struct_set6(VALUE obj, VALUE val) {return H_ELE_SET(obj, 6, val);}
669
+ static VALUE hpricot_struct_set7(VALUE obj, VALUE val) {return H_ELE_SET(obj, 7, val);}
670
+ static VALUE hpricot_struct_set8(VALUE obj, VALUE val) {return H_ELE_SET(obj, 8, val);}
671
+ static VALUE hpricot_struct_set9(VALUE obj, VALUE val) {return H_ELE_SET(obj, 9, val);}
672
+
673
+ static VALUE (*set_func[10])() = {
674
+ hpricot_struct_set0,
675
+ hpricot_struct_set1,
676
+ hpricot_struct_set2,
677
+ hpricot_struct_set3,
678
+ hpricot_struct_set4,
679
+ hpricot_struct_set5,
680
+ hpricot_struct_set6,
681
+ hpricot_struct_set7,
682
+ hpricot_struct_set8,
683
+ hpricot_struct_set9,
684
+ };
685
+
686
+ static VALUE
687
+ make_hpricot_struct(VALUE members)
688
+ {
689
+ int i = 0;
690
+ VALUE klass = rb_class_new(rb_cObject);
691
+ rb_iv_set(klass, "__size__", INT2NUM(RARRAY_LEN(members)));
692
+ rb_define_alloc_func(klass, alloc_hpricot_struct);
693
+ rb_define_singleton_method(klass, "new", rb_class_new_instance, -1);
694
+ for (i = 0; i < RARRAY_LEN(members); i++) {
695
+ ID id = SYM2ID(RARRAY_PTR(members)[i]);
696
+ rb_define_method_id(klass, id, ref_func[i], 0);
697
+ rb_define_method_id(klass, rb_id_attrset(id), set_func[i], 1);
698
+ }
699
+ return klass;
700
+ }
701
+
702
+ void Init_hpricot_scan()
703
+ {
704
+ VALUE structElem, structAttr, structBasic;
689
705
 
690
706
  s_ElementContent = rb_intern("ElementContent");
691
707
  symAllow = ID2SYM(rb_intern("allow"));
@@ -695,19 +711,78 @@ void Init_hpricot_scan()
695
711
  s_parent = rb_intern("parent");
696
712
  s_read = rb_intern("read");
697
713
  s_to_str = rb_intern("to_str");
698
- iv_parent = rb_intern("parent");
699
714
  sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
700
715
  sym_doctype = ID2SYM(rb_intern("doctype"));
701
716
  sym_procins = ID2SYM(rb_intern("procins"));
702
717
  sym_stag = ID2SYM(rb_intern("stag"));
703
718
  sym_etag = ID2SYM(rb_intern("etag"));
704
719
  sym_emptytag = ID2SYM(rb_intern("emptytag"));
720
+ sym_allowed = ID2SYM(rb_intern("allowed"));
721
+ sym_children = ID2SYM(rb_intern("children"));
705
722
  sym_comment = ID2SYM(rb_intern("comment"));
706
723
  sym_cdata = ID2SYM(rb_intern("cdata"));
724
+ sym_name = ID2SYM(rb_intern("name"));
725
+ sym_parent = ID2SYM(rb_intern("parent"));
726
+ sym_raw_attributes = ID2SYM(rb_intern("raw_attributes"));
727
+ sym_raw_string = ID2SYM(rb_intern("raw_string"));
728
+ sym_tagno = ID2SYM(rb_intern("tagno"));
707
729
  sym_text = ID2SYM(rb_intern("text"));
708
730
  sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
709
731
  sym_CDATA = ID2SYM(rb_intern("CDATA"));
710
732
 
733
+ mHpricot = rb_define_module("Hpricot");
734
+ rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
735
+ rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
736
+ rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
737
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
738
+
739
+ structElem = make_hpricot_struct(rb_ary_new3(8, sym_name, sym_parent,
740
+ sym_raw_attributes, sym_etag, sym_raw_string, sym_allowed,
741
+ sym_tagno, sym_children));
742
+ structAttr = make_hpricot_struct(rb_ary_new3(3, sym_name, sym_parent, sym_raw_attributes));
743
+ structBasic = make_hpricot_struct(rb_ary_new3(2, sym_name, sym_parent));
744
+
745
+ cDoc = rb_define_class_under(mHpricot, "Doc", structElem);
746
+ cCData = rb_define_class_under(mHpricot, "CData", structBasic);
747
+ rb_define_method(cCData, "content", hpricot_ele_get_name, 0);
748
+ rb_define_method(cCData, "content=", hpricot_ele_set_name, 1);
749
+ cComment = rb_define_class_under(mHpricot, "Comment", structBasic);
750
+ rb_define_method(cComment, "content", hpricot_ele_get_name, 0);
751
+ rb_define_method(cComment, "content=", hpricot_ele_set_name, 1);
752
+ cDocType = rb_define_class_under(mHpricot, "DocType", structAttr);
753
+ rb_define_method(cDocType, "raw_string", hpricot_ele_get_name, 0);
754
+ rb_define_method(cDocType, "clear_raw", hpricot_ele_clear_name, 0);
755
+ rb_define_method(cDocType, "target", hpricot_ele_get_target, 0);
756
+ rb_define_method(cDocType, "target=", hpricot_ele_set_target, 1);
757
+ rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
758
+ rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
759
+ rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
760
+ rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
761
+ cElem = rb_define_class_under(mHpricot, "Elem", structElem);
762
+ rb_define_method(cElem, "clear_raw", hpricot_ele_clear_raw, 0);
763
+ cBogusETag = rb_define_class_under(mHpricot, "BogusETag", structAttr);
764
+ rb_define_method(cBogusETag, "raw_string", hpricot_ele_get_attr, 0);
765
+ rb_define_method(cBogusETag, "clear_raw", hpricot_ele_clear_attr, 0);
766
+ cText = rb_define_class_under(mHpricot, "Text", structBasic);
767
+ rb_define_method(cText, "raw_string", hpricot_ele_get_name, 0);
768
+ rb_define_method(cText, "clear_raw", hpricot_ele_clear_name, 0);
769
+ rb_define_method(cText, "content", hpricot_ele_get_name, 0);
770
+ rb_define_method(cText, "content=", hpricot_ele_set_name, 1);
771
+ cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", structAttr);
772
+ rb_define_method(cXMLDecl, "raw_string", hpricot_ele_get_name, 0);
773
+ rb_define_method(cXMLDecl, "clear_raw", hpricot_ele_clear_name, 0);
774
+ rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
775
+ rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
776
+ rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
777
+ rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
778
+ rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
779
+ rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
780
+ cProcIns = rb_define_class_under(mHpricot, "ProcIns", structAttr);
781
+ rb_define_method(cProcIns, "target", hpricot_ele_get_name, 0);
782
+ rb_define_method(cProcIns, "target=", hpricot_ele_set_name, 1);
783
+ rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
784
+ rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
785
+
711
786
  rb_const_set(mHpricot, rb_intern("ProcInsParse"),
712
787
  reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
713
788
  }