adamh-hpricot 0.6.211 → 0.7.229

Sign up to get free protection for your applications and to get access to all the features.
@@ -19,20 +19,26 @@ VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
19
19
  #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
20
20
 
21
21
  static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
22
- sym_cdata, sym_text, sym_EMPTY, sym_CDATA;
22
+ sym_cdata, sym_name, sym_parent, sym_raw_attributes, sym_raw_string, sym_tagno,
23
+ sym_allowed, sym_text, sym_children, sym_EMPTY, sym_CDATA;
23
24
  static VALUE mHpricot, rb_eHpricotParseError;
24
- static VALUE cBaseEle, cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cETag, cText,
25
+ static VALUE cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cText,
25
26
  cXMLDecl, cProcIns, symAllow, symDeny;
26
27
  static ID s_ElementContent;
27
28
  static ID s_downcase, s_new, s_parent, s_read, s_to_str;
28
- static ID iv_parent;
29
29
  static VALUE reProcInsParse;
30
30
 
31
- typedef struct {
32
- int name;
33
- VALUE tag, attr, etag, raw, EC;
34
- VALUE parent, children;
35
- } hpricot_ele;
31
+ #define H_ELE_TAG 0
32
+ #define H_ELE_PARENT 1
33
+ #define H_ELE_ATTR 2
34
+ #define H_ELE_ETAG 3
35
+ #define H_ELE_RAW 4
36
+ #define H_ELE_EC 5
37
+ #define H_ELE_HASH 6
38
+ #define H_ELE_CHILDREN 7
39
+
40
+ #define H_ELE_GET(ele, idx) RSTRUCT_PTR(ele)[idx]
41
+ #define H_ELE_SET(ele, idx, val) RSTRUCT_PTR(ele)[idx] = val
36
42
 
37
43
  #define OPT(opts, key) (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("" # key)))))
38
44
 
@@ -60,7 +66,7 @@ typedef struct {
60
66
 
61
67
  #define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
62
68
 
63
- #define SLIDE(N) if ( mark_##N > ts ) mark_##N = buf + (mark_##N - ts);
69
+ #define SLIDE(N) if (mark_##N > ts) mark_##N = buf + (mark_##N - ts);
64
70
 
65
71
  #define ATTR(K, V) \
66
72
  if (!NIL_P(K)) { \
@@ -107,7 +113,7 @@ typedef struct {
107
113
  action tag { SET(tag, p); }
108
114
  action tagc { SET(tag, p-1); }
109
115
  action aval { SET(aval, p); }
110
- action aunq {
116
+ action aunq {
111
117
  if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
112
118
  else { SET(aval, p); }
113
119
  }
@@ -118,14 +124,16 @@ typedef struct {
118
124
  action pubid { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
119
125
  action sysid { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }
120
126
 
121
- action new_attr {
127
+ action new_attr {
122
128
  akey = Qnil;
123
129
  aval = Qnil;
124
130
  mark_akey = NULL;
125
131
  mark_aval = NULL;
126
132
  }
127
133
 
128
- action save_attr {
134
+ action save_attr {
135
+ if (!S->xml)
136
+ akey = rb_funcall(akey, s_downcase, 0);
129
137
  ATTR(akey, aval);
130
138
  }
131
139
 
@@ -144,7 +152,7 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
144
152
  raw = tag;
145
153
  }
146
154
  ary = rb_ary_new3(4, sym, tag, attr, raw);
147
- if (taint) {
155
+ if (taint) {
148
156
  OBJ_TAINT(ary);
149
157
  OBJ_TAINT(tag);
150
158
  OBJ_TAINT(attr);
@@ -153,6 +161,7 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
153
161
  rb_yield(ary);
154
162
  }
155
163
 
164
+ #ifndef RHASH_TBL
156
165
  /* rb_hash_lookup() is only in Ruby 1.8.7 */
157
166
  static VALUE
158
167
  our_rb_hash_lookup(VALUE hash, VALUE key)
@@ -165,17 +174,17 @@ our_rb_hash_lookup(VALUE hash, VALUE key)
165
174
 
166
175
  return val;
167
176
  }
177
+ #define rb_hash_lookup our_rb_hash_lookup
178
+ #endif
168
179
 
169
180
  static void
170
181
  rb_hpricot_add(VALUE focus, VALUE ele)
171
182
  {
172
- hpricot_ele *he, *he2;
173
- Data_Get_Struct(focus, hpricot_ele, he);
174
- Data_Get_Struct(ele, hpricot_ele, he2);
175
- if (NIL_P(he->children))
176
- he->children = rb_ary_new();
177
- rb_ary_push(he->children, ele);
178
- he2->parent = focus;
183
+ VALUE children = H_ELE_GET(focus, H_ELE_CHILDREN);
184
+ if (NIL_P(children))
185
+ H_ELE_SET(focus, H_ELE_CHILDREN, (children = rb_ary_new2(1)));
186
+ rb_ary_push(children, ele);
187
+ H_ELE_SET(ele, H_ELE_PARENT, focus);
179
188
  }
180
189
 
181
190
  typedef struct {
@@ -186,102 +195,70 @@ typedef struct {
186
195
  unsigned char xml, strict, fixup;
187
196
  } hpricot_state;
188
197
 
189
- static void
190
- hpricot_ele_mark(hpricot_ele *he)
191
- {
192
- rb_gc_mark(he->tag);
193
- rb_gc_mark(he->attr);
194
- rb_gc_mark(he->etag);
195
- rb_gc_mark(he->raw);
196
- rb_gc_mark(he->parent);
197
- rb_gc_mark(he->children);
198
- }
199
-
200
- static void
201
- hpricot_ele_free(hpricot_ele *he)
202
- {
203
- free(he);
204
- }
205
-
206
- #define H_PROP(prop) \
198
+ #define H_PROP(prop, idx) \
207
199
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
208
- hpricot_ele *he; \
209
- Data_Get_Struct(self, hpricot_ele, he); \
210
- he->prop = x; \
200
+ H_ELE_SET(self, idx, x); \
211
201
  return self; \
212
202
  } \
203
+ static VALUE hpricot_ele_clear_##prop(VALUE self) { \
204
+ H_ELE_SET(self, idx, Qnil); \
205
+ return Qtrue; \
206
+ } \
213
207
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
214
- hpricot_ele *he; \
215
- Data_Get_Struct(self, hpricot_ele, he); \
216
- return he->prop; \
208
+ return H_ELE_GET(self, idx); \
217
209
  }
218
210
 
219
211
  #define H_ATTR(prop) \
220
212
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
221
- hpricot_ele *he; \
222
- Data_Get_Struct(self, hpricot_ele, he); \
223
- rb_hash_aset(he->attr, ID2SYM(rb_intern("" # prop)), x); \
213
+ rb_hash_aset(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop)), x); \
224
214
  return self; \
225
215
  } \
226
216
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
227
- hpricot_ele *he; \
228
- Data_Get_Struct(self, hpricot_ele, he); \
229
- return rb_hash_aref(he->attr, ID2SYM(rb_intern("" # prop))); \
217
+ return rb_hash_aref(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop))); \
230
218
  }
231
219
 
232
- H_PROP(tag);
233
- H_PROP(attr);
234
- H_PROP(etag);
235
- H_PROP(parent);
236
- H_PROP(children);
220
+ H_PROP(name, H_ELE_TAG);
221
+ H_PROP(raw, H_ELE_RAW);
222
+ H_PROP(parent, H_ELE_PARENT);
223
+ H_PROP(attr, H_ELE_ATTR);
224
+ H_PROP(etag, H_ELE_ETAG);
225
+ H_PROP(children, H_ELE_CHILDREN);
226
+ H_ATTR(target);
237
227
  H_ATTR(encoding);
238
228
  H_ATTR(version);
239
229
  H_ATTR(standalone);
240
230
  H_ATTR(system_id);
241
231
  H_ATTR(public_id);
242
232
 
243
- static VALUE
244
- hpricot_ele_get_raw(VALUE self, VALUE x) {
245
- hpricot_ele *he;
246
- Data_Get_Struct(self, hpricot_ele, he);
247
- return he->raw;
248
- }
249
-
250
- static VALUE
251
- hpricot_ele_clear_raw(VALUE self)
252
- {
253
- hpricot_ele *he;
254
- Data_Get_Struct(self, hpricot_ele, he);
255
- he->raw = Qnil;
256
- return Qtrue;
257
- }
258
-
259
233
  #define H_ELE(klass) \
260
- hpricot_ele *he = ALLOC(hpricot_ele); \
261
- he->name = 0; \
262
- he->tag = tag; \
263
- he->attr = attr; \
264
- he->raw = Qnil; \
265
- he->EC = ec; \
266
- he->etag = he->parent = he->children = Qnil; \
267
- if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_etag || sym == sym_doctype)) { \
268
- he->raw = rb_str_new(raw, rawlen); \
234
+ ele = rb_obj_alloc(klass); \
235
+ if (klass == cElem) { \
236
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
237
+ H_ELE_SET(ele, H_ELE_ATTR, attr); \
238
+ H_ELE_SET(ele, H_ELE_EC, ec); \
239
+ if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_doctype)) { \
240
+ H_ELE_SET(ele, H_ELE_RAW, rb_str_new(raw, rawlen)); \
241
+ } \
242
+ } else if (klass == cDocType || klass == cProcIns || klass == cXMLDecl || klass == cBogusETag) { \
243
+ if (klass == cBogusETag) { \
244
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
245
+ if (raw != NULL) \
246
+ H_ELE_SET(ele, H_ELE_ATTR, rb_str_new(raw, rawlen)); \
247
+ } else { \
248
+ if (klass == cDocType) \
249
+ ATTR(ID2SYM(rb_intern("target")), tag); \
250
+ H_ELE_SET(ele, H_ELE_ATTR, attr); \
251
+ if (klass != cProcIns) { \
252
+ tag = Qnil; \
253
+ if (raw != NULL) tag = rb_str_new(raw, rawlen); \
254
+ } \
255
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
256
+ } \
257
+ } else { \
258
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
269
259
  } \
270
- ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he); \
271
260
  S->last = ele
272
261
 
273
- VALUE
274
- hpricot_ele_alloc(VALUE klass)
275
- {
276
- VALUE ele;
277
- hpricot_ele *he = ALLOC(hpricot_ele);
278
- he->name = 0;
279
- he->tag = he->attr = he->raw = he->EC = Qnil;
280
- he->etag = he->parent = he->children = Qnil;
281
- ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he);
282
- return ele;
283
- }
284
-
285
262
  //
286
263
  // the swift, compact parser logic. most of the complicated stuff is done
287
264
  // in the lexer. this step just pairs up the start and end tags.
@@ -295,22 +272,23 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
295
272
  // in html mode, fix up start tags incorrectly formed as empty tags
296
273
  //
297
274
  if (!S->xml) {
298
- hpricot_ele *last;
299
- Data_Get_Struct(S->focus, hpricot_ele, last);
300
- if (last->EC == sym_CDATA &&
301
- (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
302
- !(sym == sym_etag && rb_str_hash(tag) == last->name))
303
- {
304
- sym = sym_text;
305
- tag = rb_str_new(raw, rawlen);
306
- }
307
-
308
275
  if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
309
276
  ec = rb_hash_aref(S->EC, tag);
310
277
  if (NIL_P(ec)) {
311
278
  tag = rb_funcall(tag, s_downcase, 0);
312
279
  ec = rb_hash_aref(S->EC, tag);
313
280
  }
281
+ }
282
+
283
+ if (H_ELE_GET(S->focus, H_ELE_EC) == sym_CDATA &&
284
+ (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
285
+ !(sym == sym_etag && INT2FIX(rb_str_hash(tag)) == H_ELE_GET(S->focus, H_ELE_HASH)))
286
+ {
287
+ sym = sym_text;
288
+ tag = rb_str_new(raw, rawlen);
289
+ }
290
+
291
+ if (!NIL_P(ec)) {
314
292
  if (sym == sym_emptytag) {
315
293
  if (ec != sym_EMPTY)
316
294
  sym = sym_stag;
@@ -322,19 +300,19 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
322
300
  }
323
301
 
324
302
  if (sym == sym_emptytag || sym == sym_stag) {
303
+ VALUE name = INT2FIX(rb_str_hash(tag));
325
304
  H_ELE(cElem);
326
- he->name = rb_str_hash(tag);
305
+ H_ELE_SET(ele, H_ELE_HASH, name);
327
306
 
328
307
  if (!S->xml) {
329
308
  VALUE match = Qnil, e = S->focus;
330
309
  while (e != S->doc)
331
310
  {
332
- hpricot_ele *hee;
333
- Data_Get_Struct(e, hpricot_ele, hee);
311
+ VALUE hEC = H_ELE_GET(e, H_ELE_EC);
334
312
 
335
- if (TYPE(hee->EC) == T_HASH)
313
+ if (TYPE(hEC) == T_HASH)
336
314
  {
337
- VALUE has = our_rb_hash_lookup(hee->EC, INT2NUM(he->name));
315
+ VALUE has = rb_hash_lookup(hEC, name);
338
316
  if (has != Qnil) {
339
317
  if (has == Qtrue) {
340
318
  if (match == Qnil)
@@ -347,7 +325,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
347
325
  }
348
326
  }
349
327
 
350
- e = hee->parent;
328
+ e = H_ELE_GET(e, H_ELE_PARENT);
351
329
  }
352
330
 
353
331
  if (match == Qnil)
@@ -369,8 +347,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
369
347
  }
370
348
  }
371
349
  } else if (sym == sym_etag) {
372
- int name;
373
- VALUE match = Qnil, e = S->focus;
350
+ VALUE name, match = Qnil, e = S->focus;
374
351
  if (S->strict) {
375
352
  if (NIL_P(rb_hash_aref(S->EC, tag))) {
376
353
  tag = rb_str_new2("div");
@@ -383,19 +360,16 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
383
360
  //
384
361
  // (see also: the search above for fixups)
385
362
  //
386
- name = rb_str_hash(tag);
363
+ name = INT2FIX(rb_str_hash(tag));
387
364
  while (e != S->doc)
388
365
  {
389
- hpricot_ele *he;
390
- Data_Get_Struct(e, hpricot_ele, he);
391
-
392
- if (he->name == name)
366
+ if (H_ELE_GET(e, H_ELE_HASH) == name)
393
367
  {
394
368
  match = e;
395
369
  break;
396
370
  }
397
371
 
398
- e = he->parent;
372
+ e = H_ELE_GET(e, H_ELE_PARENT);
399
373
  }
400
374
 
401
375
  if (NIL_P(match))
@@ -405,10 +379,11 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
405
379
  }
406
380
  else
407
381
  {
408
- H_ELE(cETag);
409
- Data_Get_Struct(match, hpricot_ele, he);
410
- he->etag = ele;
411
- S->focus = he->parent;
382
+ VALUE ele = Qnil;
383
+ if (raw != NULL)
384
+ ele = rb_str_new(raw, rawlen);
385
+ H_ELE_SET(match, H_ELE_ETAG, ele);
386
+ S->focus = H_ELE_GET(match, H_ELE_PARENT);
412
387
  S->last = Qnil;
413
388
  }
414
389
  } else if (sym == sym_cdata) {
@@ -429,15 +404,13 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
429
404
  tag = rb_reg_nth_match(1, match);
430
405
  attr = rb_reg_nth_match(2, match);
431
406
  {
432
- H_ELE(cProcIns);
433
- rb_hpricot_add(S->focus, ele);
407
+ H_ELE(cProcIns);
408
+ rb_hpricot_add(S->focus, ele);
434
409
  }
435
410
  } else if (sym == sym_text) {
436
411
  // TODO: add raw_string as well?
437
412
  if (!NIL_P(S->last) && RBASIC(S->last)->klass == cText) {
438
- hpricot_ele *he;
439
- Data_Get_Struct(S->last, hpricot_ele, he);
440
- rb_str_append(he->tag, tag);
413
+ rb_str_append(H_ELE_GET(S->last, H_ELE_TAG), tag);
441
414
  } else {
442
415
  H_ELE(cText);
443
416
  rb_hpricot_add(S->focus, ele);
@@ -450,7 +423,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
450
423
 
451
424
  VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
452
425
  {
453
- int cs, act, have = 0, nread = 0, curline = 1, text = 0;
426
+ int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
454
427
  char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
455
428
 
456
429
  hpricot_state *S = NULL;
@@ -460,12 +433,13 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
460
433
  int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
461
434
 
462
435
  rb_scan_args(argc, argv, "11", &port, &opts);
463
- taint = OBJ_TAINTED( port );
464
- if ( !rb_respond_to( port, s_read ) )
436
+ taint = OBJ_TAINTED(port);
437
+ io = rb_respond_to(port, s_read);
438
+ if (!io)
465
439
  {
466
- if ( rb_respond_to( port, s_to_str ) )
440
+ if (rb_respond_to(port, s_to_str))
467
441
  {
468
- port = rb_funcall( port, s_to_str, 0 );
442
+ port = rb_funcall(port, s_to_str, 0);
469
443
  StringValue(port);
470
444
  }
471
445
  else
@@ -479,11 +453,8 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
479
453
 
480
454
  if (!rb_block_given_p())
481
455
  {
482
- hpricot_ele *he = ALLOC(hpricot_ele);
483
456
  S = ALLOC(hpricot_state);
484
- MEMZERO(he, hpricot_ele, 1);
485
- he->tag = he->attr = he->etag = he->parent = he->children = Qnil;
486
- S->doc = Data_Wrap_Struct(cDoc, hpricot_ele_mark, hpricot_ele_free, he);
457
+ S->doc = rb_obj_alloc(cDoc);
487
458
  rb_gc_register_address(&S->doc);
488
459
  S->focus = S->doc;
489
460
  S->last = Qnil;
@@ -503,65 +474,68 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
503
474
  buffer_size = NUM2INT(bufsize);
504
475
  }
505
476
  }
506
- buf = ALLOC_N(char, buffer_size);
477
+
478
+ if (io)
479
+ buf = ALLOC_N(char, buffer_size);
507
480
 
508
481
  %% write init;
509
-
510
- while ( !done ) {
482
+
483
+ while (!done) {
511
484
  VALUE str;
512
485
  char *p, *pe;
513
486
  int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
514
487
 
515
- if ( space == 0 ) {
516
- /* We've used up the entire buffer storing an already-parsed token
517
- * prefix that must be preserved. Likely caused by super-long attributes.
518
- * Increase buffer size and continue */
519
- tokstart_diff = ts - buf;
520
- tokend_diff = te - buf;
521
- mark_tag_diff = mark_tag - buf;
522
- mark_akey_diff = mark_akey - buf;
523
- mark_aval_diff = mark_aval - buf;
524
-
525
- buffer_size += BUFSIZE;
526
- REALLOC_N(buf, char, buffer_size);
527
-
528
- space = buffer_size - have;
529
-
530
- ts= buf + tokstart_diff;
531
- te = buf + tokend_diff;
532
- mark_tag = buf + mark_tag_diff;
533
- mark_akey = buf + mark_akey_diff;
534
- mark_aval = buf + mark_aval_diff;
535
- }
536
- p = buf + have;
537
-
538
- if ( rb_respond_to( port, s_read ) )
488
+ if (io)
539
489
  {
490
+ if (space == 0) {
491
+ /* We've used up the entire buffer storing an already-parsed token
492
+ * prefix that must be preserved. Likely caused by super-long attributes.
493
+ * Increase buffer size and continue */
494
+ tokstart_diff = ts - buf;
495
+ tokend_diff = te - buf;
496
+ mark_tag_diff = mark_tag - buf;
497
+ mark_akey_diff = mark_akey - buf;
498
+ mark_aval_diff = mark_aval - buf;
499
+
500
+ buffer_size += BUFSIZE;
501
+ REALLOC_N(buf, char, buffer_size);
502
+
503
+ space = buffer_size - have;
504
+
505
+ ts = buf + tokstart_diff;
506
+ te = buf + tokend_diff;
507
+ mark_tag = buf + mark_tag_diff;
508
+ mark_akey = buf + mark_akey_diff;
509
+ mark_aval = buf + mark_aval_diff;
510
+ }
511
+ p = buf + have;
512
+
540
513
  str = rb_funcall(port, s_read, 1, INT2FIX(space));
541
514
  len = RSTRING_LEN(str);
542
515
  memcpy(p, StringValuePtr(str), len);
543
516
  }
544
517
  else
545
518
  {
546
- len = RSTRING_LEN(port) - nread;
547
- if (len > space) len = space;
548
- memcpy(p, StringValuePtr(port) + nread, len);
519
+ p = RSTRING_PTR(port);
520
+ len = RSTRING_LEN(port) + 1;
521
+ done = 1;
549
522
  }
550
523
 
551
524
  nread += len;
552
525
 
553
526
  /* If this is the last buffer, tack on an EOF. */
554
- if ( len < space ) {
527
+ if (io && len < space) {
555
528
  p[len++] = 0;
556
529
  done = 1;
557
530
  }
558
531
 
559
532
  pe = p + len;
560
533
  %% write exec;
561
-
562
- if ( cs == hpricot_scan_error ) {
563
- free(buf);
564
- if ( !NIL_P(tag) )
534
+
535
+ if (cs == hpricot_scan_error) {
536
+ if (buf != NULL)
537
+ free(buf);
538
+ if (!NIL_P(tag))
565
539
  {
566
540
  rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
567
541
  }
@@ -570,8 +544,8 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
570
544
  rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
571
545
  }
572
546
  }
573
-
574
- if ( done && ele_open )
547
+
548
+ if (done && ele_open)
575
549
  {
576
550
  ele_open = 0;
577
551
  if (ts > 0) {
@@ -581,11 +555,11 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
581
555
  }
582
556
  }
583
557
 
584
- if ( ts == 0 )
558
+ if (ts == 0)
585
559
  {
586
560
  have = 0;
587
561
  /* text nodes have no ts because each byte is parsed alone */
588
- if ( mark_tag != NULL && text == 1 )
562
+ if (mark_tag != NULL && text == 1)
589
563
  {
590
564
  if (done)
591
565
  {
@@ -600,12 +574,15 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
600
574
  CAT(tag, p);
601
575
  }
602
576
  }
603
- mark_tag = buf;
577
+ if (io)
578
+ mark_tag = buf;
579
+ else
580
+ mark_tag = RSTRING_PTR(port);
604
581
  }
605
- else
582
+ else if (io)
606
583
  {
607
584
  have = pe - ts;
608
- memmove( buf, ts, have );
585
+ memmove(buf, ts, have);
609
586
  SLIDE(tag);
610
587
  SLIDE(akey);
611
588
  SLIDE(aval);
@@ -613,7 +590,9 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
613
590
  ts = buf;
614
591
  }
615
592
  }
616
- free(buf);
593
+
594
+ if (buf != NULL)
595
+ free(buf);
617
596
 
618
597
  if (S != NULL)
619
598
  {
@@ -626,66 +605,103 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
626
605
  return Qnil;
627
606
  }
628
607
 
629
- void Init_hpricot_scan()
608
+ static VALUE
609
+ alloc_hpricot_struct(VALUE klass)
630
610
  {
631
- mHpricot = rb_define_module("Hpricot");
632
- rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
633
- rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
634
- rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
635
- rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
611
+ VALUE size;
612
+ long n;
613
+ NEWOBJ(st, struct RStruct);
614
+ OBJSETUP(st, klass, T_STRUCT);
615
+
616
+ size = rb_struct_iv_get(klass, "__size__");
617
+ n = FIX2LONG(size);
618
+
619
+ #ifndef RSTRUCT_EMBED_LEN_MAX
620
+ st->ptr = ALLOC_N(VALUE, n);
621
+ rb_mem_clear(st->ptr, n);
622
+ st->len = n;
623
+ #else
624
+ if (0 < n && n <= RSTRUCT_EMBED_LEN_MAX) {
625
+ RBASIC(st)->flags &= ~RSTRUCT_EMBED_LEN_MASK;
626
+ RBASIC(st)->flags |= n << RSTRUCT_EMBED_LEN_SHIFT;
627
+ rb_mem_clear(st->as.ary, n);
628
+ } else {
629
+ st->as.heap.ptr = ALLOC_N(VALUE, n);
630
+ rb_mem_clear(st->as.heap.ptr, n);
631
+ st->as.heap.len = n;
632
+ }
633
+ #endif
636
634
 
637
- cDoc = rb_define_class_under(mHpricot, "Doc", rb_cObject);
638
- rb_define_alloc_func(cDoc, hpricot_ele_alloc);
639
- rb_define_method(cDoc, "children", hpricot_ele_get_children, 0);
640
- rb_define_method(cDoc, "children=", hpricot_ele_set_children, 1);
641
-
642
- cBaseEle = rb_define_class_under(mHpricot, "BaseEle", rb_cObject);
643
- rb_define_alloc_func(cBaseEle, hpricot_ele_alloc);
644
- rb_define_method(cBaseEle, "raw_string", hpricot_ele_get_raw, 0);
645
- rb_define_method(cBaseEle, "clear_raw", hpricot_ele_clear_raw, 0);
646
- rb_define_method(cBaseEle, "parent", hpricot_ele_get_parent, 0);
647
- rb_define_method(cBaseEle, "parent=", hpricot_ele_set_parent, 1);
648
- cCData = rb_define_class_under(mHpricot, "CData", cBaseEle);
649
- rb_define_method(cCData, "content", hpricot_ele_get_tag, 0);
650
- rb_define_method(cCData, "content=", hpricot_ele_set_tag, 1);
651
- cComment = rb_define_class_under(mHpricot, "Comment", cBaseEle);
652
- rb_define_method(cComment, "content", hpricot_ele_get_tag, 0);
653
- rb_define_method(cComment, "content=", hpricot_ele_set_tag, 1);
654
- cDocType = rb_define_class_under(mHpricot, "DocType", cBaseEle);
655
- rb_define_method(cDocType, "target", hpricot_ele_get_tag, 0);
656
- rb_define_method(cDocType, "target=", hpricot_ele_set_tag, 1);
657
- rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
658
- rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
659
- rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
660
- rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
661
- cElem = rb_define_class_under(mHpricot, "Elem", cBaseEle);
662
- rb_define_method(cElem, "raw_attributes", hpricot_ele_get_attr, 0);
663
- rb_define_method(cElem, "raw_attributes=", hpricot_ele_set_attr, 1);
664
- rb_define_method(cElem, "children", hpricot_ele_get_children, 0);
665
- rb_define_method(cElem, "children=", hpricot_ele_set_children, 1);
666
- rb_define_method(cElem, "etag", hpricot_ele_get_etag, 0);
667
- rb_define_method(cElem, "etag=", hpricot_ele_set_etag, 1);
668
- rb_define_method(cElem, "name", hpricot_ele_get_tag, 0);
669
- rb_define_method(cElem, "name=", hpricot_ele_set_tag, 1);
670
- cETag = rb_define_class_under(mHpricot, "ETag", cBaseEle);
671
- rb_define_method(cETag, "name", hpricot_ele_get_tag, 0);
672
- rb_define_method(cETag, "name=", hpricot_ele_set_tag, 1);
673
- cBogusETag = rb_define_class_under(mHpricot, "BogusETag", cETag);
674
- cText = rb_define_class_under(mHpricot, "Text", cBaseEle);
675
- rb_define_method(cText, "content", hpricot_ele_get_tag, 0);
676
- rb_define_method(cText, "content=", hpricot_ele_set_tag, 1);
677
- cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", cBaseEle);
678
- rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
679
- rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
680
- rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
681
- rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
682
- rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
683
- rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
684
- cProcIns = rb_define_class_under(mHpricot, "ProcIns", cBaseEle);
685
- rb_define_method(cProcIns, "target", hpricot_ele_get_tag, 0);
686
- rb_define_method(cProcIns, "target=", hpricot_ele_set_tag, 1);
687
- rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
688
- rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
635
+ return (VALUE)st;
636
+ }
637
+
638
+ static VALUE hpricot_struct_ref0(VALUE obj) {return H_ELE_GET(obj, 0);}
639
+ static VALUE hpricot_struct_ref1(VALUE obj) {return H_ELE_GET(obj, 1);}
640
+ static VALUE hpricot_struct_ref2(VALUE obj) {return H_ELE_GET(obj, 2);}
641
+ static VALUE hpricot_struct_ref3(VALUE obj) {return H_ELE_GET(obj, 3);}
642
+ static VALUE hpricot_struct_ref4(VALUE obj) {return H_ELE_GET(obj, 4);}
643
+ static VALUE hpricot_struct_ref5(VALUE obj) {return H_ELE_GET(obj, 5);}
644
+ static VALUE hpricot_struct_ref6(VALUE obj) {return H_ELE_GET(obj, 6);}
645
+ static VALUE hpricot_struct_ref7(VALUE obj) {return H_ELE_GET(obj, 7);}
646
+ static VALUE hpricot_struct_ref8(VALUE obj) {return H_ELE_GET(obj, 8);}
647
+ static VALUE hpricot_struct_ref9(VALUE obj) {return H_ELE_GET(obj, 9);}
648
+
649
+ static VALUE (*ref_func[10])() = {
650
+ hpricot_struct_ref0,
651
+ hpricot_struct_ref1,
652
+ hpricot_struct_ref2,
653
+ hpricot_struct_ref3,
654
+ hpricot_struct_ref4,
655
+ hpricot_struct_ref5,
656
+ hpricot_struct_ref6,
657
+ hpricot_struct_ref7,
658
+ hpricot_struct_ref8,
659
+ hpricot_struct_ref9,
660
+ };
661
+
662
+ static VALUE hpricot_struct_set0(VALUE obj, VALUE val) {return H_ELE_SET(obj, 0, val);}
663
+ static VALUE hpricot_struct_set1(VALUE obj, VALUE val) {return H_ELE_SET(obj, 1, val);}
664
+ static VALUE hpricot_struct_set2(VALUE obj, VALUE val) {return H_ELE_SET(obj, 2, val);}
665
+ static VALUE hpricot_struct_set3(VALUE obj, VALUE val) {return H_ELE_SET(obj, 3, val);}
666
+ static VALUE hpricot_struct_set4(VALUE obj, VALUE val) {return H_ELE_SET(obj, 4, val);}
667
+ static VALUE hpricot_struct_set5(VALUE obj, VALUE val) {return H_ELE_SET(obj, 5, val);}
668
+ static VALUE hpricot_struct_set6(VALUE obj, VALUE val) {return H_ELE_SET(obj, 6, val);}
669
+ static VALUE hpricot_struct_set7(VALUE obj, VALUE val) {return H_ELE_SET(obj, 7, val);}
670
+ static VALUE hpricot_struct_set8(VALUE obj, VALUE val) {return H_ELE_SET(obj, 8, val);}
671
+ static VALUE hpricot_struct_set9(VALUE obj, VALUE val) {return H_ELE_SET(obj, 9, val);}
672
+
673
+ static VALUE (*set_func[10])() = {
674
+ hpricot_struct_set0,
675
+ hpricot_struct_set1,
676
+ hpricot_struct_set2,
677
+ hpricot_struct_set3,
678
+ hpricot_struct_set4,
679
+ hpricot_struct_set5,
680
+ hpricot_struct_set6,
681
+ hpricot_struct_set7,
682
+ hpricot_struct_set8,
683
+ hpricot_struct_set9,
684
+ };
685
+
686
+ static VALUE
687
+ make_hpricot_struct(VALUE members)
688
+ {
689
+ int i = 0;
690
+ VALUE klass = rb_class_new(rb_cObject);
691
+ rb_iv_set(klass, "__size__", INT2NUM(RARRAY_LEN(members)));
692
+ rb_define_alloc_func(klass, alloc_hpricot_struct);
693
+ rb_define_singleton_method(klass, "new", rb_class_new_instance, -1);
694
+ for (i = 0; i < RARRAY_LEN(members); i++) {
695
+ ID id = SYM2ID(RARRAY_PTR(members)[i]);
696
+ rb_define_method_id(klass, id, ref_func[i], 0);
697
+ rb_define_method_id(klass, rb_id_attrset(id), set_func[i], 1);
698
+ }
699
+ return klass;
700
+ }
701
+
702
+ void Init_hpricot_scan()
703
+ {
704
+ VALUE structElem, structAttr, structBasic;
689
705
 
690
706
  s_ElementContent = rb_intern("ElementContent");
691
707
  symAllow = ID2SYM(rb_intern("allow"));
@@ -695,19 +711,78 @@ void Init_hpricot_scan()
695
711
  s_parent = rb_intern("parent");
696
712
  s_read = rb_intern("read");
697
713
  s_to_str = rb_intern("to_str");
698
- iv_parent = rb_intern("parent");
699
714
  sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
700
715
  sym_doctype = ID2SYM(rb_intern("doctype"));
701
716
  sym_procins = ID2SYM(rb_intern("procins"));
702
717
  sym_stag = ID2SYM(rb_intern("stag"));
703
718
  sym_etag = ID2SYM(rb_intern("etag"));
704
719
  sym_emptytag = ID2SYM(rb_intern("emptytag"));
720
+ sym_allowed = ID2SYM(rb_intern("allowed"));
721
+ sym_children = ID2SYM(rb_intern("children"));
705
722
  sym_comment = ID2SYM(rb_intern("comment"));
706
723
  sym_cdata = ID2SYM(rb_intern("cdata"));
724
+ sym_name = ID2SYM(rb_intern("name"));
725
+ sym_parent = ID2SYM(rb_intern("parent"));
726
+ sym_raw_attributes = ID2SYM(rb_intern("raw_attributes"));
727
+ sym_raw_string = ID2SYM(rb_intern("raw_string"));
728
+ sym_tagno = ID2SYM(rb_intern("tagno"));
707
729
  sym_text = ID2SYM(rb_intern("text"));
708
730
  sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
709
731
  sym_CDATA = ID2SYM(rb_intern("CDATA"));
710
732
 
733
+ mHpricot = rb_define_module("Hpricot");
734
+ rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
735
+ rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
736
+ rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
737
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
738
+
739
+ structElem = make_hpricot_struct(rb_ary_new3(8, sym_name, sym_parent,
740
+ sym_raw_attributes, sym_etag, sym_raw_string, sym_allowed,
741
+ sym_tagno, sym_children));
742
+ structAttr = make_hpricot_struct(rb_ary_new3(3, sym_name, sym_parent, sym_raw_attributes));
743
+ structBasic = make_hpricot_struct(rb_ary_new3(2, sym_name, sym_parent));
744
+
745
+ cDoc = rb_define_class_under(mHpricot, "Doc", structElem);
746
+ cCData = rb_define_class_under(mHpricot, "CData", structBasic);
747
+ rb_define_method(cCData, "content", hpricot_ele_get_name, 0);
748
+ rb_define_method(cCData, "content=", hpricot_ele_set_name, 1);
749
+ cComment = rb_define_class_under(mHpricot, "Comment", structBasic);
750
+ rb_define_method(cComment, "content", hpricot_ele_get_name, 0);
751
+ rb_define_method(cComment, "content=", hpricot_ele_set_name, 1);
752
+ cDocType = rb_define_class_under(mHpricot, "DocType", structAttr);
753
+ rb_define_method(cDocType, "raw_string", hpricot_ele_get_name, 0);
754
+ rb_define_method(cDocType, "clear_raw", hpricot_ele_clear_name, 0);
755
+ rb_define_method(cDocType, "target", hpricot_ele_get_target, 0);
756
+ rb_define_method(cDocType, "target=", hpricot_ele_set_target, 1);
757
+ rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
758
+ rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
759
+ rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
760
+ rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
761
+ cElem = rb_define_class_under(mHpricot, "Elem", structElem);
762
+ rb_define_method(cElem, "clear_raw", hpricot_ele_clear_raw, 0);
763
+ cBogusETag = rb_define_class_under(mHpricot, "BogusETag", structAttr);
764
+ rb_define_method(cBogusETag, "raw_string", hpricot_ele_get_attr, 0);
765
+ rb_define_method(cBogusETag, "clear_raw", hpricot_ele_clear_attr, 0);
766
+ cText = rb_define_class_under(mHpricot, "Text", structBasic);
767
+ rb_define_method(cText, "raw_string", hpricot_ele_get_name, 0);
768
+ rb_define_method(cText, "clear_raw", hpricot_ele_clear_name, 0);
769
+ rb_define_method(cText, "content", hpricot_ele_get_name, 0);
770
+ rb_define_method(cText, "content=", hpricot_ele_set_name, 1);
771
+ cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", structAttr);
772
+ rb_define_method(cXMLDecl, "raw_string", hpricot_ele_get_name, 0);
773
+ rb_define_method(cXMLDecl, "clear_raw", hpricot_ele_clear_name, 0);
774
+ rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
775
+ rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
776
+ rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
777
+ rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
778
+ rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
779
+ rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
780
+ cProcIns = rb_define_class_under(mHpricot, "ProcIns", structAttr);
781
+ rb_define_method(cProcIns, "target", hpricot_ele_get_name, 0);
782
+ rb_define_method(cProcIns, "target=", hpricot_ele_set_name, 1);
783
+ rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
784
+ rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
785
+
711
786
  rb_const_set(mHpricot, rb_intern("ProcInsParse"),
712
787
  reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
713
788
  }