adamh-hpricot 0.6.211 → 0.7.229
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +13 -0
- data/Rakefile +3 -3
- data/ext/fast_xs/fast_xs.c +1 -0
- data/ext/hpricot_scan/hpricot_css.c +107 -103
- data/ext/hpricot_scan/hpricot_scan.c +1159 -923
- data/ext/hpricot_scan/hpricot_scan.rl +312 -237
- data/lib/hpricot/elements.rb +1 -1
- data/lib/hpricot/inspect.rb +2 -2
- data/lib/hpricot/modules.rb +2 -0
- data/lib/hpricot/tag.rb +43 -22
- data/lib/hpricot/traverse.rb +1 -0
- data/test/test_alter.rb +20 -2
- data/test/test_parser.rb +19 -0
- data/test/test_preserved.rb +9 -0
- metadata +2 -2
@@ -19,20 +19,26 @@ VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
|
|
19
19
|
#define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
|
20
20
|
|
21
21
|
static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
|
22
|
-
sym_cdata,
|
22
|
+
sym_cdata, sym_name, sym_parent, sym_raw_attributes, sym_raw_string, sym_tagno,
|
23
|
+
sym_allowed, sym_text, sym_children, sym_EMPTY, sym_CDATA;
|
23
24
|
static VALUE mHpricot, rb_eHpricotParseError;
|
24
|
-
static VALUE
|
25
|
+
static VALUE cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cText,
|
25
26
|
cXMLDecl, cProcIns, symAllow, symDeny;
|
26
27
|
static ID s_ElementContent;
|
27
28
|
static ID s_downcase, s_new, s_parent, s_read, s_to_str;
|
28
|
-
static ID iv_parent;
|
29
29
|
static VALUE reProcInsParse;
|
30
30
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
31
|
+
#define H_ELE_TAG 0
|
32
|
+
#define H_ELE_PARENT 1
|
33
|
+
#define H_ELE_ATTR 2
|
34
|
+
#define H_ELE_ETAG 3
|
35
|
+
#define H_ELE_RAW 4
|
36
|
+
#define H_ELE_EC 5
|
37
|
+
#define H_ELE_HASH 6
|
38
|
+
#define H_ELE_CHILDREN 7
|
39
|
+
|
40
|
+
#define H_ELE_GET(ele, idx) RSTRUCT_PTR(ele)[idx]
|
41
|
+
#define H_ELE_SET(ele, idx, val) RSTRUCT_PTR(ele)[idx] = val
|
36
42
|
|
37
43
|
#define OPT(opts, key) (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("" # key)))))
|
38
44
|
|
@@ -60,7 +66,7 @@ typedef struct {
|
|
60
66
|
|
61
67
|
#define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
|
62
68
|
|
63
|
-
#define SLIDE(N) if (
|
69
|
+
#define SLIDE(N) if (mark_##N > ts) mark_##N = buf + (mark_##N - ts);
|
64
70
|
|
65
71
|
#define ATTR(K, V) \
|
66
72
|
if (!NIL_P(K)) { \
|
@@ -107,7 +113,7 @@ typedef struct {
|
|
107
113
|
action tag { SET(tag, p); }
|
108
114
|
action tagc { SET(tag, p-1); }
|
109
115
|
action aval { SET(aval, p); }
|
110
|
-
action aunq {
|
116
|
+
action aunq {
|
111
117
|
if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
|
112
118
|
else { SET(aval, p); }
|
113
119
|
}
|
@@ -118,14 +124,16 @@ typedef struct {
|
|
118
124
|
action pubid { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
|
119
125
|
action sysid { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }
|
120
126
|
|
121
|
-
action new_attr {
|
127
|
+
action new_attr {
|
122
128
|
akey = Qnil;
|
123
129
|
aval = Qnil;
|
124
130
|
mark_akey = NULL;
|
125
131
|
mark_aval = NULL;
|
126
132
|
}
|
127
133
|
|
128
|
-
action save_attr {
|
134
|
+
action save_attr {
|
135
|
+
if (!S->xml)
|
136
|
+
akey = rb_funcall(akey, s_downcase, 0);
|
129
137
|
ATTR(akey, aval);
|
130
138
|
}
|
131
139
|
|
@@ -144,7 +152,7 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
|
|
144
152
|
raw = tag;
|
145
153
|
}
|
146
154
|
ary = rb_ary_new3(4, sym, tag, attr, raw);
|
147
|
-
if (taint) {
|
155
|
+
if (taint) {
|
148
156
|
OBJ_TAINT(ary);
|
149
157
|
OBJ_TAINT(tag);
|
150
158
|
OBJ_TAINT(attr);
|
@@ -153,6 +161,7 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
|
|
153
161
|
rb_yield(ary);
|
154
162
|
}
|
155
163
|
|
164
|
+
#ifndef RHASH_TBL
|
156
165
|
/* rb_hash_lookup() is only in Ruby 1.8.7 */
|
157
166
|
static VALUE
|
158
167
|
our_rb_hash_lookup(VALUE hash, VALUE key)
|
@@ -165,17 +174,17 @@ our_rb_hash_lookup(VALUE hash, VALUE key)
|
|
165
174
|
|
166
175
|
return val;
|
167
176
|
}
|
177
|
+
#define rb_hash_lookup our_rb_hash_lookup
|
178
|
+
#endif
|
168
179
|
|
169
180
|
static void
|
170
181
|
rb_hpricot_add(VALUE focus, VALUE ele)
|
171
182
|
{
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
rb_ary_push(he->children, ele);
|
178
|
-
he2->parent = focus;
|
183
|
+
VALUE children = H_ELE_GET(focus, H_ELE_CHILDREN);
|
184
|
+
if (NIL_P(children))
|
185
|
+
H_ELE_SET(focus, H_ELE_CHILDREN, (children = rb_ary_new2(1)));
|
186
|
+
rb_ary_push(children, ele);
|
187
|
+
H_ELE_SET(ele, H_ELE_PARENT, focus);
|
179
188
|
}
|
180
189
|
|
181
190
|
typedef struct {
|
@@ -186,102 +195,70 @@ typedef struct {
|
|
186
195
|
unsigned char xml, strict, fixup;
|
187
196
|
} hpricot_state;
|
188
197
|
|
189
|
-
|
190
|
-
hpricot_ele_mark(hpricot_ele *he)
|
191
|
-
{
|
192
|
-
rb_gc_mark(he->tag);
|
193
|
-
rb_gc_mark(he->attr);
|
194
|
-
rb_gc_mark(he->etag);
|
195
|
-
rb_gc_mark(he->raw);
|
196
|
-
rb_gc_mark(he->parent);
|
197
|
-
rb_gc_mark(he->children);
|
198
|
-
}
|
199
|
-
|
200
|
-
static void
|
201
|
-
hpricot_ele_free(hpricot_ele *he)
|
202
|
-
{
|
203
|
-
free(he);
|
204
|
-
}
|
205
|
-
|
206
|
-
#define H_PROP(prop) \
|
198
|
+
#define H_PROP(prop, idx) \
|
207
199
|
static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
|
208
|
-
|
209
|
-
Data_Get_Struct(self, hpricot_ele, he); \
|
210
|
-
he->prop = x; \
|
200
|
+
H_ELE_SET(self, idx, x); \
|
211
201
|
return self; \
|
212
202
|
} \
|
203
|
+
static VALUE hpricot_ele_clear_##prop(VALUE self) { \
|
204
|
+
H_ELE_SET(self, idx, Qnil); \
|
205
|
+
return Qtrue; \
|
206
|
+
} \
|
213
207
|
static VALUE hpricot_ele_get_##prop(VALUE self) { \
|
214
|
-
|
215
|
-
Data_Get_Struct(self, hpricot_ele, he); \
|
216
|
-
return he->prop; \
|
208
|
+
return H_ELE_GET(self, idx); \
|
217
209
|
}
|
218
210
|
|
219
211
|
#define H_ATTR(prop) \
|
220
212
|
static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
|
221
|
-
|
222
|
-
Data_Get_Struct(self, hpricot_ele, he); \
|
223
|
-
rb_hash_aset(he->attr, ID2SYM(rb_intern("" # prop)), x); \
|
213
|
+
rb_hash_aset(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop)), x); \
|
224
214
|
return self; \
|
225
215
|
} \
|
226
216
|
static VALUE hpricot_ele_get_##prop(VALUE self) { \
|
227
|
-
|
228
|
-
Data_Get_Struct(self, hpricot_ele, he); \
|
229
|
-
return rb_hash_aref(he->attr, ID2SYM(rb_intern("" # prop))); \
|
217
|
+
return rb_hash_aref(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop))); \
|
230
218
|
}
|
231
219
|
|
232
|
-
H_PROP(
|
233
|
-
H_PROP(
|
234
|
-
H_PROP(
|
235
|
-
H_PROP(
|
236
|
-
H_PROP(
|
220
|
+
H_PROP(name, H_ELE_TAG);
|
221
|
+
H_PROP(raw, H_ELE_RAW);
|
222
|
+
H_PROP(parent, H_ELE_PARENT);
|
223
|
+
H_PROP(attr, H_ELE_ATTR);
|
224
|
+
H_PROP(etag, H_ELE_ETAG);
|
225
|
+
H_PROP(children, H_ELE_CHILDREN);
|
226
|
+
H_ATTR(target);
|
237
227
|
H_ATTR(encoding);
|
238
228
|
H_ATTR(version);
|
239
229
|
H_ATTR(standalone);
|
240
230
|
H_ATTR(system_id);
|
241
231
|
H_ATTR(public_id);
|
242
232
|
|
243
|
-
static VALUE
|
244
|
-
hpricot_ele_get_raw(VALUE self, VALUE x) {
|
245
|
-
hpricot_ele *he;
|
246
|
-
Data_Get_Struct(self, hpricot_ele, he);
|
247
|
-
return he->raw;
|
248
|
-
}
|
249
|
-
|
250
|
-
static VALUE
|
251
|
-
hpricot_ele_clear_raw(VALUE self)
|
252
|
-
{
|
253
|
-
hpricot_ele *he;
|
254
|
-
Data_Get_Struct(self, hpricot_ele, he);
|
255
|
-
he->raw = Qnil;
|
256
|
-
return Qtrue;
|
257
|
-
}
|
258
|
-
|
259
233
|
#define H_ELE(klass) \
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
234
|
+
ele = rb_obj_alloc(klass); \
|
235
|
+
if (klass == cElem) { \
|
236
|
+
H_ELE_SET(ele, H_ELE_TAG, tag); \
|
237
|
+
H_ELE_SET(ele, H_ELE_ATTR, attr); \
|
238
|
+
H_ELE_SET(ele, H_ELE_EC, ec); \
|
239
|
+
if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_doctype)) { \
|
240
|
+
H_ELE_SET(ele, H_ELE_RAW, rb_str_new(raw, rawlen)); \
|
241
|
+
} \
|
242
|
+
} else if (klass == cDocType || klass == cProcIns || klass == cXMLDecl || klass == cBogusETag) { \
|
243
|
+
if (klass == cBogusETag) { \
|
244
|
+
H_ELE_SET(ele, H_ELE_TAG, tag); \
|
245
|
+
if (raw != NULL) \
|
246
|
+
H_ELE_SET(ele, H_ELE_ATTR, rb_str_new(raw, rawlen)); \
|
247
|
+
} else { \
|
248
|
+
if (klass == cDocType) \
|
249
|
+
ATTR(ID2SYM(rb_intern("target")), tag); \
|
250
|
+
H_ELE_SET(ele, H_ELE_ATTR, attr); \
|
251
|
+
if (klass != cProcIns) { \
|
252
|
+
tag = Qnil; \
|
253
|
+
if (raw != NULL) tag = rb_str_new(raw, rawlen); \
|
254
|
+
} \
|
255
|
+
H_ELE_SET(ele, H_ELE_TAG, tag); \
|
256
|
+
} \
|
257
|
+
} else { \
|
258
|
+
H_ELE_SET(ele, H_ELE_TAG, tag); \
|
269
259
|
} \
|
270
|
-
ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he); \
|
271
260
|
S->last = ele
|
272
261
|
|
273
|
-
VALUE
|
274
|
-
hpricot_ele_alloc(VALUE klass)
|
275
|
-
{
|
276
|
-
VALUE ele;
|
277
|
-
hpricot_ele *he = ALLOC(hpricot_ele);
|
278
|
-
he->name = 0;
|
279
|
-
he->tag = he->attr = he->raw = he->EC = Qnil;
|
280
|
-
he->etag = he->parent = he->children = Qnil;
|
281
|
-
ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he);
|
282
|
-
return ele;
|
283
|
-
}
|
284
|
-
|
285
262
|
//
|
286
263
|
// the swift, compact parser logic. most of the complicated stuff is done
|
287
264
|
// in the lexer. this step just pairs up the start and end tags.
|
@@ -295,22 +272,23 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
295
272
|
// in html mode, fix up start tags incorrectly formed as empty tags
|
296
273
|
//
|
297
274
|
if (!S->xml) {
|
298
|
-
hpricot_ele *last;
|
299
|
-
Data_Get_Struct(S->focus, hpricot_ele, last);
|
300
|
-
if (last->EC == sym_CDATA &&
|
301
|
-
(sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
|
302
|
-
!(sym == sym_etag && rb_str_hash(tag) == last->name))
|
303
|
-
{
|
304
|
-
sym = sym_text;
|
305
|
-
tag = rb_str_new(raw, rawlen);
|
306
|
-
}
|
307
|
-
|
308
275
|
if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
|
309
276
|
ec = rb_hash_aref(S->EC, tag);
|
310
277
|
if (NIL_P(ec)) {
|
311
278
|
tag = rb_funcall(tag, s_downcase, 0);
|
312
279
|
ec = rb_hash_aref(S->EC, tag);
|
313
280
|
}
|
281
|
+
}
|
282
|
+
|
283
|
+
if (H_ELE_GET(S->focus, H_ELE_EC) == sym_CDATA &&
|
284
|
+
(sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
|
285
|
+
!(sym == sym_etag && INT2FIX(rb_str_hash(tag)) == H_ELE_GET(S->focus, H_ELE_HASH)))
|
286
|
+
{
|
287
|
+
sym = sym_text;
|
288
|
+
tag = rb_str_new(raw, rawlen);
|
289
|
+
}
|
290
|
+
|
291
|
+
if (!NIL_P(ec)) {
|
314
292
|
if (sym == sym_emptytag) {
|
315
293
|
if (ec != sym_EMPTY)
|
316
294
|
sym = sym_stag;
|
@@ -322,19 +300,19 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
322
300
|
}
|
323
301
|
|
324
302
|
if (sym == sym_emptytag || sym == sym_stag) {
|
303
|
+
VALUE name = INT2FIX(rb_str_hash(tag));
|
325
304
|
H_ELE(cElem);
|
326
|
-
|
305
|
+
H_ELE_SET(ele, H_ELE_HASH, name);
|
327
306
|
|
328
307
|
if (!S->xml) {
|
329
308
|
VALUE match = Qnil, e = S->focus;
|
330
309
|
while (e != S->doc)
|
331
310
|
{
|
332
|
-
|
333
|
-
Data_Get_Struct(e, hpricot_ele, hee);
|
311
|
+
VALUE hEC = H_ELE_GET(e, H_ELE_EC);
|
334
312
|
|
335
|
-
if (TYPE(
|
313
|
+
if (TYPE(hEC) == T_HASH)
|
336
314
|
{
|
337
|
-
VALUE has =
|
315
|
+
VALUE has = rb_hash_lookup(hEC, name);
|
338
316
|
if (has != Qnil) {
|
339
317
|
if (has == Qtrue) {
|
340
318
|
if (match == Qnil)
|
@@ -347,7 +325,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
347
325
|
}
|
348
326
|
}
|
349
327
|
|
350
|
-
e =
|
328
|
+
e = H_ELE_GET(e, H_ELE_PARENT);
|
351
329
|
}
|
352
330
|
|
353
331
|
if (match == Qnil)
|
@@ -369,8 +347,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
369
347
|
}
|
370
348
|
}
|
371
349
|
} else if (sym == sym_etag) {
|
372
|
-
|
373
|
-
VALUE match = Qnil, e = S->focus;
|
350
|
+
VALUE name, match = Qnil, e = S->focus;
|
374
351
|
if (S->strict) {
|
375
352
|
if (NIL_P(rb_hash_aref(S->EC, tag))) {
|
376
353
|
tag = rb_str_new2("div");
|
@@ -383,19 +360,16 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
383
360
|
//
|
384
361
|
// (see also: the search above for fixups)
|
385
362
|
//
|
386
|
-
name = rb_str_hash(tag);
|
363
|
+
name = INT2FIX(rb_str_hash(tag));
|
387
364
|
while (e != S->doc)
|
388
365
|
{
|
389
|
-
|
390
|
-
Data_Get_Struct(e, hpricot_ele, he);
|
391
|
-
|
392
|
-
if (he->name == name)
|
366
|
+
if (H_ELE_GET(e, H_ELE_HASH) == name)
|
393
367
|
{
|
394
368
|
match = e;
|
395
369
|
break;
|
396
370
|
}
|
397
371
|
|
398
|
-
e =
|
372
|
+
e = H_ELE_GET(e, H_ELE_PARENT);
|
399
373
|
}
|
400
374
|
|
401
375
|
if (NIL_P(match))
|
@@ -405,10 +379,11 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
405
379
|
}
|
406
380
|
else
|
407
381
|
{
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
382
|
+
VALUE ele = Qnil;
|
383
|
+
if (raw != NULL)
|
384
|
+
ele = rb_str_new(raw, rawlen);
|
385
|
+
H_ELE_SET(match, H_ELE_ETAG, ele);
|
386
|
+
S->focus = H_ELE_GET(match, H_ELE_PARENT);
|
412
387
|
S->last = Qnil;
|
413
388
|
}
|
414
389
|
} else if (sym == sym_cdata) {
|
@@ -429,15 +404,13 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
429
404
|
tag = rb_reg_nth_match(1, match);
|
430
405
|
attr = rb_reg_nth_match(2, match);
|
431
406
|
{
|
432
|
-
|
433
|
-
|
407
|
+
H_ELE(cProcIns);
|
408
|
+
rb_hpricot_add(S->focus, ele);
|
434
409
|
}
|
435
410
|
} else if (sym == sym_text) {
|
436
411
|
// TODO: add raw_string as well?
|
437
412
|
if (!NIL_P(S->last) && RBASIC(S->last)->klass == cText) {
|
438
|
-
|
439
|
-
Data_Get_Struct(S->last, hpricot_ele, he);
|
440
|
-
rb_str_append(he->tag, tag);
|
413
|
+
rb_str_append(H_ELE_GET(S->last, H_ELE_TAG), tag);
|
441
414
|
} else {
|
442
415
|
H_ELE(cText);
|
443
416
|
rb_hpricot_add(S->focus, ele);
|
@@ -450,7 +423,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
450
423
|
|
451
424
|
VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
452
425
|
{
|
453
|
-
int cs, act, have = 0, nread = 0, curline = 1, text = 0;
|
426
|
+
int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
|
454
427
|
char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
|
455
428
|
|
456
429
|
hpricot_state *S = NULL;
|
@@ -460,12 +433,13 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
460
433
|
int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
|
461
434
|
|
462
435
|
rb_scan_args(argc, argv, "11", &port, &opts);
|
463
|
-
taint = OBJ_TAINTED(
|
464
|
-
|
436
|
+
taint = OBJ_TAINTED(port);
|
437
|
+
io = rb_respond_to(port, s_read);
|
438
|
+
if (!io)
|
465
439
|
{
|
466
|
-
if (
|
440
|
+
if (rb_respond_to(port, s_to_str))
|
467
441
|
{
|
468
|
-
port = rb_funcall(
|
442
|
+
port = rb_funcall(port, s_to_str, 0);
|
469
443
|
StringValue(port);
|
470
444
|
}
|
471
445
|
else
|
@@ -479,11 +453,8 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
479
453
|
|
480
454
|
if (!rb_block_given_p())
|
481
455
|
{
|
482
|
-
hpricot_ele *he = ALLOC(hpricot_ele);
|
483
456
|
S = ALLOC(hpricot_state);
|
484
|
-
|
485
|
-
he->tag = he->attr = he->etag = he->parent = he->children = Qnil;
|
486
|
-
S->doc = Data_Wrap_Struct(cDoc, hpricot_ele_mark, hpricot_ele_free, he);
|
457
|
+
S->doc = rb_obj_alloc(cDoc);
|
487
458
|
rb_gc_register_address(&S->doc);
|
488
459
|
S->focus = S->doc;
|
489
460
|
S->last = Qnil;
|
@@ -503,65 +474,68 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
503
474
|
buffer_size = NUM2INT(bufsize);
|
504
475
|
}
|
505
476
|
}
|
506
|
-
|
477
|
+
|
478
|
+
if (io)
|
479
|
+
buf = ALLOC_N(char, buffer_size);
|
507
480
|
|
508
481
|
%% write init;
|
509
|
-
|
510
|
-
while (
|
482
|
+
|
483
|
+
while (!done) {
|
511
484
|
VALUE str;
|
512
485
|
char *p, *pe;
|
513
486
|
int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
|
514
487
|
|
515
|
-
if (
|
516
|
-
/* We've used up the entire buffer storing an already-parsed token
|
517
|
-
* prefix that must be preserved. Likely caused by super-long attributes.
|
518
|
-
* Increase buffer size and continue */
|
519
|
-
tokstart_diff = ts - buf;
|
520
|
-
tokend_diff = te - buf;
|
521
|
-
mark_tag_diff = mark_tag - buf;
|
522
|
-
mark_akey_diff = mark_akey - buf;
|
523
|
-
mark_aval_diff = mark_aval - buf;
|
524
|
-
|
525
|
-
buffer_size += BUFSIZE;
|
526
|
-
REALLOC_N(buf, char, buffer_size);
|
527
|
-
|
528
|
-
space = buffer_size - have;
|
529
|
-
|
530
|
-
ts= buf + tokstart_diff;
|
531
|
-
te = buf + tokend_diff;
|
532
|
-
mark_tag = buf + mark_tag_diff;
|
533
|
-
mark_akey = buf + mark_akey_diff;
|
534
|
-
mark_aval = buf + mark_aval_diff;
|
535
|
-
}
|
536
|
-
p = buf + have;
|
537
|
-
|
538
|
-
if ( rb_respond_to( port, s_read ) )
|
488
|
+
if (io)
|
539
489
|
{
|
490
|
+
if (space == 0) {
|
491
|
+
/* We've used up the entire buffer storing an already-parsed token
|
492
|
+
* prefix that must be preserved. Likely caused by super-long attributes.
|
493
|
+
* Increase buffer size and continue */
|
494
|
+
tokstart_diff = ts - buf;
|
495
|
+
tokend_diff = te - buf;
|
496
|
+
mark_tag_diff = mark_tag - buf;
|
497
|
+
mark_akey_diff = mark_akey - buf;
|
498
|
+
mark_aval_diff = mark_aval - buf;
|
499
|
+
|
500
|
+
buffer_size += BUFSIZE;
|
501
|
+
REALLOC_N(buf, char, buffer_size);
|
502
|
+
|
503
|
+
space = buffer_size - have;
|
504
|
+
|
505
|
+
ts = buf + tokstart_diff;
|
506
|
+
te = buf + tokend_diff;
|
507
|
+
mark_tag = buf + mark_tag_diff;
|
508
|
+
mark_akey = buf + mark_akey_diff;
|
509
|
+
mark_aval = buf + mark_aval_diff;
|
510
|
+
}
|
511
|
+
p = buf + have;
|
512
|
+
|
540
513
|
str = rb_funcall(port, s_read, 1, INT2FIX(space));
|
541
514
|
len = RSTRING_LEN(str);
|
542
515
|
memcpy(p, StringValuePtr(str), len);
|
543
516
|
}
|
544
517
|
else
|
545
518
|
{
|
546
|
-
|
547
|
-
|
548
|
-
|
519
|
+
p = RSTRING_PTR(port);
|
520
|
+
len = RSTRING_LEN(port) + 1;
|
521
|
+
done = 1;
|
549
522
|
}
|
550
523
|
|
551
524
|
nread += len;
|
552
525
|
|
553
526
|
/* If this is the last buffer, tack on an EOF. */
|
554
|
-
if ( len < space
|
527
|
+
if (io && len < space) {
|
555
528
|
p[len++] = 0;
|
556
529
|
done = 1;
|
557
530
|
}
|
558
531
|
|
559
532
|
pe = p + len;
|
560
533
|
%% write exec;
|
561
|
-
|
562
|
-
if (
|
563
|
-
|
564
|
-
|
534
|
+
|
535
|
+
if (cs == hpricot_scan_error) {
|
536
|
+
if (buf != NULL)
|
537
|
+
free(buf);
|
538
|
+
if (!NIL_P(tag))
|
565
539
|
{
|
566
540
|
rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
|
567
541
|
}
|
@@ -570,8 +544,8 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
570
544
|
rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
|
571
545
|
}
|
572
546
|
}
|
573
|
-
|
574
|
-
if (
|
547
|
+
|
548
|
+
if (done && ele_open)
|
575
549
|
{
|
576
550
|
ele_open = 0;
|
577
551
|
if (ts > 0) {
|
@@ -581,11 +555,11 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
581
555
|
}
|
582
556
|
}
|
583
557
|
|
584
|
-
if (
|
558
|
+
if (ts == 0)
|
585
559
|
{
|
586
560
|
have = 0;
|
587
561
|
/* text nodes have no ts because each byte is parsed alone */
|
588
|
-
if (
|
562
|
+
if (mark_tag != NULL && text == 1)
|
589
563
|
{
|
590
564
|
if (done)
|
591
565
|
{
|
@@ -600,12 +574,15 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
600
574
|
CAT(tag, p);
|
601
575
|
}
|
602
576
|
}
|
603
|
-
|
577
|
+
if (io)
|
578
|
+
mark_tag = buf;
|
579
|
+
else
|
580
|
+
mark_tag = RSTRING_PTR(port);
|
604
581
|
}
|
605
|
-
else
|
582
|
+
else if (io)
|
606
583
|
{
|
607
584
|
have = pe - ts;
|
608
|
-
memmove(
|
585
|
+
memmove(buf, ts, have);
|
609
586
|
SLIDE(tag);
|
610
587
|
SLIDE(akey);
|
611
588
|
SLIDE(aval);
|
@@ -613,7 +590,9 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
613
590
|
ts = buf;
|
614
591
|
}
|
615
592
|
}
|
616
|
-
|
593
|
+
|
594
|
+
if (buf != NULL)
|
595
|
+
free(buf);
|
617
596
|
|
618
597
|
if (S != NULL)
|
619
598
|
{
|
@@ -626,66 +605,103 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
626
605
|
return Qnil;
|
627
606
|
}
|
628
607
|
|
629
|
-
|
608
|
+
static VALUE
|
609
|
+
alloc_hpricot_struct(VALUE klass)
|
630
610
|
{
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
611
|
+
VALUE size;
|
612
|
+
long n;
|
613
|
+
NEWOBJ(st, struct RStruct);
|
614
|
+
OBJSETUP(st, klass, T_STRUCT);
|
615
|
+
|
616
|
+
size = rb_struct_iv_get(klass, "__size__");
|
617
|
+
n = FIX2LONG(size);
|
618
|
+
|
619
|
+
#ifndef RSTRUCT_EMBED_LEN_MAX
|
620
|
+
st->ptr = ALLOC_N(VALUE, n);
|
621
|
+
rb_mem_clear(st->ptr, n);
|
622
|
+
st->len = n;
|
623
|
+
#else
|
624
|
+
if (0 < n && n <= RSTRUCT_EMBED_LEN_MAX) {
|
625
|
+
RBASIC(st)->flags &= ~RSTRUCT_EMBED_LEN_MASK;
|
626
|
+
RBASIC(st)->flags |= n << RSTRUCT_EMBED_LEN_SHIFT;
|
627
|
+
rb_mem_clear(st->as.ary, n);
|
628
|
+
} else {
|
629
|
+
st->as.heap.ptr = ALLOC_N(VALUE, n);
|
630
|
+
rb_mem_clear(st->as.heap.ptr, n);
|
631
|
+
st->as.heap.len = n;
|
632
|
+
}
|
633
|
+
#endif
|
636
634
|
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
635
|
+
return (VALUE)st;
|
636
|
+
}
|
637
|
+
|
638
|
+
static VALUE hpricot_struct_ref0(VALUE obj) {return H_ELE_GET(obj, 0);}
|
639
|
+
static VALUE hpricot_struct_ref1(VALUE obj) {return H_ELE_GET(obj, 1);}
|
640
|
+
static VALUE hpricot_struct_ref2(VALUE obj) {return H_ELE_GET(obj, 2);}
|
641
|
+
static VALUE hpricot_struct_ref3(VALUE obj) {return H_ELE_GET(obj, 3);}
|
642
|
+
static VALUE hpricot_struct_ref4(VALUE obj) {return H_ELE_GET(obj, 4);}
|
643
|
+
static VALUE hpricot_struct_ref5(VALUE obj) {return H_ELE_GET(obj, 5);}
|
644
|
+
static VALUE hpricot_struct_ref6(VALUE obj) {return H_ELE_GET(obj, 6);}
|
645
|
+
static VALUE hpricot_struct_ref7(VALUE obj) {return H_ELE_GET(obj, 7);}
|
646
|
+
static VALUE hpricot_struct_ref8(VALUE obj) {return H_ELE_GET(obj, 8);}
|
647
|
+
static VALUE hpricot_struct_ref9(VALUE obj) {return H_ELE_GET(obj, 9);}
|
648
|
+
|
649
|
+
static VALUE (*ref_func[10])() = {
|
650
|
+
hpricot_struct_ref0,
|
651
|
+
hpricot_struct_ref1,
|
652
|
+
hpricot_struct_ref2,
|
653
|
+
hpricot_struct_ref3,
|
654
|
+
hpricot_struct_ref4,
|
655
|
+
hpricot_struct_ref5,
|
656
|
+
hpricot_struct_ref6,
|
657
|
+
hpricot_struct_ref7,
|
658
|
+
hpricot_struct_ref8,
|
659
|
+
hpricot_struct_ref9,
|
660
|
+
};
|
661
|
+
|
662
|
+
static VALUE hpricot_struct_set0(VALUE obj, VALUE val) {return H_ELE_SET(obj, 0, val);}
|
663
|
+
static VALUE hpricot_struct_set1(VALUE obj, VALUE val) {return H_ELE_SET(obj, 1, val);}
|
664
|
+
static VALUE hpricot_struct_set2(VALUE obj, VALUE val) {return H_ELE_SET(obj, 2, val);}
|
665
|
+
static VALUE hpricot_struct_set3(VALUE obj, VALUE val) {return H_ELE_SET(obj, 3, val);}
|
666
|
+
static VALUE hpricot_struct_set4(VALUE obj, VALUE val) {return H_ELE_SET(obj, 4, val);}
|
667
|
+
static VALUE hpricot_struct_set5(VALUE obj, VALUE val) {return H_ELE_SET(obj, 5, val);}
|
668
|
+
static VALUE hpricot_struct_set6(VALUE obj, VALUE val) {return H_ELE_SET(obj, 6, val);}
|
669
|
+
static VALUE hpricot_struct_set7(VALUE obj, VALUE val) {return H_ELE_SET(obj, 7, val);}
|
670
|
+
static VALUE hpricot_struct_set8(VALUE obj, VALUE val) {return H_ELE_SET(obj, 8, val);}
|
671
|
+
static VALUE hpricot_struct_set9(VALUE obj, VALUE val) {return H_ELE_SET(obj, 9, val);}
|
672
|
+
|
673
|
+
static VALUE (*set_func[10])() = {
|
674
|
+
hpricot_struct_set0,
|
675
|
+
hpricot_struct_set1,
|
676
|
+
hpricot_struct_set2,
|
677
|
+
hpricot_struct_set3,
|
678
|
+
hpricot_struct_set4,
|
679
|
+
hpricot_struct_set5,
|
680
|
+
hpricot_struct_set6,
|
681
|
+
hpricot_struct_set7,
|
682
|
+
hpricot_struct_set8,
|
683
|
+
hpricot_struct_set9,
|
684
|
+
};
|
685
|
+
|
686
|
+
static VALUE
|
687
|
+
make_hpricot_struct(VALUE members)
|
688
|
+
{
|
689
|
+
int i = 0;
|
690
|
+
VALUE klass = rb_class_new(rb_cObject);
|
691
|
+
rb_iv_set(klass, "__size__", INT2NUM(RARRAY_LEN(members)));
|
692
|
+
rb_define_alloc_func(klass, alloc_hpricot_struct);
|
693
|
+
rb_define_singleton_method(klass, "new", rb_class_new_instance, -1);
|
694
|
+
for (i = 0; i < RARRAY_LEN(members); i++) {
|
695
|
+
ID id = SYM2ID(RARRAY_PTR(members)[i]);
|
696
|
+
rb_define_method_id(klass, id, ref_func[i], 0);
|
697
|
+
rb_define_method_id(klass, rb_id_attrset(id), set_func[i], 1);
|
698
|
+
}
|
699
|
+
return klass;
|
700
|
+
}
|
701
|
+
|
702
|
+
void Init_hpricot_scan()
|
703
|
+
{
|
704
|
+
VALUE structElem, structAttr, structBasic;
|
689
705
|
|
690
706
|
s_ElementContent = rb_intern("ElementContent");
|
691
707
|
symAllow = ID2SYM(rb_intern("allow"));
|
@@ -695,19 +711,78 @@ void Init_hpricot_scan()
|
|
695
711
|
s_parent = rb_intern("parent");
|
696
712
|
s_read = rb_intern("read");
|
697
713
|
s_to_str = rb_intern("to_str");
|
698
|
-
iv_parent = rb_intern("parent");
|
699
714
|
sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
|
700
715
|
sym_doctype = ID2SYM(rb_intern("doctype"));
|
701
716
|
sym_procins = ID2SYM(rb_intern("procins"));
|
702
717
|
sym_stag = ID2SYM(rb_intern("stag"));
|
703
718
|
sym_etag = ID2SYM(rb_intern("etag"));
|
704
719
|
sym_emptytag = ID2SYM(rb_intern("emptytag"));
|
720
|
+
sym_allowed = ID2SYM(rb_intern("allowed"));
|
721
|
+
sym_children = ID2SYM(rb_intern("children"));
|
705
722
|
sym_comment = ID2SYM(rb_intern("comment"));
|
706
723
|
sym_cdata = ID2SYM(rb_intern("cdata"));
|
724
|
+
sym_name = ID2SYM(rb_intern("name"));
|
725
|
+
sym_parent = ID2SYM(rb_intern("parent"));
|
726
|
+
sym_raw_attributes = ID2SYM(rb_intern("raw_attributes"));
|
727
|
+
sym_raw_string = ID2SYM(rb_intern("raw_string"));
|
728
|
+
sym_tagno = ID2SYM(rb_intern("tagno"));
|
707
729
|
sym_text = ID2SYM(rb_intern("text"));
|
708
730
|
sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
|
709
731
|
sym_CDATA = ID2SYM(rb_intern("CDATA"));
|
710
732
|
|
733
|
+
mHpricot = rb_define_module("Hpricot");
|
734
|
+
rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
|
735
|
+
rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
|
736
|
+
rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
|
737
|
+
rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
|
738
|
+
|
739
|
+
structElem = make_hpricot_struct(rb_ary_new3(8, sym_name, sym_parent,
|
740
|
+
sym_raw_attributes, sym_etag, sym_raw_string, sym_allowed,
|
741
|
+
sym_tagno, sym_children));
|
742
|
+
structAttr = make_hpricot_struct(rb_ary_new3(3, sym_name, sym_parent, sym_raw_attributes));
|
743
|
+
structBasic = make_hpricot_struct(rb_ary_new3(2, sym_name, sym_parent));
|
744
|
+
|
745
|
+
cDoc = rb_define_class_under(mHpricot, "Doc", structElem);
|
746
|
+
cCData = rb_define_class_under(mHpricot, "CData", structBasic);
|
747
|
+
rb_define_method(cCData, "content", hpricot_ele_get_name, 0);
|
748
|
+
rb_define_method(cCData, "content=", hpricot_ele_set_name, 1);
|
749
|
+
cComment = rb_define_class_under(mHpricot, "Comment", structBasic);
|
750
|
+
rb_define_method(cComment, "content", hpricot_ele_get_name, 0);
|
751
|
+
rb_define_method(cComment, "content=", hpricot_ele_set_name, 1);
|
752
|
+
cDocType = rb_define_class_under(mHpricot, "DocType", structAttr);
|
753
|
+
rb_define_method(cDocType, "raw_string", hpricot_ele_get_name, 0);
|
754
|
+
rb_define_method(cDocType, "clear_raw", hpricot_ele_clear_name, 0);
|
755
|
+
rb_define_method(cDocType, "target", hpricot_ele_get_target, 0);
|
756
|
+
rb_define_method(cDocType, "target=", hpricot_ele_set_target, 1);
|
757
|
+
rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
|
758
|
+
rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
|
759
|
+
rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
|
760
|
+
rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
|
761
|
+
cElem = rb_define_class_under(mHpricot, "Elem", structElem);
|
762
|
+
rb_define_method(cElem, "clear_raw", hpricot_ele_clear_raw, 0);
|
763
|
+
cBogusETag = rb_define_class_under(mHpricot, "BogusETag", structAttr);
|
764
|
+
rb_define_method(cBogusETag, "raw_string", hpricot_ele_get_attr, 0);
|
765
|
+
rb_define_method(cBogusETag, "clear_raw", hpricot_ele_clear_attr, 0);
|
766
|
+
cText = rb_define_class_under(mHpricot, "Text", structBasic);
|
767
|
+
rb_define_method(cText, "raw_string", hpricot_ele_get_name, 0);
|
768
|
+
rb_define_method(cText, "clear_raw", hpricot_ele_clear_name, 0);
|
769
|
+
rb_define_method(cText, "content", hpricot_ele_get_name, 0);
|
770
|
+
rb_define_method(cText, "content=", hpricot_ele_set_name, 1);
|
771
|
+
cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", structAttr);
|
772
|
+
rb_define_method(cXMLDecl, "raw_string", hpricot_ele_get_name, 0);
|
773
|
+
rb_define_method(cXMLDecl, "clear_raw", hpricot_ele_clear_name, 0);
|
774
|
+
rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
|
775
|
+
rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
|
776
|
+
rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
|
777
|
+
rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
|
778
|
+
rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
|
779
|
+
rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
|
780
|
+
cProcIns = rb_define_class_under(mHpricot, "ProcIns", structAttr);
|
781
|
+
rb_define_method(cProcIns, "target", hpricot_ele_get_name, 0);
|
782
|
+
rb_define_method(cProcIns, "target=", hpricot_ele_set_name, 1);
|
783
|
+
rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
|
784
|
+
rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
|
785
|
+
|
711
786
|
rb_const_set(mHpricot, rb_intern("ProcInsParse"),
|
712
787
|
reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
|
713
788
|
}
|