why-hpricot 0.6.210 → 0.7.229
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +13 -0
- data/Rakefile +14 -9
- data/ext/fast_xs/fast_xs.c +2 -1
- data/ext/hpricot_scan/HpricotScanService.java +1122 -342
- data/ext/hpricot_scan/hpricot_css.c +2112 -2116
- data/ext/hpricot_scan/hpricot_scan.c +1169 -923
- data/ext/hpricot_scan/hpricot_scan.java.rl +1078 -299
- data/ext/hpricot_scan/hpricot_scan.rl +327 -237
- data/lib/hpricot/elements.rb +1 -1
- data/lib/hpricot/inspect.rb +2 -2
- data/lib/hpricot/modules.rb +2 -0
- data/lib/hpricot/tag.rb +43 -22
- data/lib/hpricot/traverse.rb +1 -0
- data/test/test_alter.rb +20 -2
- data/test/test_parser.rb +19 -0
- data/test/test_preserved.rb +9 -0
- metadata +6 -6
@@ -19,20 +19,26 @@ VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
|
|
19
19
|
#define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
|
20
20
|
|
21
21
|
static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
|
22
|
-
sym_cdata,
|
22
|
+
sym_cdata, sym_name, sym_parent, sym_raw_attributes, sym_raw_string, sym_tagno,
|
23
|
+
sym_allowed, sym_text, sym_children, sym_EMPTY, sym_CDATA;
|
23
24
|
static VALUE mHpricot, rb_eHpricotParseError;
|
24
|
-
static VALUE
|
25
|
+
static VALUE cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cText,
|
25
26
|
cXMLDecl, cProcIns, symAllow, symDeny;
|
26
27
|
static ID s_ElementContent;
|
27
28
|
static ID s_downcase, s_new, s_parent, s_read, s_to_str;
|
28
|
-
static ID iv_parent;
|
29
29
|
static VALUE reProcInsParse;
|
30
30
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
31
|
+
#define H_ELE_TAG 0
|
32
|
+
#define H_ELE_PARENT 1
|
33
|
+
#define H_ELE_ATTR 2
|
34
|
+
#define H_ELE_ETAG 3
|
35
|
+
#define H_ELE_RAW 4
|
36
|
+
#define H_ELE_EC 5
|
37
|
+
#define H_ELE_HASH 6
|
38
|
+
#define H_ELE_CHILDREN 7
|
39
|
+
|
40
|
+
#define H_ELE_GET(ele, idx) RSTRUCT_PTR(ele)[idx]
|
41
|
+
#define H_ELE_SET(ele, idx, val) RSTRUCT_PTR(ele)[idx] = val
|
36
42
|
|
37
43
|
#define OPT(opts, key) (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("" # key)))))
|
38
44
|
|
@@ -60,7 +66,7 @@ typedef struct {
|
|
60
66
|
|
61
67
|
#define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
|
62
68
|
|
63
|
-
#define SLIDE(N) if (
|
69
|
+
#define SLIDE(N) if (mark_##N > ts) mark_##N = buf + (mark_##N - ts);
|
64
70
|
|
65
71
|
#define ATTR(K, V) \
|
66
72
|
if (!NIL_P(K)) { \
|
@@ -107,7 +113,7 @@ typedef struct {
|
|
107
113
|
action tag { SET(tag, p); }
|
108
114
|
action tagc { SET(tag, p-1); }
|
109
115
|
action aval { SET(aval, p); }
|
110
|
-
action aunq {
|
116
|
+
action aunq {
|
111
117
|
if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
|
112
118
|
else { SET(aval, p); }
|
113
119
|
}
|
@@ -118,14 +124,16 @@ typedef struct {
|
|
118
124
|
action pubid { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
|
119
125
|
action sysid { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }
|
120
126
|
|
121
|
-
action new_attr {
|
127
|
+
action new_attr {
|
122
128
|
akey = Qnil;
|
123
129
|
aval = Qnil;
|
124
130
|
mark_akey = NULL;
|
125
131
|
mark_aval = NULL;
|
126
132
|
}
|
127
133
|
|
128
|
-
action save_attr {
|
134
|
+
action save_attr {
|
135
|
+
if (!S->xml)
|
136
|
+
akey = rb_funcall(akey, s_downcase, 0);
|
129
137
|
ATTR(akey, aval);
|
130
138
|
}
|
131
139
|
|
@@ -144,7 +152,7 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
|
|
144
152
|
raw = tag;
|
145
153
|
}
|
146
154
|
ary = rb_ary_new3(4, sym, tag, attr, raw);
|
147
|
-
if (taint) {
|
155
|
+
if (taint) {
|
148
156
|
OBJ_TAINT(ary);
|
149
157
|
OBJ_TAINT(tag);
|
150
158
|
OBJ_TAINT(attr);
|
@@ -153,16 +161,30 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
|
|
153
161
|
rb_yield(ary);
|
154
162
|
}
|
155
163
|
|
164
|
+
#ifndef RHASH_TBL
|
165
|
+
/* rb_hash_lookup() is only in Ruby 1.8.7 */
|
166
|
+
static VALUE
|
167
|
+
our_rb_hash_lookup(VALUE hash, VALUE key)
|
168
|
+
{
|
169
|
+
VALUE val;
|
170
|
+
|
171
|
+
if (!st_lookup(RHASH(hash)->tbl, key, &val)) {
|
172
|
+
return Qnil; /* without Hash#default */
|
173
|
+
}
|
174
|
+
|
175
|
+
return val;
|
176
|
+
}
|
177
|
+
#define rb_hash_lookup our_rb_hash_lookup
|
178
|
+
#endif
|
179
|
+
|
156
180
|
static void
|
157
181
|
rb_hpricot_add(VALUE focus, VALUE ele)
|
158
182
|
{
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
rb_ary_push(he->children, ele);
|
165
|
-
he2->parent = focus;
|
183
|
+
VALUE children = H_ELE_GET(focus, H_ELE_CHILDREN);
|
184
|
+
if (NIL_P(children))
|
185
|
+
H_ELE_SET(focus, H_ELE_CHILDREN, (children = rb_ary_new2(1)));
|
186
|
+
rb_ary_push(children, ele);
|
187
|
+
H_ELE_SET(ele, H_ELE_PARENT, focus);
|
166
188
|
}
|
167
189
|
|
168
190
|
typedef struct {
|
@@ -173,102 +195,70 @@ typedef struct {
|
|
173
195
|
unsigned char xml, strict, fixup;
|
174
196
|
} hpricot_state;
|
175
197
|
|
176
|
-
|
177
|
-
hpricot_ele_mark(hpricot_ele *he)
|
178
|
-
{
|
179
|
-
rb_gc_mark(he->tag);
|
180
|
-
rb_gc_mark(he->attr);
|
181
|
-
rb_gc_mark(he->etag);
|
182
|
-
rb_gc_mark(he->raw);
|
183
|
-
rb_gc_mark(he->parent);
|
184
|
-
rb_gc_mark(he->children);
|
185
|
-
}
|
186
|
-
|
187
|
-
static void
|
188
|
-
hpricot_ele_free(hpricot_ele *he)
|
189
|
-
{
|
190
|
-
free(he);
|
191
|
-
}
|
192
|
-
|
193
|
-
#define H_PROP(prop) \
|
198
|
+
#define H_PROP(prop, idx) \
|
194
199
|
static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
|
195
|
-
|
196
|
-
Data_Get_Struct(self, hpricot_ele, he); \
|
197
|
-
he->prop = x; \
|
200
|
+
H_ELE_SET(self, idx, x); \
|
198
201
|
return self; \
|
199
202
|
} \
|
203
|
+
static VALUE hpricot_ele_clear_##prop(VALUE self) { \
|
204
|
+
H_ELE_SET(self, idx, Qnil); \
|
205
|
+
return Qtrue; \
|
206
|
+
} \
|
200
207
|
static VALUE hpricot_ele_get_##prop(VALUE self) { \
|
201
|
-
|
202
|
-
Data_Get_Struct(self, hpricot_ele, he); \
|
203
|
-
return he->prop; \
|
208
|
+
return H_ELE_GET(self, idx); \
|
204
209
|
}
|
205
210
|
|
206
211
|
#define H_ATTR(prop) \
|
207
212
|
static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
|
208
|
-
|
209
|
-
Data_Get_Struct(self, hpricot_ele, he); \
|
210
|
-
rb_hash_aset(he->attr, ID2SYM(rb_intern("" # prop)), x); \
|
213
|
+
rb_hash_aset(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop)), x); \
|
211
214
|
return self; \
|
212
215
|
} \
|
213
216
|
static VALUE hpricot_ele_get_##prop(VALUE self) { \
|
214
|
-
|
215
|
-
Data_Get_Struct(self, hpricot_ele, he); \
|
216
|
-
return rb_hash_aref(he->attr, ID2SYM(rb_intern("" # prop))); \
|
217
|
+
return rb_hash_aref(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop))); \
|
217
218
|
}
|
218
219
|
|
219
|
-
H_PROP(
|
220
|
-
H_PROP(
|
221
|
-
H_PROP(
|
222
|
-
H_PROP(
|
223
|
-
H_PROP(
|
220
|
+
H_PROP(name, H_ELE_TAG);
|
221
|
+
H_PROP(raw, H_ELE_RAW);
|
222
|
+
H_PROP(parent, H_ELE_PARENT);
|
223
|
+
H_PROP(attr, H_ELE_ATTR);
|
224
|
+
H_PROP(etag, H_ELE_ETAG);
|
225
|
+
H_PROP(children, H_ELE_CHILDREN);
|
226
|
+
H_ATTR(target);
|
224
227
|
H_ATTR(encoding);
|
225
228
|
H_ATTR(version);
|
226
229
|
H_ATTR(standalone);
|
227
230
|
H_ATTR(system_id);
|
228
231
|
H_ATTR(public_id);
|
229
232
|
|
230
|
-
static VALUE
|
231
|
-
hpricot_ele_get_raw(VALUE self, VALUE x) {
|
232
|
-
hpricot_ele *he;
|
233
|
-
Data_Get_Struct(self, hpricot_ele, he);
|
234
|
-
return he->raw;
|
235
|
-
}
|
236
|
-
|
237
|
-
static VALUE
|
238
|
-
hpricot_ele_clear_raw(VALUE self)
|
239
|
-
{
|
240
|
-
hpricot_ele *he;
|
241
|
-
Data_Get_Struct(self, hpricot_ele, he);
|
242
|
-
he->raw = Qnil;
|
243
|
-
return Qtrue;
|
244
|
-
}
|
245
|
-
|
246
233
|
#define H_ELE(klass) \
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
234
|
+
ele = rb_obj_alloc(klass); \
|
235
|
+
if (klass == cElem) { \
|
236
|
+
H_ELE_SET(ele, H_ELE_TAG, tag); \
|
237
|
+
H_ELE_SET(ele, H_ELE_ATTR, attr); \
|
238
|
+
H_ELE_SET(ele, H_ELE_EC, ec); \
|
239
|
+
if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_doctype)) { \
|
240
|
+
H_ELE_SET(ele, H_ELE_RAW, rb_str_new(raw, rawlen)); \
|
241
|
+
} \
|
242
|
+
} else if (klass == cDocType || klass == cProcIns || klass == cXMLDecl || klass == cBogusETag) { \
|
243
|
+
if (klass == cBogusETag) { \
|
244
|
+
H_ELE_SET(ele, H_ELE_TAG, tag); \
|
245
|
+
if (raw != NULL) \
|
246
|
+
H_ELE_SET(ele, H_ELE_ATTR, rb_str_new(raw, rawlen)); \
|
247
|
+
} else { \
|
248
|
+
if (klass == cDocType) \
|
249
|
+
ATTR(ID2SYM(rb_intern("target")), tag); \
|
250
|
+
H_ELE_SET(ele, H_ELE_ATTR, attr); \
|
251
|
+
if (klass != cProcIns) { \
|
252
|
+
tag = Qnil; \
|
253
|
+
if (raw != NULL) tag = rb_str_new(raw, rawlen); \
|
254
|
+
} \
|
255
|
+
H_ELE_SET(ele, H_ELE_TAG, tag); \
|
256
|
+
} \
|
257
|
+
} else { \
|
258
|
+
H_ELE_SET(ele, H_ELE_TAG, tag); \
|
256
259
|
} \
|
257
|
-
ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he); \
|
258
260
|
S->last = ele
|
259
261
|
|
260
|
-
VALUE
|
261
|
-
hpricot_ele_alloc(VALUE klass)
|
262
|
-
{
|
263
|
-
VALUE ele;
|
264
|
-
hpricot_ele *he = ALLOC(hpricot_ele);
|
265
|
-
he->name = 0;
|
266
|
-
he->tag = he->attr = he->raw = he->EC = Qnil;
|
267
|
-
he->etag = he->parent = he->children = Qnil;
|
268
|
-
ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he);
|
269
|
-
return ele;
|
270
|
-
}
|
271
|
-
|
272
262
|
//
|
273
263
|
// the swift, compact parser logic. most of the complicated stuff is done
|
274
264
|
// in the lexer. this step just pairs up the start and end tags.
|
@@ -282,22 +272,23 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
282
272
|
// in html mode, fix up start tags incorrectly formed as empty tags
|
283
273
|
//
|
284
274
|
if (!S->xml) {
|
285
|
-
hpricot_ele *last;
|
286
|
-
Data_Get_Struct(S->focus, hpricot_ele, last);
|
287
|
-
if (last->EC == sym_CDATA &&
|
288
|
-
(sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
|
289
|
-
!(sym == sym_etag && rb_str_hash(tag) == last->name))
|
290
|
-
{
|
291
|
-
sym = sym_text;
|
292
|
-
tag = rb_str_new(raw, rawlen);
|
293
|
-
}
|
294
|
-
|
295
275
|
if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
|
296
276
|
ec = rb_hash_aref(S->EC, tag);
|
297
277
|
if (NIL_P(ec)) {
|
298
278
|
tag = rb_funcall(tag, s_downcase, 0);
|
299
279
|
ec = rb_hash_aref(S->EC, tag);
|
300
280
|
}
|
281
|
+
}
|
282
|
+
|
283
|
+
if (H_ELE_GET(S->focus, H_ELE_EC) == sym_CDATA &&
|
284
|
+
(sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
|
285
|
+
!(sym == sym_etag && INT2FIX(rb_str_hash(tag)) == H_ELE_GET(S->focus, H_ELE_HASH)))
|
286
|
+
{
|
287
|
+
sym = sym_text;
|
288
|
+
tag = rb_str_new(raw, rawlen);
|
289
|
+
}
|
290
|
+
|
291
|
+
if (!NIL_P(ec)) {
|
301
292
|
if (sym == sym_emptytag) {
|
302
293
|
if (ec != sym_EMPTY)
|
303
294
|
sym = sym_stag;
|
@@ -309,19 +300,19 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
309
300
|
}
|
310
301
|
|
311
302
|
if (sym == sym_emptytag || sym == sym_stag) {
|
303
|
+
VALUE name = INT2FIX(rb_str_hash(tag));
|
312
304
|
H_ELE(cElem);
|
313
|
-
|
305
|
+
H_ELE_SET(ele, H_ELE_HASH, name);
|
314
306
|
|
315
307
|
if (!S->xml) {
|
316
308
|
VALUE match = Qnil, e = S->focus;
|
317
309
|
while (e != S->doc)
|
318
310
|
{
|
319
|
-
|
320
|
-
Data_Get_Struct(e, hpricot_ele, hee);
|
311
|
+
VALUE hEC = H_ELE_GET(e, H_ELE_EC);
|
321
312
|
|
322
|
-
if (TYPE(
|
313
|
+
if (TYPE(hEC) == T_HASH)
|
323
314
|
{
|
324
|
-
VALUE has = rb_hash_lookup(
|
315
|
+
VALUE has = rb_hash_lookup(hEC, name);
|
325
316
|
if (has != Qnil) {
|
326
317
|
if (has == Qtrue) {
|
327
318
|
if (match == Qnil)
|
@@ -334,7 +325,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
334
325
|
}
|
335
326
|
}
|
336
327
|
|
337
|
-
e =
|
328
|
+
e = H_ELE_GET(e, H_ELE_PARENT);
|
338
329
|
}
|
339
330
|
|
340
331
|
if (match == Qnil)
|
@@ -356,8 +347,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
356
347
|
}
|
357
348
|
}
|
358
349
|
} else if (sym == sym_etag) {
|
359
|
-
|
360
|
-
VALUE match = Qnil, e = S->focus;
|
350
|
+
VALUE name, match = Qnil, e = S->focus;
|
361
351
|
if (S->strict) {
|
362
352
|
if (NIL_P(rb_hash_aref(S->EC, tag))) {
|
363
353
|
tag = rb_str_new2("div");
|
@@ -370,19 +360,16 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
370
360
|
//
|
371
361
|
// (see also: the search above for fixups)
|
372
362
|
//
|
373
|
-
name = rb_str_hash(tag);
|
363
|
+
name = INT2FIX(rb_str_hash(tag));
|
374
364
|
while (e != S->doc)
|
375
365
|
{
|
376
|
-
|
377
|
-
Data_Get_Struct(e, hpricot_ele, he);
|
378
|
-
|
379
|
-
if (he->name == name)
|
366
|
+
if (H_ELE_GET(e, H_ELE_HASH) == name)
|
380
367
|
{
|
381
368
|
match = e;
|
382
369
|
break;
|
383
370
|
}
|
384
371
|
|
385
|
-
e =
|
372
|
+
e = H_ELE_GET(e, H_ELE_PARENT);
|
386
373
|
}
|
387
374
|
|
388
375
|
if (NIL_P(match))
|
@@ -392,10 +379,11 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
392
379
|
}
|
393
380
|
else
|
394
381
|
{
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
382
|
+
VALUE ele = Qnil;
|
383
|
+
if (raw != NULL)
|
384
|
+
ele = rb_str_new(raw, rawlen);
|
385
|
+
H_ELE_SET(match, H_ELE_ETAG, ele);
|
386
|
+
S->focus = H_ELE_GET(match, H_ELE_PARENT);
|
399
387
|
S->last = Qnil;
|
400
388
|
}
|
401
389
|
} else if (sym == sym_cdata) {
|
@@ -415,14 +403,14 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
415
403
|
VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
|
416
404
|
tag = rb_reg_nth_match(1, match);
|
417
405
|
attr = rb_reg_nth_match(2, match);
|
418
|
-
|
419
|
-
|
406
|
+
{
|
407
|
+
H_ELE(cProcIns);
|
408
|
+
rb_hpricot_add(S->focus, ele);
|
409
|
+
}
|
420
410
|
} else if (sym == sym_text) {
|
421
411
|
// TODO: add raw_string as well?
|
422
412
|
if (!NIL_P(S->last) && RBASIC(S->last)->klass == cText) {
|
423
|
-
|
424
|
-
Data_Get_Struct(S->last, hpricot_ele, he);
|
425
|
-
rb_str_append(he->tag, tag);
|
413
|
+
rb_str_append(H_ELE_GET(S->last, H_ELE_TAG), tag);
|
426
414
|
} else {
|
427
415
|
H_ELE(cText);
|
428
416
|
rb_hpricot_add(S->focus, ele);
|
@@ -435,7 +423,7 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
435
423
|
|
436
424
|
VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
437
425
|
{
|
438
|
-
int cs, act, have = 0, nread = 0, curline = 1, text = 0;
|
426
|
+
int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
|
439
427
|
char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
|
440
428
|
|
441
429
|
hpricot_state *S = NULL;
|
@@ -445,12 +433,13 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
445
433
|
int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
|
446
434
|
|
447
435
|
rb_scan_args(argc, argv, "11", &port, &opts);
|
448
|
-
taint = OBJ_TAINTED(
|
449
|
-
|
436
|
+
taint = OBJ_TAINTED(port);
|
437
|
+
io = rb_respond_to(port, s_read);
|
438
|
+
if (!io)
|
450
439
|
{
|
451
|
-
if (
|
440
|
+
if (rb_respond_to(port, s_to_str))
|
452
441
|
{
|
453
|
-
port = rb_funcall(
|
442
|
+
port = rb_funcall(port, s_to_str, 0);
|
454
443
|
StringValue(port);
|
455
444
|
}
|
456
445
|
else
|
@@ -465,10 +454,7 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
465
454
|
if (!rb_block_given_p())
|
466
455
|
{
|
467
456
|
S = ALLOC(hpricot_state);
|
468
|
-
|
469
|
-
MEMZERO(he, hpricot_ele, 1);
|
470
|
-
he->tag = he->attr = he->etag = he->parent = he->children = Qnil;
|
471
|
-
S->doc = Data_Wrap_Struct(cDoc, hpricot_ele_mark, hpricot_ele_free, he);
|
457
|
+
S->doc = rb_obj_alloc(cDoc);
|
472
458
|
rb_gc_register_address(&S->doc);
|
473
459
|
S->focus = S->doc;
|
474
460
|
S->last = Qnil;
|
@@ -488,65 +474,68 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
488
474
|
buffer_size = NUM2INT(bufsize);
|
489
475
|
}
|
490
476
|
}
|
491
|
-
|
477
|
+
|
478
|
+
if (io)
|
479
|
+
buf = ALLOC_N(char, buffer_size);
|
492
480
|
|
493
481
|
%% write init;
|
494
|
-
|
495
|
-
while (
|
482
|
+
|
483
|
+
while (!done) {
|
496
484
|
VALUE str;
|
497
485
|
char *p, *pe;
|
498
486
|
int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
|
499
487
|
|
500
|
-
if (
|
501
|
-
/* We've used up the entire buffer storing an already-parsed token
|
502
|
-
* prefix that must be preserved. Likely caused by super-long attributes.
|
503
|
-
* Increase buffer size and continue */
|
504
|
-
tokstart_diff = ts - buf;
|
505
|
-
tokend_diff = te - buf;
|
506
|
-
mark_tag_diff = mark_tag - buf;
|
507
|
-
mark_akey_diff = mark_akey - buf;
|
508
|
-
mark_aval_diff = mark_aval - buf;
|
509
|
-
|
510
|
-
buffer_size += BUFSIZE;
|
511
|
-
REALLOC_N(buf, char, buffer_size);
|
512
|
-
|
513
|
-
space = buffer_size - have;
|
514
|
-
|
515
|
-
ts= buf + tokstart_diff;
|
516
|
-
te = buf + tokend_diff;
|
517
|
-
mark_tag = buf + mark_tag_diff;
|
518
|
-
mark_akey = buf + mark_akey_diff;
|
519
|
-
mark_aval = buf + mark_aval_diff;
|
520
|
-
}
|
521
|
-
p = buf + have;
|
522
|
-
|
523
|
-
if ( rb_respond_to( port, s_read ) )
|
488
|
+
if (io)
|
524
489
|
{
|
490
|
+
if (space == 0) {
|
491
|
+
/* We've used up the entire buffer storing an already-parsed token
|
492
|
+
* prefix that must be preserved. Likely caused by super-long attributes.
|
493
|
+
* Increase buffer size and continue */
|
494
|
+
tokstart_diff = ts - buf;
|
495
|
+
tokend_diff = te - buf;
|
496
|
+
mark_tag_diff = mark_tag - buf;
|
497
|
+
mark_akey_diff = mark_akey - buf;
|
498
|
+
mark_aval_diff = mark_aval - buf;
|
499
|
+
|
500
|
+
buffer_size += BUFSIZE;
|
501
|
+
REALLOC_N(buf, char, buffer_size);
|
502
|
+
|
503
|
+
space = buffer_size - have;
|
504
|
+
|
505
|
+
ts = buf + tokstart_diff;
|
506
|
+
te = buf + tokend_diff;
|
507
|
+
mark_tag = buf + mark_tag_diff;
|
508
|
+
mark_akey = buf + mark_akey_diff;
|
509
|
+
mark_aval = buf + mark_aval_diff;
|
510
|
+
}
|
511
|
+
p = buf + have;
|
512
|
+
|
525
513
|
str = rb_funcall(port, s_read, 1, INT2FIX(space));
|
526
514
|
len = RSTRING_LEN(str);
|
527
515
|
memcpy(p, StringValuePtr(str), len);
|
528
516
|
}
|
529
517
|
else
|
530
518
|
{
|
531
|
-
|
532
|
-
|
533
|
-
|
519
|
+
p = RSTRING_PTR(port);
|
520
|
+
len = RSTRING_LEN(port) + 1;
|
521
|
+
done = 1;
|
534
522
|
}
|
535
523
|
|
536
524
|
nread += len;
|
537
525
|
|
538
526
|
/* If this is the last buffer, tack on an EOF. */
|
539
|
-
if ( len < space
|
527
|
+
if (io && len < space) {
|
540
528
|
p[len++] = 0;
|
541
529
|
done = 1;
|
542
530
|
}
|
543
531
|
|
544
532
|
pe = p + len;
|
545
533
|
%% write exec;
|
546
|
-
|
547
|
-
if (
|
548
|
-
|
549
|
-
|
534
|
+
|
535
|
+
if (cs == hpricot_scan_error) {
|
536
|
+
if (buf != NULL)
|
537
|
+
free(buf);
|
538
|
+
if (!NIL_P(tag))
|
550
539
|
{
|
551
540
|
rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
|
552
541
|
}
|
@@ -555,8 +544,8 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
555
544
|
rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
|
556
545
|
}
|
557
546
|
}
|
558
|
-
|
559
|
-
if (
|
547
|
+
|
548
|
+
if (done && ele_open)
|
560
549
|
{
|
561
550
|
ele_open = 0;
|
562
551
|
if (ts > 0) {
|
@@ -566,11 +555,11 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
566
555
|
}
|
567
556
|
}
|
568
557
|
|
569
|
-
if (
|
558
|
+
if (ts == 0)
|
570
559
|
{
|
571
560
|
have = 0;
|
572
561
|
/* text nodes have no ts because each byte is parsed alone */
|
573
|
-
if (
|
562
|
+
if (mark_tag != NULL && text == 1)
|
574
563
|
{
|
575
564
|
if (done)
|
576
565
|
{
|
@@ -585,12 +574,15 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
585
574
|
CAT(tag, p);
|
586
575
|
}
|
587
576
|
}
|
588
|
-
|
577
|
+
if (io)
|
578
|
+
mark_tag = buf;
|
579
|
+
else
|
580
|
+
mark_tag = RSTRING_PTR(port);
|
589
581
|
}
|
590
|
-
else
|
582
|
+
else if (io)
|
591
583
|
{
|
592
584
|
have = pe - ts;
|
593
|
-
memmove(
|
585
|
+
memmove(buf, ts, have);
|
594
586
|
SLIDE(tag);
|
595
587
|
SLIDE(akey);
|
596
588
|
SLIDE(aval);
|
@@ -598,7 +590,9 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
598
590
|
ts = buf;
|
599
591
|
}
|
600
592
|
}
|
601
|
-
|
593
|
+
|
594
|
+
if (buf != NULL)
|
595
|
+
free(buf);
|
602
596
|
|
603
597
|
if (S != NULL)
|
604
598
|
{
|
@@ -611,66 +605,103 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
611
605
|
return Qnil;
|
612
606
|
}
|
613
607
|
|
614
|
-
|
608
|
+
static VALUE
|
609
|
+
alloc_hpricot_struct(VALUE klass)
|
615
610
|
{
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
611
|
+
VALUE size;
|
612
|
+
long n;
|
613
|
+
NEWOBJ(st, struct RStruct);
|
614
|
+
OBJSETUP(st, klass, T_STRUCT);
|
615
|
+
|
616
|
+
size = rb_struct_iv_get(klass, "__size__");
|
617
|
+
n = FIX2LONG(size);
|
618
|
+
|
619
|
+
#ifndef RSTRUCT_EMBED_LEN_MAX
|
620
|
+
st->ptr = ALLOC_N(VALUE, n);
|
621
|
+
rb_mem_clear(st->ptr, n);
|
622
|
+
st->len = n;
|
623
|
+
#else
|
624
|
+
if (0 < n && n <= RSTRUCT_EMBED_LEN_MAX) {
|
625
|
+
RBASIC(st)->flags &= ~RSTRUCT_EMBED_LEN_MASK;
|
626
|
+
RBASIC(st)->flags |= n << RSTRUCT_EMBED_LEN_SHIFT;
|
627
|
+
rb_mem_clear(st->as.ary, n);
|
628
|
+
} else {
|
629
|
+
st->as.heap.ptr = ALLOC_N(VALUE, n);
|
630
|
+
rb_mem_clear(st->as.heap.ptr, n);
|
631
|
+
st->as.heap.len = n;
|
632
|
+
}
|
633
|
+
#endif
|
621
634
|
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
635
|
+
return (VALUE)st;
|
636
|
+
}
|
637
|
+
|
638
|
+
static VALUE hpricot_struct_ref0(VALUE obj) {return H_ELE_GET(obj, 0);}
|
639
|
+
static VALUE hpricot_struct_ref1(VALUE obj) {return H_ELE_GET(obj, 1);}
|
640
|
+
static VALUE hpricot_struct_ref2(VALUE obj) {return H_ELE_GET(obj, 2);}
|
641
|
+
static VALUE hpricot_struct_ref3(VALUE obj) {return H_ELE_GET(obj, 3);}
|
642
|
+
static VALUE hpricot_struct_ref4(VALUE obj) {return H_ELE_GET(obj, 4);}
|
643
|
+
static VALUE hpricot_struct_ref5(VALUE obj) {return H_ELE_GET(obj, 5);}
|
644
|
+
static VALUE hpricot_struct_ref6(VALUE obj) {return H_ELE_GET(obj, 6);}
|
645
|
+
static VALUE hpricot_struct_ref7(VALUE obj) {return H_ELE_GET(obj, 7);}
|
646
|
+
static VALUE hpricot_struct_ref8(VALUE obj) {return H_ELE_GET(obj, 8);}
|
647
|
+
static VALUE hpricot_struct_ref9(VALUE obj) {return H_ELE_GET(obj, 9);}
|
648
|
+
|
649
|
+
static VALUE (*ref_func[10])() = {
|
650
|
+
hpricot_struct_ref0,
|
651
|
+
hpricot_struct_ref1,
|
652
|
+
hpricot_struct_ref2,
|
653
|
+
hpricot_struct_ref3,
|
654
|
+
hpricot_struct_ref4,
|
655
|
+
hpricot_struct_ref5,
|
656
|
+
hpricot_struct_ref6,
|
657
|
+
hpricot_struct_ref7,
|
658
|
+
hpricot_struct_ref8,
|
659
|
+
hpricot_struct_ref9,
|
660
|
+
};
|
661
|
+
|
662
|
+
static VALUE hpricot_struct_set0(VALUE obj, VALUE val) {return H_ELE_SET(obj, 0, val);}
|
663
|
+
static VALUE hpricot_struct_set1(VALUE obj, VALUE val) {return H_ELE_SET(obj, 1, val);}
|
664
|
+
static VALUE hpricot_struct_set2(VALUE obj, VALUE val) {return H_ELE_SET(obj, 2, val);}
|
665
|
+
static VALUE hpricot_struct_set3(VALUE obj, VALUE val) {return H_ELE_SET(obj, 3, val);}
|
666
|
+
static VALUE hpricot_struct_set4(VALUE obj, VALUE val) {return H_ELE_SET(obj, 4, val);}
|
667
|
+
static VALUE hpricot_struct_set5(VALUE obj, VALUE val) {return H_ELE_SET(obj, 5, val);}
|
668
|
+
static VALUE hpricot_struct_set6(VALUE obj, VALUE val) {return H_ELE_SET(obj, 6, val);}
|
669
|
+
static VALUE hpricot_struct_set7(VALUE obj, VALUE val) {return H_ELE_SET(obj, 7, val);}
|
670
|
+
static VALUE hpricot_struct_set8(VALUE obj, VALUE val) {return H_ELE_SET(obj, 8, val);}
|
671
|
+
static VALUE hpricot_struct_set9(VALUE obj, VALUE val) {return H_ELE_SET(obj, 9, val);}
|
672
|
+
|
673
|
+
static VALUE (*set_func[10])() = {
|
674
|
+
hpricot_struct_set0,
|
675
|
+
hpricot_struct_set1,
|
676
|
+
hpricot_struct_set2,
|
677
|
+
hpricot_struct_set3,
|
678
|
+
hpricot_struct_set4,
|
679
|
+
hpricot_struct_set5,
|
680
|
+
hpricot_struct_set6,
|
681
|
+
hpricot_struct_set7,
|
682
|
+
hpricot_struct_set8,
|
683
|
+
hpricot_struct_set9,
|
684
|
+
};
|
685
|
+
|
686
|
+
static VALUE
|
687
|
+
make_hpricot_struct(VALUE members)
|
688
|
+
{
|
689
|
+
int i = 0;
|
690
|
+
VALUE klass = rb_class_new(rb_cObject);
|
691
|
+
rb_iv_set(klass, "__size__", INT2NUM(RARRAY_LEN(members)));
|
692
|
+
rb_define_alloc_func(klass, alloc_hpricot_struct);
|
693
|
+
rb_define_singleton_method(klass, "new", rb_class_new_instance, -1);
|
694
|
+
for (i = 0; i < RARRAY_LEN(members); i++) {
|
695
|
+
ID id = SYM2ID(RARRAY_PTR(members)[i]);
|
696
|
+
rb_define_method_id(klass, id, ref_func[i], 0);
|
697
|
+
rb_define_method_id(klass, rb_id_attrset(id), set_func[i], 1);
|
698
|
+
}
|
699
|
+
return klass;
|
700
|
+
}
|
701
|
+
|
702
|
+
void Init_hpricot_scan()
|
703
|
+
{
|
704
|
+
VALUE structElem, structAttr, structBasic;
|
674
705
|
|
675
706
|
s_ElementContent = rb_intern("ElementContent");
|
676
707
|
symAllow = ID2SYM(rb_intern("allow"));
|
@@ -680,19 +711,78 @@ void Init_hpricot_scan()
|
|
680
711
|
s_parent = rb_intern("parent");
|
681
712
|
s_read = rb_intern("read");
|
682
713
|
s_to_str = rb_intern("to_str");
|
683
|
-
iv_parent = rb_intern("parent");
|
684
714
|
sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
|
685
715
|
sym_doctype = ID2SYM(rb_intern("doctype"));
|
686
716
|
sym_procins = ID2SYM(rb_intern("procins"));
|
687
717
|
sym_stag = ID2SYM(rb_intern("stag"));
|
688
718
|
sym_etag = ID2SYM(rb_intern("etag"));
|
689
719
|
sym_emptytag = ID2SYM(rb_intern("emptytag"));
|
720
|
+
sym_allowed = ID2SYM(rb_intern("allowed"));
|
721
|
+
sym_children = ID2SYM(rb_intern("children"));
|
690
722
|
sym_comment = ID2SYM(rb_intern("comment"));
|
691
723
|
sym_cdata = ID2SYM(rb_intern("cdata"));
|
724
|
+
sym_name = ID2SYM(rb_intern("name"));
|
725
|
+
sym_parent = ID2SYM(rb_intern("parent"));
|
726
|
+
sym_raw_attributes = ID2SYM(rb_intern("raw_attributes"));
|
727
|
+
sym_raw_string = ID2SYM(rb_intern("raw_string"));
|
728
|
+
sym_tagno = ID2SYM(rb_intern("tagno"));
|
692
729
|
sym_text = ID2SYM(rb_intern("text"));
|
693
730
|
sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
|
694
731
|
sym_CDATA = ID2SYM(rb_intern("CDATA"));
|
695
732
|
|
733
|
+
mHpricot = rb_define_module("Hpricot");
|
734
|
+
rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
|
735
|
+
rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
|
736
|
+
rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
|
737
|
+
rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
|
738
|
+
|
739
|
+
structElem = make_hpricot_struct(rb_ary_new3(8, sym_name, sym_parent,
|
740
|
+
sym_raw_attributes, sym_etag, sym_raw_string, sym_allowed,
|
741
|
+
sym_tagno, sym_children));
|
742
|
+
structAttr = make_hpricot_struct(rb_ary_new3(3, sym_name, sym_parent, sym_raw_attributes));
|
743
|
+
structBasic = make_hpricot_struct(rb_ary_new3(2, sym_name, sym_parent));
|
744
|
+
|
745
|
+
cDoc = rb_define_class_under(mHpricot, "Doc", structElem);
|
746
|
+
cCData = rb_define_class_under(mHpricot, "CData", structBasic);
|
747
|
+
rb_define_method(cCData, "content", hpricot_ele_get_name, 0);
|
748
|
+
rb_define_method(cCData, "content=", hpricot_ele_set_name, 1);
|
749
|
+
cComment = rb_define_class_under(mHpricot, "Comment", structBasic);
|
750
|
+
rb_define_method(cComment, "content", hpricot_ele_get_name, 0);
|
751
|
+
rb_define_method(cComment, "content=", hpricot_ele_set_name, 1);
|
752
|
+
cDocType = rb_define_class_under(mHpricot, "DocType", structAttr);
|
753
|
+
rb_define_method(cDocType, "raw_string", hpricot_ele_get_name, 0);
|
754
|
+
rb_define_method(cDocType, "clear_raw", hpricot_ele_clear_name, 0);
|
755
|
+
rb_define_method(cDocType, "target", hpricot_ele_get_target, 0);
|
756
|
+
rb_define_method(cDocType, "target=", hpricot_ele_set_target, 1);
|
757
|
+
rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
|
758
|
+
rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
|
759
|
+
rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
|
760
|
+
rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
|
761
|
+
cElem = rb_define_class_under(mHpricot, "Elem", structElem);
|
762
|
+
rb_define_method(cElem, "clear_raw", hpricot_ele_clear_raw, 0);
|
763
|
+
cBogusETag = rb_define_class_under(mHpricot, "BogusETag", structAttr);
|
764
|
+
rb_define_method(cBogusETag, "raw_string", hpricot_ele_get_attr, 0);
|
765
|
+
rb_define_method(cBogusETag, "clear_raw", hpricot_ele_clear_attr, 0);
|
766
|
+
cText = rb_define_class_under(mHpricot, "Text", structBasic);
|
767
|
+
rb_define_method(cText, "raw_string", hpricot_ele_get_name, 0);
|
768
|
+
rb_define_method(cText, "clear_raw", hpricot_ele_clear_name, 0);
|
769
|
+
rb_define_method(cText, "content", hpricot_ele_get_name, 0);
|
770
|
+
rb_define_method(cText, "content=", hpricot_ele_set_name, 1);
|
771
|
+
cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", structAttr);
|
772
|
+
rb_define_method(cXMLDecl, "raw_string", hpricot_ele_get_name, 0);
|
773
|
+
rb_define_method(cXMLDecl, "clear_raw", hpricot_ele_clear_name, 0);
|
774
|
+
rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
|
775
|
+
rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
|
776
|
+
rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
|
777
|
+
rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
|
778
|
+
rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
|
779
|
+
rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
|
780
|
+
cProcIns = rb_define_class_under(mHpricot, "ProcIns", structAttr);
|
781
|
+
rb_define_method(cProcIns, "target", hpricot_ele_get_name, 0);
|
782
|
+
rb_define_method(cProcIns, "target=", hpricot_ele_set_name, 1);
|
783
|
+
rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
|
784
|
+
rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
|
785
|
+
|
696
786
|
rb_const_set(mHpricot, rb_intern("ProcInsParse"),
|
697
787
|
reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
|
698
788
|
}
|