jerryvos-hpricot 0.8.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. data/CHANGELOG +75 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +260 -0
  5. data/ext/fast_xs/FastXsService.java +1018 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +201 -0
  8. data/ext/hpricot_scan/HpricotScanService.java +1305 -0
  9. data/ext/hpricot_scan/extconf.rb +6 -0
  10. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  11. data/ext/hpricot_scan/hpricot_css.c +3502 -0
  12. data/ext/hpricot_scan/hpricot_scan.c +6768 -0
  13. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  14. data/ext/hpricot_scan/hpricot_scan.java.rl +373 -0
  15. data/ext/hpricot_scan/hpricot_scan.rl +786 -0
  16. data/extras/mingw-rbconfig.rb +176 -0
  17. data/lib/hpricot.rb +26 -0
  18. data/lib/hpricot/blankslate.rb +63 -0
  19. data/lib/hpricot/builder.rb +216 -0
  20. data/lib/hpricot/elements.rb +510 -0
  21. data/lib/hpricot/htmlinfo.rb +691 -0
  22. data/lib/hpricot/inspect.rb +103 -0
  23. data/lib/hpricot/modules.rb +40 -0
  24. data/lib/hpricot/parse.rb +38 -0
  25. data/lib/hpricot/tag.rb +200 -0
  26. data/lib/hpricot/tags.rb +164 -0
  27. data/lib/hpricot/traverse.rb +838 -0
  28. data/lib/hpricot/xchar.rb +94 -0
  29. data/test/files/basic.xhtml +17 -0
  30. data/test/files/boingboing.html +2266 -0
  31. data/test/files/cy0.html +3653 -0
  32. data/test/files/immob.html +400 -0
  33. data/test/files/pace_application.html +1320 -0
  34. data/test/files/tenderlove.html +16 -0
  35. data/test/files/uswebgen.html +220 -0
  36. data/test/files/utf8.html +1054 -0
  37. data/test/files/week9.html +1723 -0
  38. data/test/files/why.xml +19 -0
  39. data/test/load_files.rb +7 -0
  40. data/test/test_alter.rb +77 -0
  41. data/test/test_builder.rb +37 -0
  42. data/test/test_parser.rb +420 -0
  43. data/test/test_paths.rb +25 -0
  44. data/test/test_preserved.rb +70 -0
  45. data/test/test_xml.rb +28 -0
  46. metadata +107 -0
@@ -0,0 +1,786 @@
1
+ /*
2
+ * hpricot_scan.rl
3
+ *
4
+ * $Author: why $
5
+ * $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
6
+ *
7
+ * Copyright (C) 2006 why the lucky stiff
8
+ */
9
+ #include <ruby.h>
10
+
11
+ #ifndef RARRAY_LEN
12
+ #define RARRAY_LEN(arr) RARRAY(arr)->len
13
+ #define RSTRING_LEN(str) RSTRING(str)->len
14
+ #define RSTRING_PTR(str) RSTRING(str)->ptr
15
+ #endif
16
+
17
+ VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
18
+
19
+ #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
20
+
21
+ static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
22
+ sym_cdata, sym_name, sym_parent, sym_raw_attributes, sym_raw_string, sym_tagno,
23
+ sym_allowed, sym_text, sym_children, sym_EMPTY, sym_CDATA;
24
+ static VALUE mHpricot, rb_eHpricotParseError;
25
+ static VALUE cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cText,
26
+ cXMLDecl, cProcIns, symAllow, symDeny;
27
+ static ID s_ElementContent;
28
+ static ID s_downcase, s_new, s_parent, s_read, s_to_str;
29
+ static VALUE reProcInsParse;
30
+
31
/*
 * Slot indices of the struct-backed node objects.  They follow the member
 * order passed to make_hpricot_struct() in Init_hpricot_scan:
 * name, parent, raw_attributes, etag, raw_string, allowed, tagno, children.
 */
#define H_ELE_TAG      0
#define H_ELE_PARENT   1
#define H_ELE_ATTR     2
#define H_ELE_ETAG     3
#define H_ELE_RAW      4
#define H_ELE_EC       5
#define H_ELE_HASH     6
#define H_ELE_CHILDREN 7

/* Raw slot access.  H_ELE_SET is a full expression so it can be returned. */
#define H_ELE_GET(ele, idx)      RSTRUCT_PTR(ele)[idx]
#define H_ELE_SET(ele, idx, val) (RSTRUCT_PTR(ele)[idx] = (val))

/* True when `opts` is a hash whose :key entry is truthy. */
#define OPT(opts, key) (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("" # key)))))

/*
 * Emit token N.  Expects the scanner locals ts, te, text, ele_open, tag,
 * attr, taint and S to be in scope.  With a block the token is yielded,
 * otherwise it is fed to the tree builder together with the raw source span.
 *
 * NOTE(review): the block form always yields nil in the raw slot (text
 * tokens get their content substituted by rb_yield_tokens).  The previous
 * version built a raw string here and then discarded it; that dead
 * allocation has been removed without changing what is yielded.
 */
#define ELE(N) \
  if (te > ts || text == 1) { \
    char *raw = NULL; \
    int rawlen = 0; \
    ele_open = 0; text = 0; \
    if (ts != NULL && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
      raw = ts; rawlen = (int)(te - ts); \
    } \
    if (rb_block_given_p()) { \
      rb_yield_tokens(sym_##N, tag, attr, Qnil, taint); \
    } else \
      rb_hpricot_token(S, sym_##N, tag, attr, raw, rawlen, taint); \
  }

/*
 * Assign to N a new string covering [mark_N, E).  An unset or empty span
 * yields "".  When E lies before the mark, N is left untouched.
 */
#define SET(N, E) \
  if (mark_##N == NULL || (E) == mark_##N) \
    N = rb_str_new2(""); \
  else if ((E) > mark_##N) \
    N = rb_str_new(mark_##N, (E) - mark_##N);

/* Append [mark_N, E) to N, creating N via SET when it is still nil. */
#define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, (E) - mark_##N); }

/* After the unparsed tail was moved to the start of buf, move mark_N too. */
#define SLIDE(N) if (mark_##N > ts) mark_##N = buf + (mark_##N - ts);

/* Store V under K in the attribute hash, creating the hash on first use. */
#define ATTR(K, V) \
  if (!NIL_P(K)) { \
    if (NIL_P(attr)) attr = rb_hash_new(); \
    rb_hash_aset(attr, K, V); \
  }

/*
 * Switch the scanner into text-accumulation mode.  If an element was being
 * opened, the text starts at that token's start (ts); otherwise at p.
 */
#define TEXT_PASS() \
  if (text == 0) \
  { \
    if (ele_open == 1) { \
      ele_open = 0; \
      if (ts != NULL) { \
        mark_tag = ts; \
      } \
    } else { \
      mark_tag = p; \
    } \
    attr = Qnil; \
    tag = Qnil; \
    text = 1; \
  }

/* Finish a block token N whose terminator is T bytes long, then emit it. */
#define EBLK(N, T) CAT(tag, p - (T) + 1); ELE(N);
94
+
95
%%{
  machine hpricot_scan;

  # An element is starting: flush any pending text token first, then reset
  # the per-element state.
  action newEle {
    if (text == 1) {
      CAT(tag, p);
      ELE(text);
      text = 0;
    }
    attr = Qnil;
    tag = Qnil;
    mark_tag = NULL;
    ele_open = 1;
  }

  # Mark actions: remember where a span begins.
  action _tag  { mark_tag = p; }
  action _akey { mark_akey = p; }
  action _aval { mark_aval = p; }

  # Capture actions: turn the marked span into a Ruby string.
  action tag  { SET(tag, p); }
  action tagc { SET(tag, p-1); }
  action akey { SET(akey, p); }
  action aval { SET(aval, p); }

  # Attribute value whose last byte may be a quote character; if so the
  # quote is left out of the captured value.
  action aunq {
    if (*(p-1) == '"' || *(p-1) == '\'') {
      SET(aval, p-1);
    }
    else {
      SET(aval, p);
    }
  }

  # XML declaration and DOCTYPE fields are stored under fixed attribute keys.
  action xmlver {
    SET(aval, p);
    ATTR(ID2SYM(rb_intern("version")), aval);
  }
  action xmlenc {
    SET(aval, p);
    ATTR(ID2SYM(rb_intern("encoding")), aval);
  }
  action xmlsd {
    SET(aval, p);
    ATTR(ID2SYM(rb_intern("standalone")), aval);
  }
  action pubid {
    SET(aval, p);
    ATTR(ID2SYM(rb_intern("public_id")), aval);
  }
  action sysid {
    SET(aval, p);
    ATTR(ID2SYM(rb_intern("system_id")), aval);
  }

  # Begin a fresh key/value pair.
  action new_attr {
    akey = Qnil;
    aval = Qnil;
    mark_akey = NULL;
    mark_aval = NULL;
  }

  # Commit the current key/value pair to the attribute hash.
  action save_attr {
    ATTR(akey, aval);
  }

  include hpricot_common "hpricot_common.rl";

}%%

%% write data nofinal;

/* Default read-buffer size in bytes; also the step by which it grows. */
#define BUFSIZE 16384
145
+
146
+ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
147
+ {
148
+ VALUE ary;
149
+ if (sym == sym_text) {
150
+ raw = tag;
151
+ }
152
+ ary = rb_ary_new3(4, sym, tag, attr, raw);
153
+ if (taint) {
154
+ OBJ_TAINT(ary);
155
+ OBJ_TAINT(tag);
156
+ OBJ_TAINT(attr);
157
+ OBJ_TAINT(raw);
158
+ }
159
+ rb_yield(ary);
160
+ }
161
+
162
+ #ifndef RHASH_TBL
163
+ /* rb_hash_lookup() is only in Ruby 1.8.7 */
164
+ static VALUE
165
+ our_rb_hash_lookup(VALUE hash, VALUE key)
166
+ {
167
+ VALUE val;
168
+
169
+ if (!st_lookup(RHASH(hash)->tbl, key, &val)) {
170
+ return Qnil; /* without Hash#default */
171
+ }
172
+
173
+ return val;
174
+ }
175
+ #define rb_hash_lookup our_rb_hash_lookup
176
+ #endif
177
+
178
+ static void
179
+ rb_hpricot_add(VALUE focus, VALUE ele)
180
+ {
181
+ VALUE children = H_ELE_GET(focus, H_ELE_CHILDREN);
182
+ if (NIL_P(children))
183
+ H_ELE_SET(focus, H_ELE_CHILDREN, (children = rb_ary_new2(1)));
184
+ rb_ary_push(children, ele);
185
+ H_ELE_SET(ele, H_ELE_PARENT, focus);
186
+ }
187
+
188
+ typedef struct {
189
+ VALUE doc;
190
+ VALUE focus;
191
+ VALUE last;
192
+ VALUE EC;
193
+ unsigned char xml, strict, fixup;
194
+ } hpricot_state;
195
+
196
+ #define H_PROP(prop, idx) \
197
+ static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
198
+ H_ELE_SET(self, idx, x); \
199
+ return self; \
200
+ } \
201
+ static VALUE hpricot_ele_clear_##prop(VALUE self) { \
202
+ H_ELE_SET(self, idx, Qnil); \
203
+ return Qtrue; \
204
+ } \
205
+ static VALUE hpricot_ele_get_##prop(VALUE self) { \
206
+ return H_ELE_GET(self, idx); \
207
+ }
208
+
209
+ #define H_ATTR(prop) \
210
+ static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
211
+ rb_hash_aset(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop)), x); \
212
+ return self; \
213
+ } \
214
+ static VALUE hpricot_ele_get_##prop(VALUE self) { \
215
+ return rb_hash_aref(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop))); \
216
+ }
217
+
218
+ H_PROP(name, H_ELE_TAG);
219
+ H_PROP(raw, H_ELE_RAW);
220
+ H_PROP(parent, H_ELE_PARENT);
221
+ H_PROP(attr, H_ELE_ATTR);
222
+ H_PROP(etag, H_ELE_ETAG);
223
+ H_PROP(children, H_ELE_CHILDREN);
224
+ H_ATTR(target);
225
+ H_ATTR(encoding);
226
+ H_ATTR(version);
227
+ H_ATTR(standalone);
228
+ H_ATTR(system_id);
229
+ H_ATTR(public_id);
230
+
231
/*
 * Allocate a node of class `klass` into the local `ele`, fill its slots from
 * the locals tag, attr, ec, raw, rawlen and sym, and remember it in S->last.
 *
 *  - Elem:              name, attributes, element content, and the raw
 *                       source for empty/start/doctype tokens.
 *  - BogusETag:         name, with the raw source kept in the attribute slot.
 *  - DocType / XMLDecl: attributes (DocType also stores the tag as :target);
 *                       the name slot holds the raw source or nil.
 *  - ProcIns:           attributes and name stored as given.
 *  - everything else:   only the name slot is set.
 *
 * The expansion deliberately has no trailing semicolon; callers supply it.
 */
#define H_ELE(klass) \
  ele = rb_obj_alloc(klass); \
  if (klass == cElem) { \
    H_ELE_SET(ele, H_ELE_TAG, tag); \
    H_ELE_SET(ele, H_ELE_ATTR, attr); \
    H_ELE_SET(ele, H_ELE_EC, ec); \
    if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_doctype)) { \
      H_ELE_SET(ele, H_ELE_RAW, rb_str_new(raw, rawlen)); \
    } \
  } else if (klass == cBogusETag) { \
    H_ELE_SET(ele, H_ELE_TAG, tag); \
    if (raw != NULL) { \
      H_ELE_SET(ele, H_ELE_ATTR, rb_str_new(raw, rawlen)); \
    } \
  } else if (klass == cDocType || klass == cProcIns || klass == cXMLDecl) { \
    if (klass == cDocType) { \
      ATTR(ID2SYM(rb_intern("target")), tag); \
    } \
    H_ELE_SET(ele, H_ELE_ATTR, attr); \
    if (klass != cProcIns) { \
      tag = (raw != NULL) ? rb_str_new(raw, rawlen) : Qnil; \
    } \
    H_ELE_SET(ele, H_ELE_TAG, tag); \
  } else { \
    H_ELE_SET(ele, H_ELE_TAG, tag); \
  } \
  S->last = ele
259
+
260
+ //
261
+ // the swift, compact parser logic. most of the complicated stuff is done
262
+ // in the lexer. this step just pairs up the start and end tags.
263
+ //
264
+ void
265
+ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw, int rawlen, int taint)
266
+ {
267
+ VALUE ele, ec = Qnil;
268
+
269
+ //
270
+ // in html mode, fix up start tags incorrectly formed as empty tags
271
+ //
272
+ if (!S->xml) {
273
+ if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
274
+ ec = rb_hash_aref(S->EC, tag);
275
+ if (NIL_P(ec)) {
276
+ tag = rb_funcall(tag, s_downcase, 0);
277
+ ec = rb_hash_aref(S->EC, tag);
278
+ }
279
+ }
280
+
281
+ if (H_ELE_GET(S->focus, H_ELE_EC) == sym_CDATA &&
282
+ (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
283
+ !(sym == sym_etag && INT2FIX(rb_str_hash(tag)) == H_ELE_GET(S->focus, H_ELE_HASH)))
284
+ {
285
+ sym = sym_text;
286
+ tag = rb_str_new(raw, rawlen);
287
+ }
288
+
289
+ if (!NIL_P(ec)) {
290
+ if (sym == sym_emptytag) {
291
+ if (ec != sym_EMPTY)
292
+ sym = sym_stag;
293
+ } else if (sym == sym_stag) {
294
+ if (ec == sym_EMPTY)
295
+ sym = sym_emptytag;
296
+ }
297
+ }
298
+ }
299
+
300
+ if (sym == sym_emptytag || sym == sym_stag) {
301
+ VALUE name = INT2FIX(rb_str_hash(tag));
302
+ H_ELE(cElem);
303
+ H_ELE_SET(ele, H_ELE_HASH, name);
304
+
305
+ if (!S->xml) {
306
+ VALUE match = Qnil, e = S->focus;
307
+ while (e != S->doc)
308
+ {
309
+ VALUE hEC = H_ELE_GET(e, H_ELE_EC);
310
+
311
+ if (TYPE(hEC) == T_HASH)
312
+ {
313
+ VALUE has = rb_hash_lookup(hEC, name);
314
+ if (has != Qnil) {
315
+ if (has == Qtrue) {
316
+ if (match == Qnil)
317
+ match = e;
318
+ } else if (has == symAllow) {
319
+ match = S->focus;
320
+ } else if (has == symDeny) {
321
+ match = Qnil;
322
+ }
323
+ }
324
+ }
325
+
326
+ e = H_ELE_GET(e, H_ELE_PARENT);
327
+ }
328
+
329
+ if (match == Qnil)
330
+ match = S->focus;
331
+ S->focus = match;
332
+ }
333
+
334
+ rb_hpricot_add(S->focus, ele);
335
+
336
+ //
337
+ // in the case of a start tag that should be empty, just
338
+ // skip the step that focuses the element. focusing moves
339
+ // us deeper into the document.
340
+ //
341
+ if (sym == sym_stag) {
342
+ if (S->xml || ec != sym_EMPTY) {
343
+ S->focus = ele;
344
+ S->last = Qnil;
345
+ }
346
+ }
347
+ } else if (sym == sym_etag) {
348
+ VALUE name, match = Qnil, e = S->focus;
349
+ if (S->strict) {
350
+ if (NIL_P(rb_hash_aref(S->EC, tag))) {
351
+ tag = rb_str_new2("div");
352
+ }
353
+ }
354
+
355
+ //
356
+ // another optimization will be to improve this very simple
357
+ // O(n) tag search, where n is the depth of the focused tag.
358
+ //
359
+ // (see also: the search above for fixups)
360
+ //
361
+ name = INT2FIX(rb_str_hash(tag));
362
+ while (e != S->doc)
363
+ {
364
+ if (H_ELE_GET(e, H_ELE_HASH) == name)
365
+ {
366
+ match = e;
367
+ break;
368
+ }
369
+
370
+ e = H_ELE_GET(e, H_ELE_PARENT);
371
+ }
372
+
373
+ if (NIL_P(match))
374
+ {
375
+ H_ELE(cBogusETag);
376
+ rb_hpricot_add(S->focus, ele);
377
+ }
378
+ else
379
+ {
380
+ VALUE ele = Qnil;
381
+ if (raw != NULL)
382
+ ele = rb_str_new(raw, rawlen);
383
+ H_ELE_SET(match, H_ELE_ETAG, ele);
384
+ S->focus = H_ELE_GET(match, H_ELE_PARENT);
385
+ S->last = Qnil;
386
+ }
387
+ } else if (sym == sym_cdata) {
388
+ H_ELE(cCData);
389
+ rb_hpricot_add(S->focus, ele);
390
+ } else if (sym == sym_comment) {
391
+ H_ELE(cComment);
392
+ rb_hpricot_add(S->focus, ele);
393
+ } else if (sym == sym_doctype) {
394
+ H_ELE(cDocType);
395
+ if (S->strict) {
396
+ rb_hash_aset(attr, ID2SYM(rb_intern("system_id")), rb_str_new2("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"));
397
+ rb_hash_aset(attr, ID2SYM(rb_intern("public_id")), rb_str_new2("-//W3C//DTD XHTML 1.0 Strict//EN"));
398
+ }
399
+ rb_hpricot_add(S->focus, ele);
400
+ } else if (sym == sym_procins) {
401
+ VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
402
+ tag = rb_reg_nth_match(1, match);
403
+ attr = rb_reg_nth_match(2, match);
404
+ {
405
+ H_ELE(cProcIns);
406
+ rb_hpricot_add(S->focus, ele);
407
+ }
408
+ } else if (sym == sym_text) {
409
+ // TODO: add raw_string as well?
410
+ if (!NIL_P(S->last) && RBASIC(S->last)->klass == cText) {
411
+ rb_str_append(H_ELE_GET(S->last, H_ELE_TAG), tag);
412
+ } else {
413
+ H_ELE(cText);
414
+ rb_hpricot_add(S->focus, ele);
415
+ }
416
+ } else if (sym == sym_xmldecl) {
417
+ H_ELE(cXMLDecl);
418
+ rb_hpricot_add(S->focus, ele);
419
+ }
420
+ }
421
+
422
+ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
423
+ {
424
+ int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
425
+ char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
426
+
427
+ hpricot_state *S = NULL;
428
+ VALUE port, opts;
429
+ VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
430
+ char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
431
+ int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
432
+
433
+ rb_scan_args(argc, argv, "11", &port, &opts);
434
+ taint = OBJ_TAINTED(port);
435
+ io = rb_respond_to(port, s_read);
436
+ if (!io)
437
+ {
438
+ if (rb_respond_to(port, s_to_str))
439
+ {
440
+ port = rb_funcall(port, s_to_str, 0);
441
+ StringValue(port);
442
+ }
443
+ else
444
+ {
445
+ rb_raise(rb_eArgError, "an Hpricot document must be built from an input source (a String or IO object.)");
446
+ }
447
+ }
448
+
449
+ if (TYPE(opts) != T_HASH)
450
+ opts = Qnil;
451
+
452
+ if (!rb_block_given_p())
453
+ {
454
+ S = ALLOC(hpricot_state);
455
+ S->doc = rb_obj_alloc(cDoc);
456
+ rb_gc_register_address(&S->doc);
457
+ S->focus = S->doc;
458
+ S->last = Qnil;
459
+ S->xml = OPT(opts, xml);
460
+ S->strict = OPT(opts, xhtml_strict);
461
+ S->fixup = OPT(opts, fixup_tags);
462
+ if (S->strict) S->fixup = 1;
463
+ rb_ivar_set(S->doc, rb_intern("@options"), opts);
464
+
465
+ S->EC = rb_const_get(mHpricot, s_ElementContent);
466
+ }
467
+
468
+ buffer_size = BUFSIZE;
469
+ if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
470
+ bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
471
+ if (!NIL_P(bufsize)) {
472
+ buffer_size = NUM2INT(bufsize);
473
+ }
474
+ }
475
+
476
+ if (io)
477
+ buf = ALLOC_N(char, buffer_size);
478
+
479
+ %% write init;
480
+
481
+ while (!done) {
482
+ VALUE str;
483
+ char *p, *pe;
484
+ int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
485
+
486
+ if (io)
487
+ {
488
+ if (space == 0) {
489
+ /* We've used up the entire buffer storing an already-parsed token
490
+ * prefix that must be preserved. Likely caused by super-long attributes.
491
+ * Increase buffer size and continue */
492
+ tokstart_diff = ts - buf;
493
+ tokend_diff = te - buf;
494
+ mark_tag_diff = mark_tag - buf;
495
+ mark_akey_diff = mark_akey - buf;
496
+ mark_aval_diff = mark_aval - buf;
497
+
498
+ buffer_size += BUFSIZE;
499
+ REALLOC_N(buf, char, buffer_size);
500
+
501
+ space = buffer_size - have;
502
+
503
+ ts = buf + tokstart_diff;
504
+ te = buf + tokend_diff;
505
+ mark_tag = buf + mark_tag_diff;
506
+ mark_akey = buf + mark_akey_diff;
507
+ mark_aval = buf + mark_aval_diff;
508
+ }
509
+ p = buf + have;
510
+
511
+ str = rb_funcall(port, s_read, 1, INT2FIX(space));
512
+ len = RSTRING_LEN(str);
513
+ memcpy(p, StringValuePtr(str), len);
514
+ }
515
+ else
516
+ {
517
+ p = RSTRING_PTR(port);
518
+ len = RSTRING_LEN(port) + 1;
519
+ done = 1;
520
+ }
521
+
522
+ nread += len;
523
+
524
+ /* If this is the last buffer, tack on an EOF. */
525
+ if (io && len < space) {
526
+ p[len++] = 0;
527
+ done = 1;
528
+ }
529
+
530
+ pe = p + len;
531
+ %% write exec;
532
+
533
+ if (cs == hpricot_scan_error) {
534
+ if (buf != NULL)
535
+ free(buf);
536
+ if (!NIL_P(tag))
537
+ {
538
+ rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
539
+ }
540
+ else
541
+ {
542
+ rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
543
+ }
544
+ }
545
+
546
+ if (done && ele_open)
547
+ {
548
+ ele_open = 0;
549
+ if (ts > 0) {
550
+ mark_tag = ts;
551
+ ts = 0;
552
+ text = 1;
553
+ }
554
+ }
555
+
556
+ if (ts == 0)
557
+ {
558
+ have = 0;
559
+ /* text nodes have no ts because each byte is parsed alone */
560
+ if (mark_tag != NULL && text == 1)
561
+ {
562
+ if (done)
563
+ {
564
+ if (mark_tag < p-1)
565
+ {
566
+ CAT(tag, p-1);
567
+ ELE(text);
568
+ }
569
+ }
570
+ else
571
+ {
572
+ CAT(tag, p);
573
+ }
574
+ }
575
+ if (io)
576
+ mark_tag = buf;
577
+ else
578
+ mark_tag = RSTRING_PTR(port);
579
+ }
580
+ else if (io)
581
+ {
582
+ have = pe - ts;
583
+ memmove(buf, ts, have);
584
+ SLIDE(tag);
585
+ SLIDE(akey);
586
+ SLIDE(aval);
587
+ te = buf + (te - ts);
588
+ ts = buf;
589
+ }
590
+ }
591
+
592
+ if (buf != NULL)
593
+ free(buf);
594
+
595
+ if (S != NULL)
596
+ {
597
+ VALUE doc = S->doc;
598
+ rb_gc_unregister_address(&S->doc);
599
+ free(S);
600
+ return doc;
601
+ }
602
+
603
+ return Qnil;
604
+ }
605
+
606
+ static VALUE
607
+ alloc_hpricot_struct(VALUE klass)
608
+ {
609
+ VALUE size;
610
+ long n;
611
+ NEWOBJ(st, struct RStruct);
612
+ OBJSETUP(st, klass, T_STRUCT);
613
+
614
+ size = rb_struct_iv_get(klass, "__size__");
615
+ n = FIX2LONG(size);
616
+
617
+ #ifndef RSTRUCT_EMBED_LEN_MAX
618
+ st->ptr = ALLOC_N(VALUE, n);
619
+ rb_mem_clear(st->ptr, n);
620
+ st->len = n;
621
+ #else
622
+ if (0 < n && n <= RSTRUCT_EMBED_LEN_MAX) {
623
+ RBASIC(st)->flags &= ~RSTRUCT_EMBED_LEN_MASK;
624
+ RBASIC(st)->flags |= n << RSTRUCT_EMBED_LEN_SHIFT;
625
+ rb_mem_clear(st->as.ary, n);
626
+ } else {
627
+ st->as.heap.ptr = ALLOC_N(VALUE, n);
628
+ rb_mem_clear(st->as.heap.ptr, n);
629
+ st->as.heap.len = n;
630
+ }
631
+ #endif
632
+
633
+ return (VALUE)st;
634
+ }
635
+
636
+ static VALUE hpricot_struct_ref0(VALUE obj) {return H_ELE_GET(obj, 0);}
637
+ static VALUE hpricot_struct_ref1(VALUE obj) {return H_ELE_GET(obj, 1);}
638
+ static VALUE hpricot_struct_ref2(VALUE obj) {return H_ELE_GET(obj, 2);}
639
+ static VALUE hpricot_struct_ref3(VALUE obj) {return H_ELE_GET(obj, 3);}
640
+ static VALUE hpricot_struct_ref4(VALUE obj) {return H_ELE_GET(obj, 4);}
641
+ static VALUE hpricot_struct_ref5(VALUE obj) {return H_ELE_GET(obj, 5);}
642
+ static VALUE hpricot_struct_ref6(VALUE obj) {return H_ELE_GET(obj, 6);}
643
+ static VALUE hpricot_struct_ref7(VALUE obj) {return H_ELE_GET(obj, 7);}
644
+ static VALUE hpricot_struct_ref8(VALUE obj) {return H_ELE_GET(obj, 8);}
645
+ static VALUE hpricot_struct_ref9(VALUE obj) {return H_ELE_GET(obj, 9);}
646
+
647
+ static VALUE (*ref_func[10])() = {
648
+ hpricot_struct_ref0,
649
+ hpricot_struct_ref1,
650
+ hpricot_struct_ref2,
651
+ hpricot_struct_ref3,
652
+ hpricot_struct_ref4,
653
+ hpricot_struct_ref5,
654
+ hpricot_struct_ref6,
655
+ hpricot_struct_ref7,
656
+ hpricot_struct_ref8,
657
+ hpricot_struct_ref9,
658
+ };
659
+
660
+ static VALUE hpricot_struct_set0(VALUE obj, VALUE val) {return H_ELE_SET(obj, 0, val);}
661
+ static VALUE hpricot_struct_set1(VALUE obj, VALUE val) {return H_ELE_SET(obj, 1, val);}
662
+ static VALUE hpricot_struct_set2(VALUE obj, VALUE val) {return H_ELE_SET(obj, 2, val);}
663
+ static VALUE hpricot_struct_set3(VALUE obj, VALUE val) {return H_ELE_SET(obj, 3, val);}
664
+ static VALUE hpricot_struct_set4(VALUE obj, VALUE val) {return H_ELE_SET(obj, 4, val);}
665
+ static VALUE hpricot_struct_set5(VALUE obj, VALUE val) {return H_ELE_SET(obj, 5, val);}
666
+ static VALUE hpricot_struct_set6(VALUE obj, VALUE val) {return H_ELE_SET(obj, 6, val);}
667
+ static VALUE hpricot_struct_set7(VALUE obj, VALUE val) {return H_ELE_SET(obj, 7, val);}
668
+ static VALUE hpricot_struct_set8(VALUE obj, VALUE val) {return H_ELE_SET(obj, 8, val);}
669
+ static VALUE hpricot_struct_set9(VALUE obj, VALUE val) {return H_ELE_SET(obj, 9, val);}
670
+
671
+ static VALUE (*set_func[10])() = {
672
+ hpricot_struct_set0,
673
+ hpricot_struct_set1,
674
+ hpricot_struct_set2,
675
+ hpricot_struct_set3,
676
+ hpricot_struct_set4,
677
+ hpricot_struct_set5,
678
+ hpricot_struct_set6,
679
+ hpricot_struct_set7,
680
+ hpricot_struct_set8,
681
+ hpricot_struct_set9,
682
+ };
683
+
684
+ static VALUE
685
+ make_hpricot_struct(VALUE members)
686
+ {
687
+ int i = 0;
688
+ VALUE klass = rb_class_new(rb_cObject);
689
+ rb_iv_set(klass, "__size__", INT2NUM(RARRAY_LEN(members)));
690
+ rb_define_alloc_func(klass, alloc_hpricot_struct);
691
+ rb_define_singleton_method(klass, "new", rb_class_new_instance, -1);
692
+ for (i = 0; i < RARRAY_LEN(members); i++) {
693
+ ID id = SYM2ID(RARRAY_PTR(members)[i]);
694
+ rb_define_method_id(klass, id, ref_func[i], 0);
695
+ rb_define_method_id(klass, rb_id_attrset(id), set_func[i], 1);
696
+ }
697
+ return klass;
698
+ }
699
+
700
+ void Init_hpricot_scan()
701
+ {
702
+ VALUE structElem, structAttr, structBasic;
703
+
704
+ s_ElementContent = rb_intern("ElementContent");
705
+ symAllow = ID2SYM(rb_intern("allow"));
706
+ symDeny = ID2SYM(rb_intern("deny"));
707
+ s_downcase = rb_intern("downcase");
708
+ s_new = rb_intern("new");
709
+ s_parent = rb_intern("parent");
710
+ s_read = rb_intern("read");
711
+ s_to_str = rb_intern("to_str");
712
+ sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
713
+ sym_doctype = ID2SYM(rb_intern("doctype"));
714
+ sym_procins = ID2SYM(rb_intern("procins"));
715
+ sym_stag = ID2SYM(rb_intern("stag"));
716
+ sym_etag = ID2SYM(rb_intern("etag"));
717
+ sym_emptytag = ID2SYM(rb_intern("emptytag"));
718
+ sym_allowed = ID2SYM(rb_intern("allowed"));
719
+ sym_children = ID2SYM(rb_intern("children"));
720
+ sym_comment = ID2SYM(rb_intern("comment"));
721
+ sym_cdata = ID2SYM(rb_intern("cdata"));
722
+ sym_name = ID2SYM(rb_intern("name"));
723
+ sym_parent = ID2SYM(rb_intern("parent"));
724
+ sym_raw_attributes = ID2SYM(rb_intern("raw_attributes"));
725
+ sym_raw_string = ID2SYM(rb_intern("raw_string"));
726
+ sym_tagno = ID2SYM(rb_intern("tagno"));
727
+ sym_text = ID2SYM(rb_intern("text"));
728
+ sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
729
+ sym_CDATA = ID2SYM(rb_intern("CDATA"));
730
+
731
+ mHpricot = rb_define_module("Hpricot");
732
+ rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
733
+ rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
734
+ rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
735
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
736
+
737
+ structElem = make_hpricot_struct(rb_ary_new3(8, sym_name, sym_parent,
738
+ sym_raw_attributes, sym_etag, sym_raw_string, sym_allowed,
739
+ sym_tagno, sym_children));
740
+ structAttr = make_hpricot_struct(rb_ary_new3(3, sym_name, sym_parent, sym_raw_attributes));
741
+ structBasic = make_hpricot_struct(rb_ary_new3(2, sym_name, sym_parent));
742
+
743
+ cDoc = rb_define_class_under(mHpricot, "Doc", structElem);
744
+ cCData = rb_define_class_under(mHpricot, "CData", structBasic);
745
+ rb_define_method(cCData, "content", hpricot_ele_get_name, 0);
746
+ rb_define_method(cCData, "content=", hpricot_ele_set_name, 1);
747
+ cComment = rb_define_class_under(mHpricot, "Comment", structBasic);
748
+ rb_define_method(cComment, "content", hpricot_ele_get_name, 0);
749
+ rb_define_method(cComment, "content=", hpricot_ele_set_name, 1);
750
+ cDocType = rb_define_class_under(mHpricot, "DocType", structAttr);
751
+ rb_define_method(cDocType, "raw_string", hpricot_ele_get_name, 0);
752
+ rb_define_method(cDocType, "clear_raw", hpricot_ele_clear_name, 0);
753
+ rb_define_method(cDocType, "target", hpricot_ele_get_target, 0);
754
+ rb_define_method(cDocType, "target=", hpricot_ele_set_target, 1);
755
+ rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
756
+ rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
757
+ rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
758
+ rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
759
+ cElem = rb_define_class_under(mHpricot, "Elem", structElem);
760
+ rb_define_method(cElem, "clear_raw", hpricot_ele_clear_raw, 0);
761
+ cBogusETag = rb_define_class_under(mHpricot, "BogusETag", structAttr);
762
+ rb_define_method(cBogusETag, "raw_string", hpricot_ele_get_attr, 0);
763
+ rb_define_method(cBogusETag, "clear_raw", hpricot_ele_clear_attr, 0);
764
+ cText = rb_define_class_under(mHpricot, "Text", structBasic);
765
+ rb_define_method(cText, "raw_string", hpricot_ele_get_name, 0);
766
+ rb_define_method(cText, "clear_raw", hpricot_ele_clear_name, 0);
767
+ rb_define_method(cText, "content", hpricot_ele_get_name, 0);
768
+ rb_define_method(cText, "content=", hpricot_ele_set_name, 1);
769
+ cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", structAttr);
770
+ rb_define_method(cXMLDecl, "raw_string", hpricot_ele_get_name, 0);
771
+ rb_define_method(cXMLDecl, "clear_raw", hpricot_ele_clear_name, 0);
772
+ rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
773
+ rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
774
+ rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
775
+ rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
776
+ rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
777
+ rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
778
+ cProcIns = rb_define_class_under(mHpricot, "ProcIns", structAttr);
779
+ rb_define_method(cProcIns, "target", hpricot_ele_get_name, 0);
780
+ rb_define_method(cProcIns, "target=", hpricot_ele_set_name, 1);
781
+ rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
782
+ rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
783
+
784
+ rb_const_set(mHpricot, rb_intern("ProcInsParse"),
785
+ reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
786
+ }