hpricot 0.8.3-i386-mswin32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. data/CHANGELOG +104 -0
  2. data/COPYING +18 -0
  3. data/README.md +276 -0
  4. data/Rakefile +234 -0
  5. data/ext/fast_xs/FastXsService.java +1123 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +210 -0
  8. data/ext/hpricot_scan/HpricotCss.java +850 -0
  9. data/ext/hpricot_scan/HpricotScanService.java +2099 -0
  10. data/ext/hpricot_scan/extconf.rb +9 -0
  11. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  12. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  13. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  14. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  15. data/ext/hpricot_scan/hpricot_scan.c +7039 -0
  16. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  17. data/ext/hpricot_scan/hpricot_scan.java.rl +1161 -0
  18. data/ext/hpricot_scan/hpricot_scan.rl +896 -0
  19. data/extras/hpricot.png +0 -0
  20. data/lib/fast_xs.rb +1 -0
  21. data/lib/fast_xs/1.8/fast_xs.so +0 -0
  22. data/lib/fast_xs/1.9/fast_xs.so +0 -0
  23. data/lib/hpricot.rb +26 -0
  24. data/lib/hpricot/blankslate.rb +63 -0
  25. data/lib/hpricot/builder.rb +216 -0
  26. data/lib/hpricot/elements.rb +514 -0
  27. data/lib/hpricot/htmlinfo.rb +691 -0
  28. data/lib/hpricot/inspect.rb +103 -0
  29. data/lib/hpricot/modules.rb +40 -0
  30. data/lib/hpricot/parse.rb +38 -0
  31. data/lib/hpricot/tag.rb +219 -0
  32. data/lib/hpricot/tags.rb +164 -0
  33. data/lib/hpricot/traverse.rb +839 -0
  34. data/lib/hpricot/xchar.rb +94 -0
  35. data/lib/hpricot_scan.rb +1 -0
  36. data/lib/hpricot_scan/1.8/hpricot_scan.so +0 -0
  37. data/lib/hpricot_scan/1.9/hpricot_scan.so +0 -0
  38. data/test/files/basic.xhtml +17 -0
  39. data/test/files/boingboing.html +2266 -0
  40. data/test/files/cy0.html +3653 -0
  41. data/test/files/immob.html +400 -0
  42. data/test/files/pace_application.html +1320 -0
  43. data/test/files/tenderlove.html +16 -0
  44. data/test/files/uswebgen.html +220 -0
  45. data/test/files/utf8.html +1054 -0
  46. data/test/files/week9.html +1723 -0
  47. data/test/files/why.xml +19 -0
  48. data/test/load_files.rb +7 -0
  49. data/test/nokogiri-bench.rb +64 -0
  50. data/test/test_alter.rb +96 -0
  51. data/test/test_builder.rb +37 -0
  52. data/test/test_parser.rb +457 -0
  53. data/test/test_paths.rb +25 -0
  54. data/test/test_preserved.rb +88 -0
  55. data/test/test_xml.rb +28 -0
  56. metadata +128 -0
@@ -0,0 +1,896 @@
1
+ /*
2
+ * hpricot_scan.rl
3
+ *
4
+ * $Author: why $
5
+ * $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
6
+ *
7
+ * Copyright (C) 2006, 2010 why the lucky stiff
8
+ */
9
+ #include <ruby.h>
10
+ #include <assert.h>
11
+
12
+ struct hpricot_struct {
13
+ int len;
14
+ VALUE* ptr;
15
+ };
16
+
17
/*
 * Fallback accessors for Ruby headers that do not provide them.
 * Each macro is guarded on its own so that a header supplying only some of
 * them does not trigger redefinitions, and every expansion is parenthesised
 * so it behaves as a single expression wherever it is used.
 */
#ifndef RARRAY_LEN
#define RARRAY_LEN(arr) (RARRAY(arr)->len)
#endif
#ifndef RSTRING_LEN
#define RSTRING_LEN(str) (RSTRING(str)->len)
#endif
#ifndef RSTRING_PTR
#define RSTRING_PTR(str) (RSTRING(str)->ptr)
#endif
22
+
23
+ VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
24
+
25
+ #define NO_WAY_SERIOUSLY "*** This should not happen, please file a bug report with the HTML you're parsing at http://github.com/hpricot/hpricot/issues. So sorry!"
26
+
27
+ static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
28
+ sym_cdata, sym_name, sym_parent, sym_raw_attributes, sym_raw_string, sym_tagno,
29
+ sym_allowed, sym_text, sym_children, sym_EMPTY, sym_CDATA;
30
+ static VALUE mHpricot, rb_eHpricotParseError;
31
+ static VALUE cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cText,
32
+ cXMLDecl, cProcIns, symAllow, symDeny;
33
+ static ID s_ElementContent;
34
+ static ID s_downcase, s_new, s_parent, s_read, s_to_str;
35
+ static VALUE reProcInsParse;
36
+
37
/* Slot indexes into hpricot_struct.ptr[]. */
#define H_ELE_TAG 0
#define H_ELE_PARENT 1
#define H_ELE_ATTR 2
#define H_ELE_ETAG 3
#define H_ELE_RAW 4
#define H_ELE_EC 5
#define H_ELE_HASH 6
#define H_ELE_CHILDREN 7

/*
 * Slot access.  The expansions are fully parenthesised so that they are
 * safe inside larger expressions (e.g. `return H_ELE_SET(...)`) and with
 * arguments that are themselves expressions.
 * Note: no bounds check is performed against hpricot_struct.len.
 */
#define HSTRUCT_PTR(ele) (((struct hpricot_struct*)DATA_PTR(ele))->ptr)

#define H_ELE_GET(ele, idx) (HSTRUCT_PTR(ele)[(idx)])
#define H_ELE_SET(ele, idx, val) (HSTRUCT_PTR(ele)[(idx)] = (val))

/* True when `opts` is non-nil and opts[:key] is truthy. */
#define OPT(opts, key) (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("" # key)))))

/*
 * Encoding support.  Both macros rely on a variable named `encoding_index`
 * being in scope at the point of use; without <ruby/encoding.h> they expand
 * to nothing.
 */
#ifdef HAVE_RUBY_ENCODING_H
#include <ruby/encoding.h>
# define ASSOCIATE_INDEX(s) rb_enc_associate_index((s), encoding_index)
# define ENCODING_INDEX , encoding_index
#else
# define ASSOCIATE_INDEX(s)
# define ENCODING_INDEX
#endif
61
+
62
/*
 * ELE(N): emit the token of type sym_<N> that the scanner just completed.
 *
 * Runs only when the token is non-empty (te > ts) or a text run is pending.
 * Clears `ele_open` and `text`.  For token types other than cdata, text,
 * procins and comment the raw source span [ts, te) is captured.
 *
 * With a block given, the token is yielded through rb_yield_tokens;
 * otherwise it is handed to rb_hpricot_token to extend the document tree.
 *
 * Fix: the raw string built for the block case used to be discarded and
 * Qnil was yielded in its place; the captured string is now passed on.
 *
 * Expects these names in scope: ts, te, text, ele_open, tag, attr, taint, S
 * (and encoding_index when encodings are enabled).
 */
#define ELE(N) \
  if (te > ts || text == 1) { \
    char *raw = NULL; \
    int rawlen = 0; \
    ele_open = 0; text = 0; \
    if (ts != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
      raw = ts; rawlen = te - ts; \
    } \
    if (rb_block_given_p()) { \
      VALUE raw_string = Qnil; \
      if (raw != NULL) { \
        raw_string = rb_str_new(raw, rawlen); \
        ASSOCIATE_INDEX(raw_string); \
      } \
      rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
    } else \
      rb_hpricot_token(S, sym_##N, tag, attr, raw, rawlen, taint ENCODING_INDEX); \
  }
80
+
81
/*
 * SET(N, E): assign to N the bytes between mark_<N> and E.
 *   - no mark, or an empty span: N becomes a new empty string;
 *   - E beyond the mark:         N becomes a copy of [mark_<N>, E);
 *   - E before the mark:         N is left untouched.
 */
#define SET(N, E) \
  if (mark_##N == NULL || E == mark_##N) { \
    N = rb_str_new2(""); \
    ASSOCIATE_INDEX(N); \
  } else if (E > mark_##N) { \
    N = rb_str_new(mark_##N, E - mark_##N); \
    ASSOCIATE_INDEX(N); \
  }

/* CAT(N, E): like SET when N is nil, otherwise append [mark_<N>, E) to N. */
#define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }

/*
 * SLIDE(N): re-point mark_<N> after the bytes starting at `ts` were moved
 * to the start of `buf`.  Only marks located past `ts` are adjusted.
 */
#define SLIDE(N) if (mark_##N > ts) mark_##N = buf + (mark_##N - ts);

/* ATTR(K, V): attr[K] = V, creating the attr hash on demand; no-op for a nil K. */
#define ATTR(K, V) \
  if (!NIL_P(K)) { \
    if (NIL_P(attr)) attr = rb_hash_new(); \
    rb_hash_aset(attr, K, V); \
  }

/*
 * TEXT_PASS(): start a text run if one is not already in progress.
 * When an element was open, the run starts at `ts` (if set) and the element
 * is considered closed; otherwise it starts at the current position `p`.
 * Resets attr and tag and raises the `text` flag.
 */
#define TEXT_PASS() \
  if (text == 0) \
  { \
    if (ele_open == 1) { \
      ele_open = 0; \
      if (ts > 0) { \
        mark_tag = ts; \
      } \
    } else { \
      mark_tag = p; \
    } \
    attr = Qnil; \
    tag = Qnil; \
    text = 1; \
  }

/* EBLK(N, T): append up to p - T + 1 to tag, then emit a sym_<N> token. */
#define EBLK(N, T) CAT(tag, p - T + 1); ELE(N);
117
+
118
/*
 * Ragel machine `hpricot_scan`.
 *
 * The actions below maintain the scanner state that lives in hpricot_scan():
 * the mark_* pointers record where a tag name / attribute key / attribute
 * value started, and tag / akey / aval / attr receive the extracted Ruby
 * values.  The grammar itself is pulled in from hpricot_common.rl.
 *
 * Fix in `xmlenc`: rb_enc_find_index() reports an unknown encoding name
 * with a negative index.  That value used to be stored unconditionally and
 * later handed to rb_enc_associate_index(); the current encoding is now
 * kept when the lookup fails, and a missing mark no longer reaches the
 * lookup.
 */
%%{
  machine hpricot_scan;

  action newEle {
    /* flush any pending text run before a new element starts */
    if (text == 1) {
      CAT(tag, p);
      ELE(text);
      text = 0;
    }
    attr = Qnil;
    tag = Qnil;
    mark_tag = NULL;
    ele_open = 1;
  }

  action _tag  { mark_tag = p; }
  action _aval { mark_aval = p; }
  action _akey { mark_akey = p; }
  action tag   { SET(tag, p); }
  action tagc  { SET(tag, p-1); }
  action aval  { SET(aval, p); }
  action aunq {
    /* drop a trailing quote character from the value */
    if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
    else { SET(aval, p); }
  }
  action akey   { SET(akey, p); }
  action xmlver { SET(aval, p); ATTR(ID2SYM(rb_intern("version")), aval); }
  action xmlenc {
#ifdef HAVE_RUBY_ENCODING_H
    if (mark_aval != NULL && mark_aval < p) {
      char psave = *p;
      int found_index;
      /* temporarily NUL-terminate the encoding name in place */
      *p = '\0';
      found_index = rb_enc_find_index(mark_aval);
      *p = psave;
      /* keep the current encoding when the declared name is unknown */
      if (found_index >= 0)
        encoding_index = found_index;
    }
#endif
    SET(aval, p);
    ATTR(ID2SYM(rb_intern("encoding")), aval);
  }
  action xmlsd { SET(aval, p); ATTR(ID2SYM(rb_intern("standalone")), aval); }
  action pubid { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
  action sysid { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }

  action new_attr {
    akey = Qnil;
    aval = Qnil;
    mark_akey = NULL;
    mark_aval = NULL;
  }

  action save_attr {
    /* attribute names are downcased unless parsing in XML mode */
    if (!S->xml && !NIL_P(akey))
      akey = rb_funcall(akey, s_downcase, 0);
    ATTR(akey, aval);
  }

  include hpricot_common "hpricot_common.rl";

}%%

%% write data nofinal;

/* Initial size of the IO read buffer, and the step by which it grows. */
#define BUFSIZE 16384
181
+
182
+ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
183
+ {
184
+ VALUE ary;
185
+ if (sym == sym_text) {
186
+ raw = tag;
187
+ }
188
+ ary = rb_ary_new3(4, sym, tag, attr, raw);
189
+ if (taint) {
190
+ OBJ_TAINT(ary);
191
+ OBJ_TAINT(tag);
192
+ OBJ_TAINT(attr);
193
+ OBJ_TAINT(raw);
194
+ }
195
+ rb_yield(ary);
196
+ }
197
+
198
+ #ifndef RHASH_TBL
199
+ /* rb_hash_lookup() is only in Ruby 1.8.7 */
200
+ static VALUE
201
+ our_rb_hash_lookup(VALUE hash, VALUE key)
202
+ {
203
+ VALUE val;
204
+
205
+ if (!st_lookup(RHASH(hash)->tbl, key, &val)) {
206
+ return Qnil; /* without Hash#default */
207
+ }
208
+
209
+ return val;
210
+ }
211
+ #define rb_hash_lookup our_rb_hash_lookup
212
+ #endif
213
+
214
+ static void
215
+ rb_hpricot_add(VALUE focus, VALUE ele)
216
+ {
217
+ VALUE children = H_ELE_GET(focus, H_ELE_CHILDREN);
218
+ if (NIL_P(children))
219
+ H_ELE_SET(focus, H_ELE_CHILDREN, (children = rb_ary_new2(1)));
220
+ rb_ary_push(children, ele);
221
+ H_ELE_SET(ele, H_ELE_PARENT, focus);
222
+ }
223
+
224
+ typedef struct {
225
+ VALUE doc;
226
+ VALUE focus;
227
+ VALUE last;
228
+ VALUE EC;
229
+ unsigned char xml, strict, fixup;
230
+ } hpricot_state;
231
+
232
+ #define H_PROP(prop, idx) \
233
+ static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
234
+ H_ELE_SET(self, idx, x); \
235
+ return self; \
236
+ } \
237
+ static VALUE hpricot_ele_clear_##prop(VALUE self) { \
238
+ H_ELE_SET(self, idx, Qnil); \
239
+ return Qtrue; \
240
+ } \
241
+ static VALUE hpricot_ele_get_##prop(VALUE self) { \
242
+ return H_ELE_GET(self, idx); \
243
+ }
244
+
245
+ #define H_ATTR(prop) \
246
+ static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
247
+ rb_hash_aset(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop)), x); \
248
+ return self; \
249
+ } \
250
+ static VALUE hpricot_ele_get_##prop(VALUE self) { \
251
+ return rb_hash_aref(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop))); \
252
+ }
253
+
254
+ H_PROP(name, H_ELE_TAG);
255
+ H_PROP(raw, H_ELE_RAW);
256
+ H_PROP(parent, H_ELE_PARENT);
257
+ H_PROP(attr, H_ELE_ATTR);
258
+ H_PROP(etag, H_ELE_ETAG);
259
+ H_PROP(children, H_ELE_CHILDREN);
260
+ H_ATTR(target);
261
+ H_ATTR(encoding);
262
+ H_ATTR(version);
263
+ H_ATTR(standalone);
264
+ H_ATTR(system_id);
265
+ H_ATTR(public_id);
266
+
267
/*
 * H_ELE(klass): allocate a node of `klass` into `ele`, fill its slots from
 * the current token and remember it in S->last.
 *
 *   cElem       tag, attr and ec are stored; the raw source is kept for
 *               emptytag / stag / doctype tokens.
 *   cBogusETag  tag is stored; the raw source goes into the ATTR slot.
 *   cDocType    attr gains a :target entry holding the tag, then as below.
 *   cXMLDecl    attr is stored; the TAG slot receives the raw source
 *               (or nil when there is none).
 *   cProcIns    attr is stored; the TAG slot keeps the tag.
 *   otherwise   only the TAG slot is set.
 *
 * Expects ele, tag, attr, ec, raw, rawlen, sym and S in scope; note that it
 * may reassign `tag` and `attr`.  The expansion deliberately ends without a
 * semicolon.
 */
#define H_ELE(klass) \
  ele = rb_obj_alloc(klass); \
  if (klass == cElem) { \
    H_ELE_SET(ele, H_ELE_TAG, tag); \
    H_ELE_SET(ele, H_ELE_ATTR, attr); \
    H_ELE_SET(ele, H_ELE_EC, ec); \
    if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_doctype)) { \
      VALUE raw_str = rb_str_new(raw, rawlen); \
      ASSOCIATE_INDEX(raw_str); \
      H_ELE_SET(ele, H_ELE_RAW, raw_str); \
    } \
  } else if (klass == cBogusETag) { \
    H_ELE_SET(ele, H_ELE_TAG, tag); \
    if (raw != NULL) { \
      VALUE raw_str = rb_str_new(raw, rawlen); \
      ASSOCIATE_INDEX(raw_str); \
      H_ELE_SET(ele, H_ELE_ATTR, raw_str); \
    } \
  } else if (klass == cDocType || klass == cProcIns || klass == cXMLDecl) { \
    if (klass == cDocType) { \
      ATTR(ID2SYM(rb_intern("target")), tag); \
    } \
    H_ELE_SET(ele, H_ELE_ATTR, attr); \
    if (klass != cProcIns) { \
      tag = Qnil; \
      if (raw != NULL) { \
        tag = rb_str_new(raw, rawlen); \
        ASSOCIATE_INDEX(tag); \
      } \
    } \
    H_ELE_SET(ele, H_ELE_TAG, tag); \
  } else { \
    H_ELE_SET(ele, H_ELE_TAG, tag); \
  } \
  S->last = ele
303
+
304
+ //
305
+ // the swift, compact parser logic. most of the complicated stuff is done
306
+ // in the lexer. this step just pairs up the start and end tags.
307
+ //
308
+ void
309
+ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr,
310
+ char *raw, int rawlen, int taint
311
+ #ifdef HAVE_RUBY_ENCODING_H
312
+ , int encoding_index
313
+ #endif
314
+ )
315
+ {
316
+ VALUE ele, ec = Qnil;
317
+
318
+ //
319
+ // in html mode, fix up start tags incorrectly formed as empty tags
320
+ //
321
+ if (!S->xml) {
322
+ if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
323
+ ec = rb_hash_aref(S->EC, tag);
324
+ if (NIL_P(ec)) {
325
+ tag = rb_funcall(tag, s_downcase, 0);
326
+ ec = rb_hash_aref(S->EC, tag);
327
+ }
328
+ }
329
+
330
+ if (H_ELE_GET(S->focus, H_ELE_EC) == sym_CDATA &&
331
+ (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
332
+ !(sym == sym_etag && INT2FIX(rb_str_hash(tag)) == H_ELE_GET(S->focus, H_ELE_HASH)))
333
+ {
334
+ sym = sym_text;
335
+ tag = rb_str_new(raw, rawlen);
336
+ ASSOCIATE_INDEX(tag);
337
+ }
338
+
339
+ if (!NIL_P(ec)) {
340
+ if (sym == sym_emptytag) {
341
+ if (ec != sym_EMPTY)
342
+ sym = sym_stag;
343
+ } else if (sym == sym_stag) {
344
+ if (ec == sym_EMPTY)
345
+ sym = sym_emptytag;
346
+ }
347
+ }
348
+ }
349
+
350
+ if (sym == sym_emptytag || sym == sym_stag) {
351
+ VALUE name = INT2FIX(rb_str_hash(tag));
352
+ H_ELE(cElem);
353
+ H_ELE_SET(ele, H_ELE_HASH, name);
354
+
355
+ if (!S->xml) {
356
+ VALUE match = Qnil, e = S->focus;
357
+ while (e != S->doc)
358
+ {
359
+ VALUE hEC = H_ELE_GET(e, H_ELE_EC);
360
+
361
+ if (TYPE(hEC) == T_HASH)
362
+ {
363
+ VALUE has = rb_hash_lookup(hEC, name);
364
+ if (has != Qnil) {
365
+ if (has == Qtrue) {
366
+ if (match == Qnil)
367
+ match = e;
368
+ } else if (has == symAllow) {
369
+ match = S->focus;
370
+ } else if (has == symDeny) {
371
+ match = Qnil;
372
+ }
373
+ }
374
+ }
375
+
376
+ e = H_ELE_GET(e, H_ELE_PARENT);
377
+ }
378
+
379
+ if (match == Qnil)
380
+ match = S->focus;
381
+ S->focus = match;
382
+ }
383
+
384
+ rb_hpricot_add(S->focus, ele);
385
+
386
+ //
387
+ // in the case of a start tag that should be empty, just
388
+ // skip the step that focuses the element. focusing moves
389
+ // us deeper into the document.
390
+ //
391
+ if (sym == sym_stag) {
392
+ if (S->xml || ec != sym_EMPTY) {
393
+ S->focus = ele;
394
+ S->last = Qnil;
395
+ }
396
+ }
397
+ } else if (sym == sym_etag) {
398
+ VALUE name, match = Qnil, e = S->focus;
399
+ if (S->strict) {
400
+ if (NIL_P(rb_hash_aref(S->EC, tag))) {
401
+ tag = rb_str_new2("div");
402
+ ASSOCIATE_INDEX(tag);
403
+ }
404
+ }
405
+
406
+ //
407
+ // another optimization will be to improve this very simple
408
+ // O(n) tag search, where n is the depth of the focused tag.
409
+ //
410
+ // (see also: the search above for fixups)
411
+ //
412
+ name = INT2FIX(rb_str_hash(tag));
413
+ while (e != S->doc)
414
+ {
415
+ if (H_ELE_GET(e, H_ELE_HASH) == name)
416
+ {
417
+ match = e;
418
+ break;
419
+ }
420
+
421
+ e = H_ELE_GET(e, H_ELE_PARENT);
422
+ }
423
+
424
+ if (NIL_P(match))
425
+ {
426
+ H_ELE(cBogusETag);
427
+ rb_hpricot_add(S->focus, ele);
428
+ }
429
+ else
430
+ {
431
+ VALUE ele = Qnil;
432
+ if (raw != NULL) {
433
+ ele = rb_str_new(raw, rawlen);
434
+ ASSOCIATE_INDEX(ele);
435
+ }
436
+ H_ELE_SET(match, H_ELE_ETAG, ele);
437
+ S->focus = H_ELE_GET(match, H_ELE_PARENT);
438
+ S->last = Qnil;
439
+ }
440
+ } else if (sym == sym_cdata) {
441
+ H_ELE(cCData);
442
+ rb_hpricot_add(S->focus, ele);
443
+ } else if (sym == sym_comment) {
444
+ H_ELE(cComment);
445
+ rb_hpricot_add(S->focus, ele);
446
+ } else if (sym == sym_doctype) {
447
+ H_ELE(cDocType);
448
+ if (S->strict) {
449
+ VALUE id;
450
+ id = rb_str_new2("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
451
+ ASSOCIATE_INDEX(id);
452
+ rb_hash_aset(attr, ID2SYM(rb_intern("system_id")), id);
453
+ id = rb_str_new2("-//W3C//DTD XHTML 1.0 Strict//EN");
454
+ ASSOCIATE_INDEX(id);
455
+ rb_hash_aset(attr, ID2SYM(rb_intern("public_id")), id);
456
+ }
457
+ rb_hpricot_add(S->focus, ele);
458
+ } else if (sym == sym_procins) {
459
+ VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
460
+ tag = rb_reg_nth_match(1, match);
461
+ attr = rb_reg_nth_match(2, match);
462
+ {
463
+ H_ELE(cProcIns);
464
+ rb_hpricot_add(S->focus, ele);
465
+ }
466
+ } else if (sym == sym_text) {
467
+ // TODO: add raw_string as well?
468
+ if (!NIL_P(S->last) && RTEST(rb_obj_is_instance_of(S->last, cText))) {
469
+ rb_str_append(H_ELE_GET(S->last, H_ELE_TAG), tag);
470
+ } else {
471
+ H_ELE(cText);
472
+ rb_hpricot_add(S->focus, ele);
473
+ }
474
+ } else if (sym == sym_xmldecl) {
475
+ H_ELE(cXMLDecl);
476
+ rb_hpricot_add(S->focus, ele);
477
+ }
478
+ }
479
+
480
+ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
481
+ {
482
+ int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
483
+ char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
484
+
485
+ hpricot_state *S = NULL;
486
+ VALUE port, opts;
487
+ VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
488
+ char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
489
+ int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
490
+ #ifdef HAVE_RUBY_ENCODING_H
491
+ int encoding_index = rb_enc_to_index(rb_default_external_encoding());
492
+ #endif
493
+
494
+ rb_scan_args(argc, argv, "11", &port, &opts);
495
+ taint = OBJ_TAINTED(port);
496
+ io = rb_respond_to(port, s_read);
497
+ if (!io)
498
+ {
499
+ if (rb_respond_to(port, s_to_str))
500
+ {
501
+ port = rb_funcall(port, s_to_str, 0);
502
+ StringValue(port);
503
+ }
504
+ else
505
+ {
506
+ rb_raise(rb_eArgError, "an Hpricot document must be built from an input source (a String or IO object.)");
507
+ }
508
+ }
509
+
510
+ if (TYPE(opts) != T_HASH)
511
+ opts = Qnil;
512
+
513
+ if (!rb_block_given_p())
514
+ {
515
+ S = ALLOC(hpricot_state);
516
+ S->doc = rb_obj_alloc(cDoc);
517
+ rb_gc_register_address(&S->doc);
518
+ S->focus = S->doc;
519
+ S->last = Qnil;
520
+ S->xml = OPT(opts, xml);
521
+ S->strict = OPT(opts, xhtml_strict);
522
+ S->fixup = OPT(opts, fixup_tags);
523
+ if (S->strict) S->fixup = 1;
524
+ rb_ivar_set(S->doc, rb_intern("@options"), opts);
525
+
526
+ S->EC = rb_const_get(mHpricot, s_ElementContent);
527
+ }
528
+
529
+ buffer_size = BUFSIZE;
530
+ if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
531
+ bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
532
+ if (!NIL_P(bufsize)) {
533
+ buffer_size = NUM2INT(bufsize);
534
+ }
535
+ }
536
+
537
+ if (io)
538
+ buf = ALLOC_N(char, buffer_size);
539
+
540
+ %% write init;
541
+
542
+ while (!done) {
543
+ VALUE str;
544
+ char *p, *pe;
545
+ int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
546
+
547
+ if (io)
548
+ {
549
+ if (space == 0) {
550
+ /* We've used up the entire buffer storing an already-parsed token
551
+ * prefix that must be preserved. Likely caused by super-long attributes.
552
+ * Increase buffer size and continue */
553
+ tokstart_diff = ts - buf;
554
+ tokend_diff = te - buf;
555
+ mark_tag_diff = mark_tag - buf;
556
+ mark_akey_diff = mark_akey - buf;
557
+ mark_aval_diff = mark_aval - buf;
558
+
559
+ buffer_size += BUFSIZE;
560
+ REALLOC_N(buf, char, buffer_size);
561
+
562
+ space = buffer_size - have;
563
+
564
+ ts = buf + tokstart_diff;
565
+ te = buf + tokend_diff;
566
+ mark_tag = buf + mark_tag_diff;
567
+ mark_akey = buf + mark_akey_diff;
568
+ mark_aval = buf + mark_aval_diff;
569
+ }
570
+ p = buf + have;
571
+
572
+ str = rb_funcall(port, s_read, 1, INT2FIX(space));
573
+ len = RSTRING_LEN(str);
574
+ memcpy(p, StringValuePtr(str), len);
575
+ }
576
+ else
577
+ {
578
+ p = RSTRING_PTR(port);
579
+ len = RSTRING_LEN(port) + 1;
580
+ done = 1;
581
+ }
582
+
583
+ nread += len;
584
+
585
+ /* If this is the last buffer, tack on an EOF. */
586
+ if (io && len < space) {
587
+ p[len++] = 0;
588
+ done = 1;
589
+ }
590
+
591
+ pe = p + len;
592
+ %% write exec;
593
+
594
+ if (cs == hpricot_scan_error) {
595
+ if (buf != NULL)
596
+ free(buf);
597
+ if (!NIL_P(tag))
598
+ {
599
+ rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
600
+ }
601
+ else
602
+ {
603
+ rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
604
+ }
605
+ }
606
+
607
+ if (done && ele_open)
608
+ {
609
+ ele_open = 0;
610
+ if (ts > 0) {
611
+ mark_tag = ts;
612
+ ts = 0;
613
+ text = 1;
614
+ }
615
+ }
616
+
617
+ if (ts == 0)
618
+ {
619
+ have = 0;
620
+ /* text nodes have no ts because each byte is parsed alone */
621
+ if (mark_tag != NULL && text == 1)
622
+ {
623
+ if (done)
624
+ {
625
+ if (mark_tag < p-1)
626
+ {
627
+ CAT(tag, p-1);
628
+ ELE(text);
629
+ }
630
+ }
631
+ else
632
+ {
633
+ CAT(tag, p);
634
+ }
635
+ }
636
+ if (io)
637
+ mark_tag = buf;
638
+ else
639
+ mark_tag = RSTRING_PTR(port);
640
+ }
641
+ else if (io)
642
+ {
643
+ have = pe - ts;
644
+ memmove(buf, ts, have);
645
+ SLIDE(tag);
646
+ SLIDE(akey);
647
+ SLIDE(aval);
648
+ te = buf + (te - ts);
649
+ ts = buf;
650
+ }
651
+ }
652
+
653
+ if (buf != NULL)
654
+ free(buf);
655
+
656
+ if (S != NULL)
657
+ {
658
+ VALUE doc = S->doc;
659
+ rb_gc_unregister_address(&S->doc);
660
+ free(S);
661
+ return doc;
662
+ }
663
+
664
+ return Qnil;
665
+ }
666
+
667
+ void hstruct_mark(void* ptr) {
668
+ struct hpricot_struct* st = (struct hpricot_struct*)ptr;
669
+ int i;
670
+
671
+ for(i = 0; i < st->len; i++) {
672
+ rb_gc_mark(st->ptr[i]);
673
+ }
674
+ }
675
+
676
+ void hstruct_free(void* ptr) {
677
+ struct hpricot_struct* st = (struct hpricot_struct*)ptr;
678
+
679
+ free(st->ptr);
680
+ free(st);
681
+ }
682
+
683
+ static VALUE
684
+ alloc_hpricot_struct8(VALUE klass)
685
+ {
686
+ VALUE obj;
687
+ struct hpricot_struct* st;
688
+
689
+ obj = Data_Make_Struct(klass, struct hpricot_struct, hstruct_mark, hstruct_free, st);
690
+
691
+ st->len = 8;
692
+ st->ptr = ALLOC_N(VALUE, 8);
693
+
694
+ rb_mem_clear(st->ptr, 8);
695
+
696
+ return obj;
697
+ }
698
+
699
+ static VALUE
700
+ alloc_hpricot_struct2(VALUE klass)
701
+ {
702
+ VALUE obj;
703
+ struct hpricot_struct* st;
704
+
705
+ obj = Data_Make_Struct(klass, struct hpricot_struct, hstruct_mark, hstruct_free, st);
706
+
707
+ st->len = 2;
708
+ st->ptr = ALLOC_N(VALUE, 2);
709
+
710
+ rb_mem_clear(st->ptr, 2);
711
+
712
+ return obj;
713
+ }
714
+
715
+ static VALUE
716
+ alloc_hpricot_struct3(VALUE klass)
717
+ {
718
+ VALUE obj;
719
+ struct hpricot_struct* st;
720
+
721
+ obj = Data_Make_Struct(klass, struct hpricot_struct, hstruct_mark, hstruct_free, st);
722
+
723
+ st->len = 3;
724
+ st->ptr = ALLOC_N(VALUE, 3);
725
+
726
+ rb_mem_clear(st->ptr, 3);
727
+
728
+ return obj;
729
+ }
730
+
731
+ static VALUE hpricot_struct_ref0(VALUE obj) {return H_ELE_GET(obj, 0);}
732
+ static VALUE hpricot_struct_ref1(VALUE obj) {return H_ELE_GET(obj, 1);}
733
+ static VALUE hpricot_struct_ref2(VALUE obj) {return H_ELE_GET(obj, 2);}
734
+ static VALUE hpricot_struct_ref3(VALUE obj) {return H_ELE_GET(obj, 3);}
735
+ static VALUE hpricot_struct_ref4(VALUE obj) {return H_ELE_GET(obj, 4);}
736
+ static VALUE hpricot_struct_ref5(VALUE obj) {return H_ELE_GET(obj, 5);}
737
+ static VALUE hpricot_struct_ref6(VALUE obj) {return H_ELE_GET(obj, 6);}
738
+ static VALUE hpricot_struct_ref7(VALUE obj) {return H_ELE_GET(obj, 7);}
739
+ static VALUE hpricot_struct_ref8(VALUE obj) {return H_ELE_GET(obj, 8);}
740
+ static VALUE hpricot_struct_ref9(VALUE obj) {return H_ELE_GET(obj, 9);}
741
+
742
+ static VALUE (*ref_func[10])() = {
743
+ hpricot_struct_ref0,
744
+ hpricot_struct_ref1,
745
+ hpricot_struct_ref2,
746
+ hpricot_struct_ref3,
747
+ hpricot_struct_ref4,
748
+ hpricot_struct_ref5,
749
+ hpricot_struct_ref6,
750
+ hpricot_struct_ref7,
751
+ hpricot_struct_ref8,
752
+ hpricot_struct_ref9,
753
+ };
754
+
755
+ static VALUE hpricot_struct_set0(VALUE obj, VALUE val) {return H_ELE_SET(obj, 0, val);}
756
+ static VALUE hpricot_struct_set1(VALUE obj, VALUE val) {return H_ELE_SET(obj, 1, val);}
757
+ static VALUE hpricot_struct_set2(VALUE obj, VALUE val) {return H_ELE_SET(obj, 2, val);}
758
+ static VALUE hpricot_struct_set3(VALUE obj, VALUE val) {return H_ELE_SET(obj, 3, val);}
759
+ static VALUE hpricot_struct_set4(VALUE obj, VALUE val) {return H_ELE_SET(obj, 4, val);}
760
+ static VALUE hpricot_struct_set5(VALUE obj, VALUE val) {return H_ELE_SET(obj, 5, val);}
761
+ static VALUE hpricot_struct_set6(VALUE obj, VALUE val) {return H_ELE_SET(obj, 6, val);}
762
+ static VALUE hpricot_struct_set7(VALUE obj, VALUE val) {return H_ELE_SET(obj, 7, val);}
763
+ static VALUE hpricot_struct_set8(VALUE obj, VALUE val) {return H_ELE_SET(obj, 8, val);}
764
+ static VALUE hpricot_struct_set9(VALUE obj, VALUE val) {return H_ELE_SET(obj, 9, val);}
765
+
766
+ static VALUE (*set_func[10])() = {
767
+ hpricot_struct_set0,
768
+ hpricot_struct_set1,
769
+ hpricot_struct_set2,
770
+ hpricot_struct_set3,
771
+ hpricot_struct_set4,
772
+ hpricot_struct_set5,
773
+ hpricot_struct_set6,
774
+ hpricot_struct_set7,
775
+ hpricot_struct_set8,
776
+ hpricot_struct_set9,
777
+ };
778
+
779
+ static VALUE
780
+ make_hpricot_struct(VALUE members, VALUE (*alloc)(VALUE klass))
781
+ {
782
+ int i = 0;
783
+ char attr_set[128];
784
+
785
+ VALUE klass = rb_class_new(rb_cObject);
786
+ rb_define_alloc_func(klass, alloc);
787
+
788
+ int len = RARRAY_LEN(members);
789
+ assert(len < 10);
790
+
791
+ for (i = 0; i < len; i++) {
792
+ ID id = SYM2ID(rb_ary_entry(members, i));
793
+ const char* name = rb_id2name(id);
794
+ int len = strlen(name);
795
+
796
+ memcpy(attr_set, name, strlen(name));
797
+ attr_set[len] = '=';
798
+ attr_set[len+1] = 0;
799
+
800
+ rb_define_method(klass, name, ref_func[i], 0);
801
+ rb_define_method(klass, attr_set, set_func[i], 1);
802
+ }
803
+ return klass;
804
+ }
805
+
806
+ void Init_hpricot_scan()
807
+ {
808
+ VALUE structElem, structAttr, structBasic;
809
+
810
+ s_ElementContent = rb_intern("ElementContent");
811
+ symAllow = ID2SYM(rb_intern("allow"));
812
+ symDeny = ID2SYM(rb_intern("deny"));
813
+ s_downcase = rb_intern("downcase");
814
+ s_new = rb_intern("new");
815
+ s_parent = rb_intern("parent");
816
+ s_read = rb_intern("read");
817
+ s_to_str = rb_intern("to_str");
818
+ sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
819
+ sym_doctype = ID2SYM(rb_intern("doctype"));
820
+ sym_procins = ID2SYM(rb_intern("procins"));
821
+ sym_stag = ID2SYM(rb_intern("stag"));
822
+ sym_etag = ID2SYM(rb_intern("etag"));
823
+ sym_emptytag = ID2SYM(rb_intern("emptytag"));
824
+ sym_allowed = ID2SYM(rb_intern("allowed"));
825
+ sym_children = ID2SYM(rb_intern("children"));
826
+ sym_comment = ID2SYM(rb_intern("comment"));
827
+ sym_cdata = ID2SYM(rb_intern("cdata"));
828
+ sym_name = ID2SYM(rb_intern("name"));
829
+ sym_parent = ID2SYM(rb_intern("parent"));
830
+ sym_raw_attributes = ID2SYM(rb_intern("raw_attributes"));
831
+ sym_raw_string = ID2SYM(rb_intern("raw_string"));
832
+ sym_tagno = ID2SYM(rb_intern("tagno"));
833
+ sym_text = ID2SYM(rb_intern("text"));
834
+ sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
835
+ sym_CDATA = ID2SYM(rb_intern("CDATA"));
836
+
837
+ mHpricot = rb_define_module("Hpricot");
838
+ rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
839
+ rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
840
+ rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
841
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
842
+
843
+ structElem = make_hpricot_struct(rb_ary_new3(8, sym_name, sym_parent,
844
+ sym_raw_attributes, sym_etag, sym_raw_string, sym_allowed,
845
+ sym_tagno, sym_children), alloc_hpricot_struct8);
846
+ structAttr = make_hpricot_struct(
847
+ rb_ary_new3(3, sym_name, sym_parent, sym_raw_attributes),
848
+ alloc_hpricot_struct3);
849
+ structBasic = make_hpricot_struct(
850
+ rb_ary_new3(2, sym_name, sym_parent),
851
+ alloc_hpricot_struct2);
852
+
853
+ cDoc = rb_define_class_under(mHpricot, "Doc", structElem);
854
+ cCData = rb_define_class_under(mHpricot, "CData", structBasic);
855
+ rb_define_method(cCData, "content", hpricot_ele_get_name, 0);
856
+ rb_define_method(cCData, "content=", hpricot_ele_set_name, 1);
857
+ cComment = rb_define_class_under(mHpricot, "Comment", structBasic);
858
+ rb_define_method(cComment, "content", hpricot_ele_get_name, 0);
859
+ rb_define_method(cComment, "content=", hpricot_ele_set_name, 1);
860
+ cDocType = rb_define_class_under(mHpricot, "DocType", structAttr);
861
+ rb_define_method(cDocType, "raw_string", hpricot_ele_get_name, 0);
862
+ rb_define_method(cDocType, "clear_raw", hpricot_ele_clear_name, 0);
863
+ rb_define_method(cDocType, "target", hpricot_ele_get_target, 0);
864
+ rb_define_method(cDocType, "target=", hpricot_ele_set_target, 1);
865
+ rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
866
+ rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
867
+ rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
868
+ rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
869
+ cElem = rb_define_class_under(mHpricot, "Elem", structElem);
870
+ rb_define_method(cElem, "clear_raw", hpricot_ele_clear_raw, 0);
871
+ cBogusETag = rb_define_class_under(mHpricot, "BogusETag", structAttr);
872
+ rb_define_method(cBogusETag, "raw_string", hpricot_ele_get_attr, 0);
873
+ rb_define_method(cBogusETag, "clear_raw", hpricot_ele_clear_attr, 0);
874
+ cText = rb_define_class_under(mHpricot, "Text", structBasic);
875
+ rb_define_method(cText, "raw_string", hpricot_ele_get_name, 0);
876
+ rb_define_method(cText, "clear_raw", hpricot_ele_clear_name, 0);
877
+ rb_define_method(cText, "content", hpricot_ele_get_name, 0);
878
+ rb_define_method(cText, "content=", hpricot_ele_set_name, 1);
879
+ cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", structAttr);
880
+ rb_define_method(cXMLDecl, "raw_string", hpricot_ele_get_name, 0);
881
+ rb_define_method(cXMLDecl, "clear_raw", hpricot_ele_clear_name, 0);
882
+ rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
883
+ rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
884
+ rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
885
+ rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
886
+ rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
887
+ rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
888
+ cProcIns = rb_define_class_under(mHpricot, "ProcIns", structAttr);
889
+ rb_define_method(cProcIns, "target", hpricot_ele_get_name, 0);
890
+ rb_define_method(cProcIns, "target=", hpricot_ele_set_name, 1);
891
+ rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
892
+ rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
893
+
894
+ rb_const_set(mHpricot, rb_intern("ProcInsParse"),
895
+ reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
896
+ }