thbar-hpricot 0.8.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. data/CHANGELOG +104 -0
  2. data/COPYING +18 -0
  3. data/README.md +276 -0
  4. data/Rakefile +234 -0
  5. data/ext/fast_xs/FastXsService.java +1123 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +210 -0
  8. data/ext/hpricot_scan/HpricotCss.java +850 -0
  9. data/ext/hpricot_scan/HpricotScanService.java +2099 -0
  10. data/ext/hpricot_scan/extconf.rb +9 -0
  11. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  12. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  13. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  14. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  15. data/ext/hpricot_scan/hpricot_scan.c +7045 -0
  16. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  17. data/ext/hpricot_scan/hpricot_scan.java.rl +1161 -0
  18. data/ext/hpricot_scan/hpricot_scan.rl +902 -0
  19. data/extras/hpricot.png +0 -0
  20. data/lib/hpricot.rb +26 -0
  21. data/lib/hpricot/blankslate.rb +63 -0
  22. data/lib/hpricot/builder.rb +216 -0
  23. data/lib/hpricot/elements.rb +514 -0
  24. data/lib/hpricot/htmlinfo.rb +691 -0
  25. data/lib/hpricot/inspect.rb +103 -0
  26. data/lib/hpricot/modules.rb +40 -0
  27. data/lib/hpricot/parse.rb +38 -0
  28. data/lib/hpricot/tag.rb +219 -0
  29. data/lib/hpricot/tags.rb +164 -0
  30. data/lib/hpricot/traverse.rb +839 -0
  31. data/lib/hpricot/xchar.rb +94 -0
  32. data/test/files/basic.xhtml +17 -0
  33. data/test/files/boingboing.html +2266 -0
  34. data/test/files/cy0.html +3653 -0
  35. data/test/files/immob.html +400 -0
  36. data/test/files/pace_application.html +1320 -0
  37. data/test/files/tenderlove.html +16 -0
  38. data/test/files/uswebgen.html +220 -0
  39. data/test/files/utf8.html +1054 -0
  40. data/test/files/week9.html +1723 -0
  41. data/test/files/why.xml +19 -0
  42. data/test/load_files.rb +7 -0
  43. data/test/nokogiri-bench.rb +64 -0
  44. data/test/test_alter.rb +96 -0
  45. data/test/test_builder.rb +37 -0
  46. data/test/test_parser.rb +457 -0
  47. data/test/test_paths.rb +25 -0
  48. data/test/test_preserved.rb +88 -0
  49. data/test/test_xml.rb +28 -0
  50. metadata +124 -0
@@ -0,0 +1,902 @@
1
+ /*
2
+ * hpricot_scan.rl
3
+ *
4
+ * $Author: why $
5
+ * $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
6
+ *
7
+ * Copyright (C) 2006, 2010 why the lucky stiff
8
+ */
9
+ #include <ruby.h>
10
+ #include <assert.h>
11
+
12
+ struct hpricot_struct {
13
+ int len;
14
+ VALUE* ptr;
15
+ };
16
+
17
/*
 * Fallback accessors for Ruby headers that do not provide them.  Each macro
 * has its own guard so that a header defining only some of them neither
 * loses the rest nor gets redefined, and each expansion is parenthesized so
 * it can be used inside larger expressions.
 */
#ifndef RARRAY_LEN
#define RARRAY_LEN(arr) (RARRAY(arr)->len)
#endif
#ifndef RSTRING_LEN
#define RSTRING_LEN(str) (RSTRING(str)->len)
#endif
#ifndef RSTRING_PTR
#define RSTRING_PTR(str) (RSTRING(str)->ptr)
#endif
22
+
23
+ VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
24
+
25
+ #define NO_WAY_SERIOUSLY "*** This should not happen, please file a bug report with the HTML you're parsing at http://github.com/hpricot/hpricot/issues. So sorry!"
26
+
27
+ static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
28
+ sym_cdata, sym_name, sym_parent, sym_raw_attributes, sym_raw_string, sym_tagno,
29
+ sym_allowed, sym_text, sym_children, sym_EMPTY, sym_CDATA;
30
+ static VALUE mHpricot, rb_eHpricotParseError;
31
+ static VALUE cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cText,
32
+ cXMLDecl, cProcIns, symAllow, symDeny;
33
+ static ID s_ElementContent;
34
+ static ID s_downcase, s_new, s_parent, s_read, s_to_str;
35
+ static VALUE reProcInsParse;
36
+
37
+ #define H_ELE_TAG 0
38
+ #define H_ELE_PARENT 1
39
+ #define H_ELE_ATTR 2
40
+ #define H_ELE_ETAG 3
41
+ #define H_ELE_RAW 4
42
+ #define H_ELE_EC 5
43
+ #define H_ELE_HASH 6
44
+ #define H_ELE_CHILDREN 7
45
+
46
/*
 * Slot access for node objects whose DATA_PTR is a struct hpricot_struct.
 * Arguments and whole expansions are parenthesized so the macros are safe
 * inside larger expressions (H_ELE_SET is used as the operand of `return`
 * and receives assignment expressions as its value argument).
 * No bounds check is performed against the struct's `len`.
 */
#define HSTRUCT_PTR(ele) (((struct hpricot_struct *)DATA_PTR(ele))->ptr)

/* Read slot `idx` of node `ele`; the result is still an lvalue. */
#define H_ELE_GET(ele, idx) (HSTRUCT_PTR(ele)[(idx)])
/* Store `val` into slot `idx` of node `ele`; evaluates to the stored value. */
#define H_ELE_SET(ele, idx, val) (HSTRUCT_PTR(ele)[(idx)] = (val))
50
+
51
+ #define OPT(opts, key) (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("" # key)))))
52
+
53
+ #ifdef HAVE_RUBY_ENCODING_H
54
+ #include <ruby/encoding.h>
55
+ # define ASSOCIATE_INDEX(s) rb_enc_associate_index((s), encoding_index)
56
+ # define ENCODING_INDEX , encoding_index
57
+ #else
58
+ # define ASSOCIATE_INDEX(s)
59
+ # define ENCODING_INDEX
60
+ #endif
61
+
62
/*
 * ELE(N): emit a token of kind sym_N for the span ts..te (or for pending
 * text when `text` is set).
 *
 * The raw markup (ts..te) is captured for every token kind except cdata,
 * text, procins and comment.  When a block was given to the scan call the
 * token is yielded as [sym, tag, attr, raw_string]; otherwise it is handed
 * to rb_hpricot_token() to grow the document tree.
 *
 * Fix: the yielded token now carries the raw_string that is built here.
 * Previously the string was created and then discarded, and Qnil was
 * yielded in its place.
 *
 * Expands in a scope that provides ts, te, text, ele_open, tag, attr, taint
 * and S (and encoding_index when encodings are available).  The expansion
 * is a bare `if` statement, as before, so existing call sites keep working.
 */
#define ELE(N) \
  if (te > ts || text == 1) { \
    char *raw = NULL; \
    int rawlen = 0; \
    ele_open = 0; text = 0; \
    if (ts != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
      raw = ts; rawlen = te - ts; \
    } \
    if (rb_block_given_p()) { \
      VALUE raw_string = Qnil; \
      if (raw != NULL) { \
        raw_string = rb_str_new(raw, rawlen); \
        ASSOCIATE_INDEX(raw_string); \
      } \
      rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
    } else \
      rb_hpricot_token(S, sym_##N, tag, attr, raw, rawlen, taint ENCODING_INDEX); \
  }
80
+
81
/*
 * SET(N, E): assign to N a new string holding the bytes from mark_N up to E.
 * A NULL mark or an empty span produces ""; when E lies before the mark,
 * N is left untouched.
 */
#define SET(N, E) \
  if (mark_##N == NULL || E == mark_##N) \
  { \
    N = rb_str_new2(""); \
    ASSOCIATE_INDEX(N); \
  } \
  else if (E > mark_##N) \
  { \
    N = rb_str_new(mark_##N, E - mark_##N); \
    ASSOCIATE_INDEX(N); \
  }

/* CAT(N, E): like SET when N is still nil, otherwise append mark_N..E to N. */
#define CAT(N, E) \
  if (NIL_P(N)) { \
    SET(N, E); \
  } else { \
    rb_str_cat(N, mark_##N, E - mark_##N); \
  }

/*
 * SLIDE(N): after the unconsumed token has been moved to the front of buf,
 * re-point mark_N (only when it lies past ts) to the matching offset.
 */
#define SLIDE(N) \
  if (mark_##N > ts) \
    mark_##N = buf + (mark_##N - ts);

/* ATTR(K, V): store V under K in `attr`, creating the hash on first use. */
#define ATTR(K, V) \
  if (!NIL_P(K)) { \
    if (NIL_P(attr)) { \
      attr = rb_hash_new(); \
    } \
    rb_hash_aset(attr, K, V); \
  }
99
+
100
/*
 * TEXT_PASS(): switch into text-collecting mode unless already in it.
 * If an element was open, the text starts at ts (when ts is set); otherwise
 * it starts at the current position p.  Pending tag/attr values are dropped.
 */
#define TEXT_PASS() \
  if (text == 0) { \
    if (ele_open == 1) { \
      ele_open = 0; \
      if (ts > 0) { \
        mark_tag = ts; \
      } \
    } \
    else { \
      mark_tag = p; \
    } \
    attr = Qnil; \
    tag = Qnil; \
    text = 1; \
  }

/*
 * EBLK(N, T): append the bytes up to (p - T + 1) to `tag`, then emit a
 * token of kind sym_N.
 */
#define EBLK(N, T) \
  CAT(tag, p - T + 1); \
  ELE(N);
117
+
118
+ %%{
119
+ machine hpricot_scan;
120
+
121
+ action newEle {
122
+ if (text == 1) {
123
+ CAT(tag, p);
124
+ ELE(text);
125
+ text = 0;
126
+ }
127
+ attr = Qnil;
128
+ tag = Qnil;
129
+ mark_tag = NULL;
130
+ ele_open = 1;
131
+ }
132
+
133
+ action _tag { mark_tag = p; }
134
+ action _aval { mark_aval = p; }
135
+ action _akey { mark_akey = p; }
136
+ action tag { SET(tag, p); }
137
+ action tagc { SET(tag, p-1); }
138
+ action aval { SET(aval, p); }
139
+ action aunq {
140
+ if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
141
+ else { SET(aval, p); }
142
+ }
143
+ action akey { SET(akey, p); }
144
+ action xmlver { SET(aval, p); ATTR(ID2SYM(rb_intern("version")), aval); }
145
+ action xmlenc {
146
+ #ifdef HAVE_RUBY_ENCODING_H
147
+ if (mark_aval < p) {
148
+ char psave = *p;
149
+ *p = '\0';
150
+ encoding_index = rb_enc_find_index(mark_aval);
151
+ *p = psave;
152
+ }
153
+ #endif
154
+ SET(aval, p);
155
+ ATTR(ID2SYM(rb_intern("encoding")), aval);
156
+ }
157
+ action xmlsd { SET(aval, p); ATTR(ID2SYM(rb_intern("standalone")), aval); }
158
+ action pubid { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
159
+ action sysid { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }
160
+
161
+ action new_attr {
162
+ akey = Qnil;
163
+ aval = Qnil;
164
+ mark_akey = NULL;
165
+ mark_aval = NULL;
166
+ }
167
+
168
+ action save_attr {
169
+ if (!S->xml && !NIL_P(akey))
170
+ akey = rb_funcall(akey, s_downcase, 0);
171
+ ATTR(akey, aval);
172
+ }
173
+
174
+ include hpricot_common "hpricot_common.rl";
175
+
176
+ }%%
177
+
178
+ %% write data nofinal;
179
+
180
+ #define BUFSIZE 16384
181
+
182
+ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
183
+ {
184
+ VALUE ary;
185
+ if (sym == sym_text) {
186
+ raw = tag;
187
+ }
188
+ ary = rb_ary_new3(4, sym, tag, attr, raw);
189
+ if (taint) {
190
+ OBJ_TAINT(ary);
191
+ OBJ_TAINT(tag);
192
+ OBJ_TAINT(attr);
193
+ OBJ_TAINT(raw);
194
+ }
195
+ rb_yield(ary);
196
+ }
197
+
198
+ #ifndef RHASH_TBL
199
+ /* rb_hash_lookup() is only in Ruby 1.8.7 */
200
+ static VALUE
201
+ our_rb_hash_lookup(VALUE hash, VALUE key)
202
+ {
203
+ VALUE val;
204
+
205
+ if (!st_lookup(RHASH(hash)->tbl, key, &val)) {
206
+ return Qnil; /* without Hash#default */
207
+ }
208
+
209
+ return val;
210
+ }
211
+ #define rb_hash_lookup our_rb_hash_lookup
212
+ #endif
213
+
214
+ static void
215
+ rb_hpricot_add(VALUE focus, VALUE ele)
216
+ {
217
+ VALUE children = H_ELE_GET(focus, H_ELE_CHILDREN);
218
+ if (NIL_P(children))
219
+ H_ELE_SET(focus, H_ELE_CHILDREN, (children = rb_ary_new2(1)));
220
+ rb_ary_push(children, ele);
221
+ H_ELE_SET(ele, H_ELE_PARENT, focus);
222
+ }
223
+
224
+ typedef struct {
225
+ VALUE doc;
226
+ VALUE focus;
227
+ VALUE last;
228
+ VALUE EC;
229
+ unsigned char xml, strict, fixup;
230
+ } hpricot_state;
231
+
232
/*
 * H_PROP(prop, idx): generate three accessors for slot `idx` of a node:
 *   hpricot_ele_get_<prop>(self)      returns the slot value
 *   hpricot_ele_set_<prop>(self, x)   stores x, returns self
 *   hpricot_ele_clear_<prop>(self)    stores nil, returns true
 */
#define H_PROP(prop, idx) \
  static VALUE hpricot_ele_get_##prop(VALUE self) \
  { \
    return H_ELE_GET(self, idx); \
  } \
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) \
  { \
    H_ELE_SET(self, idx, x); \
    return self; \
  } \
  static VALUE hpricot_ele_clear_##prop(VALUE self) \
  { \
    H_ELE_SET(self, idx, Qnil); \
    return Qtrue; \
  }
244
+
245
+ #define H_ATTR(prop) \
246
+ static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
247
+ rb_hash_aset(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop)), x); \
248
+ return self; \
249
+ } \
250
+ static VALUE hpricot_ele_get_##prop(VALUE self) { \
251
+ return rb_hash_aref(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop))); \
252
+ }
253
+
254
+ H_PROP(name, H_ELE_TAG);
255
+ H_PROP(raw, H_ELE_RAW);
256
+ H_PROP(parent, H_ELE_PARENT);
257
+ H_PROP(attr, H_ELE_ATTR);
258
+ H_PROP(etag, H_ELE_ETAG);
259
+ H_PROP(children, H_ELE_CHILDREN);
260
+ H_ATTR(target);
261
+ H_ATTR(encoding);
262
+ H_ATTR(version);
263
+ H_ATTR(standalone);
264
+ H_ATTR(system_id);
265
+ H_ATTR(public_id);
266
+
267
/*
 * H_ELE(klass): allocate a node of `klass` into `ele`, fill its slots from
 * the surrounding token variables (tag, attr, ec, raw, rawlen, sym) and
 * record it as S->last.
 *
 *   cElem        TAG = tag, ATTR = attr, EC = ec; RAW = raw markup for
 *                emptytag/stag/doctype tokens that have it
 *   cBogusETag   TAG = tag; ATTR = raw markup when present
 *   cDocType, cXMLDecl
 *                ATTR = attr (DocType first adds :target => tag);
 *                TAG = raw markup, or nil when there is none
 *   cProcIns     ATTR = attr, TAG = tag
 *   other        TAG = tag
 *
 * Note that `tag` (and possibly `attr`) in the enclosing scope are
 * overwritten for the DocType/XMLDecl cases.  The expansion ends without a
 * semicolon; the call site supplies it.
 */
#define H_ELE(klass) \
  ele = rb_obj_alloc(klass); \
  if (klass == cElem) { \
    H_ELE_SET(ele, H_ELE_TAG, tag); \
    H_ELE_SET(ele, H_ELE_ATTR, attr); \
    H_ELE_SET(ele, H_ELE_EC, ec); \
    if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_doctype)) { \
      VALUE raw_str = rb_str_new(raw, rawlen); \
      ASSOCIATE_INDEX(raw_str); \
      H_ELE_SET(ele, H_ELE_RAW, raw_str); \
    } \
  } \
  else if (klass == cBogusETag) { \
    H_ELE_SET(ele, H_ELE_TAG, tag); \
    if (raw != NULL) { \
      VALUE raw_str = rb_str_new(raw, rawlen); \
      ASSOCIATE_INDEX(raw_str); \
      H_ELE_SET(ele, H_ELE_ATTR, raw_str); \
    } \
  } \
  else if (klass == cDocType || klass == cProcIns || klass == cXMLDecl) { \
    if (klass == cDocType) { \
      ATTR(ID2SYM(rb_intern("target")), tag); \
    } \
    H_ELE_SET(ele, H_ELE_ATTR, attr); \
    if (klass != cProcIns) { \
      tag = Qnil; \
      if (raw != NULL) { \
        tag = rb_str_new(raw, rawlen); \
        ASSOCIATE_INDEX(tag); \
      } \
    } \
    H_ELE_SET(ele, H_ELE_TAG, tag); \
  } \
  else { \
    H_ELE_SET(ele, H_ELE_TAG, tag); \
  } \
  S->last = ele
303
+
304
+ //
305
+ // the swift, compact parser logic. most of the complicated stuff is done
306
+ // in the lexer. this step just pairs up the start and end tags.
307
+ //
308
+ void
309
+ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr,
310
+ char *raw, int rawlen, int taint
311
+ #ifdef HAVE_RUBY_ENCODING_H
312
+ , int encoding_index
313
+ #endif
314
+ )
315
+ {
316
+ VALUE ele, ec = Qnil;
317
+
318
+ //
319
+ // in html mode, fix up start tags incorrectly formed as empty tags
320
+ //
321
+ if (!S->xml) {
322
+ if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
323
+ ec = rb_hash_aref(S->EC, tag);
324
+ if (NIL_P(ec)) {
325
+ tag = rb_funcall(tag, s_downcase, 0);
326
+ ec = rb_hash_aref(S->EC, tag);
327
+ }
328
+ }
329
+
330
+ if (H_ELE_GET(S->focus, H_ELE_EC) == sym_CDATA &&
331
+ (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
332
+ !(sym == sym_etag && INT2FIX(rb_str_hash(tag)) == H_ELE_GET(S->focus, H_ELE_HASH)))
333
+ {
334
+ sym = sym_text;
335
+ tag = rb_str_new(raw, rawlen);
336
+ ASSOCIATE_INDEX(tag);
337
+ }
338
+
339
+ if (!NIL_P(ec)) {
340
+ if (sym == sym_emptytag) {
341
+ if (ec != sym_EMPTY)
342
+ sym = sym_stag;
343
+ } else if (sym == sym_stag) {
344
+ if (ec == sym_EMPTY)
345
+ sym = sym_emptytag;
346
+ }
347
+ }
348
+ }
349
+
350
+ if (sym == sym_emptytag || sym == sym_stag) {
351
+ VALUE name = INT2FIX(rb_str_hash(tag));
352
+ H_ELE(cElem);
353
+ H_ELE_SET(ele, H_ELE_HASH, name);
354
+
355
+ if (!S->xml) {
356
+ VALUE match = Qnil, e = S->focus;
357
+ while (e != S->doc)
358
+ {
359
+ VALUE hEC = H_ELE_GET(e, H_ELE_EC);
360
+
361
+ if (TYPE(hEC) == T_HASH)
362
+ {
363
+ VALUE has = rb_hash_lookup(hEC, name);
364
+ if (has != Qnil) {
365
+ if (has == Qtrue) {
366
+ if (match == Qnil)
367
+ match = e;
368
+ } else if (has == symAllow) {
369
+ match = S->focus;
370
+ } else if (has == symDeny) {
371
+ match = Qnil;
372
+ }
373
+ }
374
+ }
375
+
376
+ e = H_ELE_GET(e, H_ELE_PARENT);
377
+ }
378
+
379
+ if (match == Qnil)
380
+ match = S->focus;
381
+ S->focus = match;
382
+ }
383
+
384
+ rb_hpricot_add(S->focus, ele);
385
+
386
+ //
387
+ // in the case of a start tag that should be empty, just
388
+ // skip the step that focuses the element. focusing moves
389
+ // us deeper into the document.
390
+ //
391
+ if (sym == sym_stag) {
392
+ if (S->xml || ec != sym_EMPTY) {
393
+ S->focus = ele;
394
+ S->last = Qnil;
395
+ }
396
+ }
397
+ } else if (sym == sym_etag) {
398
+ VALUE name, match = Qnil, e = S->focus;
399
+ if (S->strict) {
400
+ if (NIL_P(rb_hash_aref(S->EC, tag))) {
401
+ tag = rb_str_new2("div");
402
+ ASSOCIATE_INDEX(tag);
403
+ }
404
+ }
405
+
406
+ //
407
+ // another optimization will be to improve this very simple
408
+ // O(n) tag search, where n is the depth of the focused tag.
409
+ //
410
+ // (see also: the search above for fixups)
411
+ //
412
+ name = INT2FIX(rb_str_hash(tag));
413
+ while (e != S->doc)
414
+ {
415
+ if (H_ELE_GET(e, H_ELE_HASH) == name)
416
+ {
417
+ match = e;
418
+ break;
419
+ }
420
+
421
+ e = H_ELE_GET(e, H_ELE_PARENT);
422
+ }
423
+
424
+ if (NIL_P(match))
425
+ {
426
+ H_ELE(cBogusETag);
427
+ rb_hpricot_add(S->focus, ele);
428
+ }
429
+ else
430
+ {
431
+ VALUE ele = Qnil;
432
+ if (raw != NULL) {
433
+ ele = rb_str_new(raw, rawlen);
434
+ ASSOCIATE_INDEX(ele);
435
+ }
436
+ H_ELE_SET(match, H_ELE_ETAG, ele);
437
+ S->focus = H_ELE_GET(match, H_ELE_PARENT);
438
+ S->last = Qnil;
439
+ }
440
+ } else if (sym == sym_cdata) {
441
+ H_ELE(cCData);
442
+ rb_hpricot_add(S->focus, ele);
443
+ } else if (sym == sym_comment) {
444
+ H_ELE(cComment);
445
+ rb_hpricot_add(S->focus, ele);
446
+ } else if (sym == sym_doctype) {
447
+ H_ELE(cDocType);
448
+ if (S->strict) {
449
+ VALUE id;
450
+ id = rb_str_new2("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
451
+ ASSOCIATE_INDEX(id);
452
+ rb_hash_aset(attr, ID2SYM(rb_intern("system_id")), id);
453
+ id = rb_str_new2("-//W3C//DTD XHTML 1.0 Strict//EN");
454
+ ASSOCIATE_INDEX(id);
455
+ rb_hash_aset(attr, ID2SYM(rb_intern("public_id")), id);
456
+ }
457
+ rb_hpricot_add(S->focus, ele);
458
+ } else if (sym == sym_procins) {
459
+ VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
460
+ tag = rb_reg_nth_match(1, match);
461
+ attr = rb_reg_nth_match(2, match);
462
+ {
463
+ H_ELE(cProcIns);
464
+ rb_hpricot_add(S->focus, ele);
465
+ }
466
+ } else if (sym == sym_text) {
467
+ // TODO: add raw_string as well?
468
+ if (!NIL_P(S->last) && RTEST(rb_obj_is_instance_of(S->last, cText))) {
469
+ rb_str_append(H_ELE_GET(S->last, H_ELE_TAG), tag);
470
+ } else {
471
+ H_ELE(cText);
472
+ rb_hpricot_add(S->focus, ele);
473
+ }
474
+ } else if (sym == sym_xmldecl) {
475
+ H_ELE(cXMLDecl);
476
+ rb_hpricot_add(S->focus, ele);
477
+ }
478
+ }
479
+
480
+ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
481
+ {
482
+ int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
483
+ char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
484
+
485
+ hpricot_state *S = NULL;
486
+ VALUE port, opts;
487
+ VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
488
+ char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
489
+ int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
490
+ #ifdef HAVE_RUBY_ENCODING_H
491
+ int encoding_index = rb_enc_to_index(rb_default_external_encoding());
492
+ #endif
493
+
494
+ rb_scan_args(argc, argv, "11", &port, &opts);
495
+ taint = OBJ_TAINTED(port);
496
+ io = rb_respond_to(port, s_read);
497
+ if (!io)
498
+ {
499
+ if (rb_respond_to(port, s_to_str))
500
+ {
501
+ port = rb_funcall(port, s_to_str, 0);
502
+ StringValue(port);
503
+ }
504
+ else
505
+ {
506
+ rb_raise(rb_eArgError, "an Hpricot document must be built from an input source (a String or IO object.)");
507
+ }
508
+ }
509
+
510
+ if (TYPE(opts) != T_HASH)
511
+ opts = Qnil;
512
+
513
+ if (!rb_block_given_p())
514
+ {
515
+ S = ALLOC(hpricot_state);
516
+ S->doc = rb_obj_alloc(cDoc);
517
+ rb_gc_register_address(&S->doc);
518
+ S->focus = S->doc;
519
+ S->last = Qnil;
520
+ S->xml = OPT(opts, xml);
521
+ S->strict = OPT(opts, xhtml_strict);
522
+ S->fixup = OPT(opts, fixup_tags);
523
+ if (S->strict) S->fixup = 1;
524
+ rb_ivar_set(S->doc, rb_intern("@options"), opts);
525
+
526
+ S->EC = rb_const_get(mHpricot, s_ElementContent);
527
+ }
528
+
529
+ buffer_size = BUFSIZE;
530
+ if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
531
+ bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
532
+ if (!NIL_P(bufsize)) {
533
+ buffer_size = NUM2INT(bufsize);
534
+ }
535
+ }
536
+
537
+ if (io)
538
+ buf = ALLOC_N(char, buffer_size);
539
+
540
+ %% write init;
541
+
542
+ while (!done) {
543
+ VALUE str;
544
+ char *p, *pe;
545
+ int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
546
+
547
+ if (io)
548
+ {
549
+ if (space == 0) {
550
+ /* We've used up the entire buffer storing an already-parsed token
551
+ * prefix that must be preserved. Likely caused by super-long attributes.
552
+ * Increase buffer size and continue */
553
+ tokstart_diff = ts - buf;
554
+ tokend_diff = te - buf;
555
+ mark_tag_diff = mark_tag - buf;
556
+ mark_akey_diff = mark_akey - buf;
557
+ mark_aval_diff = mark_aval - buf;
558
+
559
+ buffer_size += BUFSIZE;
560
+ REALLOC_N(buf, char, buffer_size);
561
+
562
+ space = buffer_size - have;
563
+
564
+ ts = buf + tokstart_diff;
565
+ te = buf + tokend_diff;
566
+ mark_tag = buf + mark_tag_diff;
567
+ mark_akey = buf + mark_akey_diff;
568
+ mark_aval = buf + mark_aval_diff;
569
+ }
570
+ p = buf + have;
571
+
572
+ str = rb_funcall(port, s_read, 1, INT2FIX(space));
573
+ len = RSTRING_LEN(str);
574
+ memcpy(p, StringValuePtr(str), len);
575
+ }
576
+ else
577
+ {
578
+ p = RSTRING_PTR(port);
579
+ len = RSTRING_LEN(port) + 1;
580
+ done = 1;
581
+ }
582
+
583
+ nread += len;
584
+
585
+ /* If this is the last buffer, tack on an EOF. */
586
+ if (io && len < space) {
587
+ p[len++] = 0;
588
+ done = 1;
589
+ }
590
+
591
+ pe = p + len;
592
+ %% write exec;
593
+
594
+ if (cs == hpricot_scan_error) {
595
+ if (buf != NULL)
596
+ free(buf);
597
+ if (!NIL_P(tag))
598
+ {
599
+ rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
600
+ }
601
+ else
602
+ {
603
+ rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
604
+ }
605
+ }
606
+
607
+ if (done && ele_open)
608
+ {
609
+ ele_open = 0;
610
+ if (ts > 0) {
611
+ mark_tag = ts;
612
+ ts = 0;
613
+ text = 1;
614
+ }
615
+ }
616
+
617
+ if (ts == 0)
618
+ {
619
+ have = 0;
620
+ /* text nodes have no ts because each byte is parsed alone */
621
+ if (mark_tag != NULL && text == 1)
622
+ {
623
+ if (done)
624
+ {
625
+ if (mark_tag < p-1)
626
+ {
627
+ CAT(tag, p-1);
628
+ ELE(text);
629
+ }
630
+ }
631
+ else
632
+ {
633
+ CAT(tag, p);
634
+ }
635
+ }
636
+ if (io)
637
+ mark_tag = buf;
638
+ else
639
+ mark_tag = RSTRING_PTR(port);
640
+ }
641
+ else if (io)
642
+ {
643
+ have = pe - ts;
644
+ memmove(buf, ts, have);
645
+ SLIDE(tag);
646
+ SLIDE(akey);
647
+ SLIDE(aval);
648
+ te = buf + (te - ts);
649
+ ts = buf;
650
+ }
651
+ }
652
+
653
+ if (buf != NULL)
654
+ free(buf);
655
+
656
+ if (S != NULL)
657
+ {
658
+ VALUE doc = S->doc;
659
+ rb_gc_unregister_address(&S->doc);
660
+ free(S);
661
+ return doc;
662
+ }
663
+
664
+ return Qnil;
665
+ }
666
+
667
+ void hstruct_mark(void* ptr) {
668
+ struct hpricot_struct* st = (struct hpricot_struct*)ptr;
669
+ int i;
670
+
671
+ /* it's likely to hit GC when allocating st->ptr.
672
+ * that should be checked to avoid segfault.
673
+ * and simply ignore it.
674
+ */
675
+ if (st->ptr) {
676
+ for(i = 0; i < st->len; i++) {
677
+ rb_gc_mark(st->ptr[i]);
678
+ }
679
+ }
680
+ }
681
+
682
+ void hstruct_free(void* ptr) {
683
+ struct hpricot_struct* st = (struct hpricot_struct*)ptr;
684
+
685
+ free(st->ptr);
686
+ free(st);
687
+ }
688
+
689
+ static VALUE
690
+ alloc_hpricot_struct8(VALUE klass)
691
+ {
692
+ VALUE obj;
693
+ struct hpricot_struct* st;
694
+
695
+ obj = Data_Make_Struct(klass, struct hpricot_struct, hstruct_mark, hstruct_free, st);
696
+
697
+ st->len = 8;
698
+ st->ptr = ALLOC_N(VALUE, 8);
699
+
700
+ rb_mem_clear(st->ptr, 8);
701
+
702
+ return obj;
703
+ }
704
+
705
+ static VALUE
706
+ alloc_hpricot_struct2(VALUE klass)
707
+ {
708
+ VALUE obj;
709
+ struct hpricot_struct* st;
710
+
711
+ obj = Data_Make_Struct(klass, struct hpricot_struct, hstruct_mark, hstruct_free, st);
712
+
713
+ st->len = 2;
714
+ st->ptr = ALLOC_N(VALUE, 2);
715
+
716
+ rb_mem_clear(st->ptr, 2);
717
+
718
+ return obj;
719
+ }
720
+
721
+ static VALUE
722
+ alloc_hpricot_struct3(VALUE klass)
723
+ {
724
+ VALUE obj;
725
+ struct hpricot_struct* st;
726
+
727
+ obj = Data_Make_Struct(klass, struct hpricot_struct, hstruct_mark, hstruct_free, st);
728
+
729
+ st->len = 3;
730
+ st->ptr = ALLOC_N(VALUE, 3);
731
+
732
+ rb_mem_clear(st->ptr, 3);
733
+
734
+ return obj;
735
+ }
736
+
737
+ static VALUE hpricot_struct_ref0(VALUE obj) {return H_ELE_GET(obj, 0);}
738
+ static VALUE hpricot_struct_ref1(VALUE obj) {return H_ELE_GET(obj, 1);}
739
+ static VALUE hpricot_struct_ref2(VALUE obj) {return H_ELE_GET(obj, 2);}
740
+ static VALUE hpricot_struct_ref3(VALUE obj) {return H_ELE_GET(obj, 3);}
741
+ static VALUE hpricot_struct_ref4(VALUE obj) {return H_ELE_GET(obj, 4);}
742
+ static VALUE hpricot_struct_ref5(VALUE obj) {return H_ELE_GET(obj, 5);}
743
+ static VALUE hpricot_struct_ref6(VALUE obj) {return H_ELE_GET(obj, 6);}
744
+ static VALUE hpricot_struct_ref7(VALUE obj) {return H_ELE_GET(obj, 7);}
745
+ static VALUE hpricot_struct_ref8(VALUE obj) {return H_ELE_GET(obj, 8);}
746
+ static VALUE hpricot_struct_ref9(VALUE obj) {return H_ELE_GET(obj, 9);}
747
+
748
+ static VALUE (*ref_func[10])() = {
749
+ hpricot_struct_ref0,
750
+ hpricot_struct_ref1,
751
+ hpricot_struct_ref2,
752
+ hpricot_struct_ref3,
753
+ hpricot_struct_ref4,
754
+ hpricot_struct_ref5,
755
+ hpricot_struct_ref6,
756
+ hpricot_struct_ref7,
757
+ hpricot_struct_ref8,
758
+ hpricot_struct_ref9,
759
+ };
760
+
761
+ static VALUE hpricot_struct_set0(VALUE obj, VALUE val) {return H_ELE_SET(obj, 0, val);}
762
+ static VALUE hpricot_struct_set1(VALUE obj, VALUE val) {return H_ELE_SET(obj, 1, val);}
763
+ static VALUE hpricot_struct_set2(VALUE obj, VALUE val) {return H_ELE_SET(obj, 2, val);}
764
+ static VALUE hpricot_struct_set3(VALUE obj, VALUE val) {return H_ELE_SET(obj, 3, val);}
765
+ static VALUE hpricot_struct_set4(VALUE obj, VALUE val) {return H_ELE_SET(obj, 4, val);}
766
+ static VALUE hpricot_struct_set5(VALUE obj, VALUE val) {return H_ELE_SET(obj, 5, val);}
767
+ static VALUE hpricot_struct_set6(VALUE obj, VALUE val) {return H_ELE_SET(obj, 6, val);}
768
+ static VALUE hpricot_struct_set7(VALUE obj, VALUE val) {return H_ELE_SET(obj, 7, val);}
769
+ static VALUE hpricot_struct_set8(VALUE obj, VALUE val) {return H_ELE_SET(obj, 8, val);}
770
+ static VALUE hpricot_struct_set9(VALUE obj, VALUE val) {return H_ELE_SET(obj, 9, val);}
771
+
772
+ static VALUE (*set_func[10])() = {
773
+ hpricot_struct_set0,
774
+ hpricot_struct_set1,
775
+ hpricot_struct_set2,
776
+ hpricot_struct_set3,
777
+ hpricot_struct_set4,
778
+ hpricot_struct_set5,
779
+ hpricot_struct_set6,
780
+ hpricot_struct_set7,
781
+ hpricot_struct_set8,
782
+ hpricot_struct_set9,
783
+ };
784
+
785
+ static VALUE
786
+ make_hpricot_struct(VALUE members, VALUE (*alloc)(VALUE klass))
787
+ {
788
+ int i = 0;
789
+ char attr_set[128];
790
+
791
+ VALUE klass = rb_class_new(rb_cObject);
792
+ rb_define_alloc_func(klass, alloc);
793
+
794
+ int len = RARRAY_LEN(members);
795
+ assert(len < 10);
796
+
797
+ for (i = 0; i < len; i++) {
798
+ ID id = SYM2ID(rb_ary_entry(members, i));
799
+ const char* name = rb_id2name(id);
800
+ int len = strlen(name);
801
+
802
+ memcpy(attr_set, name, strlen(name));
803
+ attr_set[len] = '=';
804
+ attr_set[len+1] = 0;
805
+
806
+ rb_define_method(klass, name, ref_func[i], 0);
807
+ rb_define_method(klass, attr_set, set_func[i], 1);
808
+ }
809
+ return klass;
810
+ }
811
+
812
+ void Init_hpricot_scan()
813
+ {
814
+ VALUE structElem, structAttr, structBasic;
815
+
816
+ s_ElementContent = rb_intern("ElementContent");
817
+ symAllow = ID2SYM(rb_intern("allow"));
818
+ symDeny = ID2SYM(rb_intern("deny"));
819
+ s_downcase = rb_intern("downcase");
820
+ s_new = rb_intern("new");
821
+ s_parent = rb_intern("parent");
822
+ s_read = rb_intern("read");
823
+ s_to_str = rb_intern("to_str");
824
+ sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
825
+ sym_doctype = ID2SYM(rb_intern("doctype"));
826
+ sym_procins = ID2SYM(rb_intern("procins"));
827
+ sym_stag = ID2SYM(rb_intern("stag"));
828
+ sym_etag = ID2SYM(rb_intern("etag"));
829
+ sym_emptytag = ID2SYM(rb_intern("emptytag"));
830
+ sym_allowed = ID2SYM(rb_intern("allowed"));
831
+ sym_children = ID2SYM(rb_intern("children"));
832
+ sym_comment = ID2SYM(rb_intern("comment"));
833
+ sym_cdata = ID2SYM(rb_intern("cdata"));
834
+ sym_name = ID2SYM(rb_intern("name"));
835
+ sym_parent = ID2SYM(rb_intern("parent"));
836
+ sym_raw_attributes = ID2SYM(rb_intern("raw_attributes"));
837
+ sym_raw_string = ID2SYM(rb_intern("raw_string"));
838
+ sym_tagno = ID2SYM(rb_intern("tagno"));
839
+ sym_text = ID2SYM(rb_intern("text"));
840
+ sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
841
+ sym_CDATA = ID2SYM(rb_intern("CDATA"));
842
+
843
+ mHpricot = rb_define_module("Hpricot");
844
+ rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
845
+ rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
846
+ rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
847
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
848
+
849
+ structElem = make_hpricot_struct(rb_ary_new3(8, sym_name, sym_parent,
850
+ sym_raw_attributes, sym_etag, sym_raw_string, sym_allowed,
851
+ sym_tagno, sym_children), alloc_hpricot_struct8);
852
+ structAttr = make_hpricot_struct(
853
+ rb_ary_new3(3, sym_name, sym_parent, sym_raw_attributes),
854
+ alloc_hpricot_struct3);
855
+ structBasic = make_hpricot_struct(
856
+ rb_ary_new3(2, sym_name, sym_parent),
857
+ alloc_hpricot_struct2);
858
+
859
+ cDoc = rb_define_class_under(mHpricot, "Doc", structElem);
860
+ cCData = rb_define_class_under(mHpricot, "CData", structBasic);
861
+ rb_define_method(cCData, "content", hpricot_ele_get_name, 0);
862
+ rb_define_method(cCData, "content=", hpricot_ele_set_name, 1);
863
+ cComment = rb_define_class_under(mHpricot, "Comment", structBasic);
864
+ rb_define_method(cComment, "content", hpricot_ele_get_name, 0);
865
+ rb_define_method(cComment, "content=", hpricot_ele_set_name, 1);
866
+ cDocType = rb_define_class_under(mHpricot, "DocType", structAttr);
867
+ rb_define_method(cDocType, "raw_string", hpricot_ele_get_name, 0);
868
+ rb_define_method(cDocType, "clear_raw", hpricot_ele_clear_name, 0);
869
+ rb_define_method(cDocType, "target", hpricot_ele_get_target, 0);
870
+ rb_define_method(cDocType, "target=", hpricot_ele_set_target, 1);
871
+ rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
872
+ rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
873
+ rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
874
+ rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
875
+ cElem = rb_define_class_under(mHpricot, "Elem", structElem);
876
+ rb_define_method(cElem, "clear_raw", hpricot_ele_clear_raw, 0);
877
+ cBogusETag = rb_define_class_under(mHpricot, "BogusETag", structAttr);
878
+ rb_define_method(cBogusETag, "raw_string", hpricot_ele_get_attr, 0);
879
+ rb_define_method(cBogusETag, "clear_raw", hpricot_ele_clear_attr, 0);
880
+ cText = rb_define_class_under(mHpricot, "Text", structBasic);
881
+ rb_define_method(cText, "raw_string", hpricot_ele_get_name, 0);
882
+ rb_define_method(cText, "clear_raw", hpricot_ele_clear_name, 0);
883
+ rb_define_method(cText, "content", hpricot_ele_get_name, 0);
884
+ rb_define_method(cText, "content=", hpricot_ele_set_name, 1);
885
+ cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", structAttr);
886
+ rb_define_method(cXMLDecl, "raw_string", hpricot_ele_get_name, 0);
887
+ rb_define_method(cXMLDecl, "clear_raw", hpricot_ele_clear_name, 0);
888
+ rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
889
+ rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
890
+ rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
891
+ rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
892
+ rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
893
+ rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
894
+ cProcIns = rb_define_class_under(mHpricot, "ProcIns", structAttr);
895
+ rb_define_method(cProcIns, "target", hpricot_ele_get_name, 0);
896
+ rb_define_method(cProcIns, "target=", hpricot_ele_set_name, 1);
897
+ rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
898
+ rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
899
+
900
+ rb_const_set(mHpricot, rb_intern("ProcInsParse"),
901
+ reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
902
+ }