webtranslateit-hpricot 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/CHANGELOG +122 -0
  4. data/COPYING +18 -0
  5. data/README.md +295 -0
  6. data/Rakefile +237 -0
  7. data/ext/fast_xs/FastXsService.java +1123 -0
  8. data/ext/fast_xs/extconf.rb +4 -0
  9. data/ext/fast_xs/fast_xs.c +210 -0
  10. data/ext/hpricot_scan/HpricotCss.java +850 -0
  11. data/ext/hpricot_scan/HpricotScanService.java +2085 -0
  12. data/ext/hpricot_scan/MANIFEST +0 -0
  13. data/ext/hpricot_scan/extconf.rb +9 -0
  14. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  15. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  16. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  17. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  18. data/ext/hpricot_scan/hpricot_scan.c +6848 -0
  19. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  20. data/ext/hpricot_scan/hpricot_scan.java.rl +1173 -0
  21. data/ext/hpricot_scan/hpricot_scan.rl +911 -0
  22. data/extras/hpricot.png +0 -0
  23. data/hpricot.gemspec +18 -0
  24. data/lib/hpricot/blankslate.rb +63 -0
  25. data/lib/hpricot/builder.rb +217 -0
  26. data/lib/hpricot/elements.rb +514 -0
  27. data/lib/hpricot/htmlinfo.rb +691 -0
  28. data/lib/hpricot/inspect.rb +103 -0
  29. data/lib/hpricot/modules.rb +40 -0
  30. data/lib/hpricot/parse.rb +38 -0
  31. data/lib/hpricot/tag.rb +219 -0
  32. data/lib/hpricot/tags.rb +164 -0
  33. data/lib/hpricot/traverse.rb +839 -0
  34. data/lib/hpricot/xchar.rb +95 -0
  35. data/lib/hpricot.rb +26 -0
  36. data/setup.rb +1585 -0
  37. data/test/files/basic.xhtml +17 -0
  38. data/test/files/boingboing.html +2266 -0
  39. data/test/files/cy0.html +3653 -0
  40. data/test/files/immob.html +400 -0
  41. data/test/files/pace_application.html +1320 -0
  42. data/test/files/tenderlove.html +16 -0
  43. data/test/files/uswebgen.html +220 -0
  44. data/test/files/utf8.html +1054 -0
  45. data/test/files/week9.html +1723 -0
  46. data/test/files/why.xml +19 -0
  47. data/test/load_files.rb +7 -0
  48. data/test/nokogiri-bench.rb +64 -0
  49. data/test/test_alter.rb +96 -0
  50. data/test/test_builder.rb +37 -0
  51. data/test/test_parser.rb +496 -0
  52. data/test/test_paths.rb +25 -0
  53. data/test/test_preserved.rb +88 -0
  54. data/test/test_xml.rb +28 -0
  55. metadata +106 -0
@@ -0,0 +1,911 @@
1
+ /*
2
+ * hpricot_scan.rl
3
+ *
4
+ * $Author: why $
5
+ * $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
6
+ *
7
+ * Copyright (C) 2006, 2010 why the lucky stiff
8
+ */
9
+ #include <ruby.h>
10
+ #include <assert.h>
11
+
12
+ struct hpricot_struct {
13
+ int len;
14
+ VALUE* ptr;
15
+ };
16
+
17
+ #ifndef RARRAY_LEN
18
+ #define RARRAY_LEN(arr) RARRAY(arr)->len
19
+ #define RSTRING_LEN(str) RSTRING(str)->len
20
+ #define RSTRING_PTR(str) RSTRING(str)->ptr
21
+ #endif
22
+
23
+ VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
24
+
25
+ #define NO_WAY_SERIOUSLY "*** This should not happen, please file a bug report with the HTML you're parsing at http://github.com/hpricot/hpricot/issues. So sorry!"
26
+
27
+ static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
28
+ sym_cdata, sym_name, sym_parent, sym_raw_attributes, sym_raw_string, sym_tagno,
29
+ sym_allowed, sym_text, sym_children, sym_EMPTY, sym_CDATA;
30
+ static VALUE mHpricot, rb_eHpricotParseError;
31
+ static VALUE cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cText,
32
+ cXMLDecl, cProcIns, symAllow, symDeny;
33
+ static ID s_ElementContent;
34
+ static ID s_downcase, s_new, s_parent, s_read, s_to_str;
35
+ static VALUE reProcInsParse;
36
+
37
+ #define H_ELE_TAG 0
38
+ #define H_ELE_PARENT 1
39
+ #define H_ELE_ATTR 2
40
+ #define H_ELE_ETAG 3
41
+ #define H_ELE_RAW 4
42
+ #define H_ELE_EC 5
43
+ #define H_ELE_HASH 6
44
+ #define H_ELE_CHILDREN 7
45
+
46
+ #define HSTRUCT_PTR(ele) ((struct hpricot_struct*)DATA_PTR(ele))->ptr
47
+
48
+ #define H_ELE_GET(ele, idx) HSTRUCT_PTR(ele)[idx]
49
+ #define H_ELE_SET(ele, idx, val) HSTRUCT_PTR(ele)[idx] = val
50
+
51
+ #define OPT(opts, key) (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("" # key)))))
52
+
53
+ #ifdef HAVE_RUBY_ENCODING_H
54
+ #include <ruby/encoding.h>
55
+ # define ASSOCIATE_INDEX(s) rb_enc_associate_index((s), encoding_index)
56
+ # define ENCODING_INDEX , encoding_index
57
+ #else
58
+ # define ASSOCIATE_INDEX(s)
59
+ # define ENCODING_INDEX
60
+ #endif
61
+
62
+ #define ELE(N) \
63
+ if (te > ts || text == 1) { \
64
+ char *raw = NULL; \
65
+ int rawlen = 0; \
66
+ ele_open = 0; text = 0; \
67
+ if (ts != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
68
+ raw = ts; rawlen = te - ts; \
69
+ } \
70
+ if (rb_block_given_p()) { \
71
+ VALUE raw_string = Qnil; \
72
+ if (raw != NULL) { \
73
+ raw_string = rb_str_new(raw, rawlen); \
74
+ ASSOCIATE_INDEX(raw_string); \
75
+ } \
76
+ rb_yield_tokens(sym_##N, tag, attr, Qnil, taint); \
77
+ } else \
78
+ rb_hpricot_token(S, sym_##N, tag, attr, raw, rawlen, taint ENCODING_INDEX); \
79
+ }
80
+
81
+ #define SET(N, E) \
82
+ if (mark_##N == NULL || E == mark_##N) { \
83
+ N = rb_str_new2(""); \
84
+ ASSOCIATE_INDEX(N); \
85
+ } else if (E > mark_##N) { \
86
+ N = rb_str_new(mark_##N, E - mark_##N); \
87
+ ASSOCIATE_INDEX(N); \
88
+ }
89
+
90
+ #define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
91
+
92
+ #define SLIDE(N) if (mark_##N > ts) mark_##N = buf + (mark_##N - ts);
93
+
94
+ #define ATTR(K, V) \
95
+ if (!NIL_P(K)) { \
96
+ if (NIL_P(attr)) attr = rb_hash_new(); \
97
+ rb_hash_aset(attr, K, V); \
98
+ }
99
+
100
+ #define TEXT_PASS() \
101
+ if (text == 0) \
102
+ { \
103
+ if (ele_open == 1) { \
104
+ ele_open = 0; \
105
+ if (ts > 0) { \
106
+ mark_tag = ts; \
107
+ } \
108
+ } else { \
109
+ mark_tag = p; \
110
+ } \
111
+ attr = Qnil; \
112
+ tag = Qnil; \
113
+ text = 1; \
114
+ }
115
+
116
+ #define EBLK(N, T) CAT(tag, p - T + 1); ELE(N);
117
+
118
+ %%{
119
+ machine hpricot_scan;
120
+
121
+ action newEle {
122
+ if (text == 1) {
123
+ CAT(tag, p);
124
+ ELE(text);
125
+ text = 0;
126
+ }
127
+ attr = Qnil;
128
+ tag = Qnil;
129
+ mark_tag = NULL;
130
+ ele_open = 1;
131
+ }
132
+
133
+ action _tag { mark_tag = p; }
134
+ action _aval { mark_aval = p; }
135
+ action _akey { mark_akey = p; }
136
+ action tag { SET(tag, p); }
137
+ action tagc { SET(tag, p-1); }
138
+ action aval { SET(aval, p); }
139
+ action aunq {
140
+ if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
141
+ else { SET(aval, p); }
142
+ }
143
+ action akey { SET(akey, p); }
144
+ action xmlver { SET(aval, p); ATTR(ID2SYM(rb_intern("version")), aval); }
145
+ action xmlenc {
146
+ #ifdef HAVE_RUBY_ENCODING_H
147
+ if (mark_aval < p) {
148
+ char psave = *p;
149
+ *p = '\0';
150
+ encoding_index = rb_enc_find_index(mark_aval);
151
+ *p = psave;
152
+ }
153
+ #endif
154
+ SET(aval, p);
155
+ ATTR(ID2SYM(rb_intern("encoding")), aval);
156
+ }
157
+ action xmlsd { SET(aval, p); ATTR(ID2SYM(rb_intern("standalone")), aval); }
158
+ action pubid { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
159
+ action sysid { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }
160
+
161
+ action new_attr {
162
+ akey = Qnil;
163
+ aval = Qnil;
164
+ mark_akey = NULL;
165
+ mark_aval = NULL;
166
+ }
167
+
168
+ action save_attr {
169
+ if (!S->xml && !NIL_P(akey))
170
+ akey = rb_funcall(akey, s_downcase, 0);
171
+ ATTR(akey, aval);
172
+ }
173
+
174
+ include hpricot_common "hpricot_common.rl";
175
+
176
+ }%%
177
+
178
+ %% write data nofinal;
179
+
180
+ #define BUFSIZE 16384
181
+
182
+ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
183
+ {
184
+ VALUE ary;
185
+ if (sym == sym_text) {
186
+ raw = tag;
187
+ }
188
+ ary = rb_ary_new3(4, sym, tag, attr, raw);
189
+ if (taint) {
190
+ OBJ_TAINT(ary);
191
+ OBJ_TAINT(tag);
192
+ OBJ_TAINT(attr);
193
+ OBJ_TAINT(raw);
194
+ }
195
+ rb_yield(ary);
196
+ }
197
+
198
+ #ifndef RHASH_TBL
199
+ /* rb_hash_lookup() is only in Ruby 1.8.7 */
200
+ static VALUE
201
+ our_rb_hash_lookup(VALUE hash, VALUE key)
202
+ {
203
+ VALUE val;
204
+
205
+ if (!st_lookup(RHASH(hash)->tbl, key, &val)) {
206
+ return Qnil; /* without Hash#default */
207
+ }
208
+
209
+ return val;
210
+ }
211
+ #define rb_hash_lookup our_rb_hash_lookup
212
+ #endif
213
+
214
+ static void
215
+ rb_hpricot_add(VALUE focus, VALUE ele)
216
+ {
217
+ VALUE children = H_ELE_GET(focus, H_ELE_CHILDREN);
218
+ if (NIL_P(children))
219
+ H_ELE_SET(focus, H_ELE_CHILDREN, (children = rb_ary_new2(1)));
220
+ rb_ary_push(children, ele);
221
+ H_ELE_SET(ele, H_ELE_PARENT, focus);
222
+ }
223
+
224
+ typedef struct {
225
+ VALUE doc;
226
+ VALUE focus;
227
+ VALUE last;
228
+ VALUE EC;
229
+ unsigned char xml, strict, fixup;
230
+ } hpricot_state;
231
+
232
+ #define H_PROP(prop, idx) \
233
+ static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
234
+ H_ELE_SET(self, idx, x); \
235
+ return self; \
236
+ } \
237
+ static VALUE hpricot_ele_clear_##prop(VALUE self) { \
238
+ H_ELE_SET(self, idx, Qnil); \
239
+ return Qtrue; \
240
+ } \
241
+ static VALUE hpricot_ele_get_##prop(VALUE self) { \
242
+ return H_ELE_GET(self, idx); \
243
+ }
244
+
245
+ #define H_ATTR(prop) \
246
+ static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
247
+ rb_hash_aset(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop)), x); \
248
+ return self; \
249
+ } \
250
+ static VALUE hpricot_ele_get_##prop(VALUE self) { \
251
+ return rb_hash_aref(H_ELE_GET(self, H_ELE_ATTR), ID2SYM(rb_intern("" # prop))); \
252
+ }
253
+
254
+ H_PROP(name, H_ELE_TAG);
255
+ H_PROP(raw, H_ELE_RAW);
256
+ H_PROP(parent, H_ELE_PARENT);
257
+ H_PROP(attr, H_ELE_ATTR);
258
+ H_PROP(etag, H_ELE_ETAG);
259
+ H_PROP(children, H_ELE_CHILDREN);
260
+ H_ATTR(target);
261
+ H_ATTR(encoding);
262
+ H_ATTR(version);
263
+ H_ATTR(standalone);
264
+ H_ATTR(system_id);
265
+ H_ATTR(public_id);
266
+
267
+ #define H_ELE(klass) \
268
+ ele = rb_obj_alloc(klass); \
269
+ if (klass == cElem) { \
270
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
271
+ H_ELE_SET(ele, H_ELE_ATTR, attr); \
272
+ H_ELE_SET(ele, H_ELE_EC, ec); \
273
+ if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_doctype)) { \
274
+ VALUE raw_str = rb_str_new(raw, rawlen); \
275
+ ASSOCIATE_INDEX(raw_str); \
276
+ H_ELE_SET(ele, H_ELE_RAW, raw_str); \
277
+ } \
278
+ } else if (klass == cDocType || klass == cProcIns || klass == cXMLDecl || klass == cBogusETag) { \
279
+ if (klass == cBogusETag) { \
280
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
281
+ if (raw != NULL) { \
282
+ VALUE raw_str = rb_str_new(raw, rawlen); \
283
+ ASSOCIATE_INDEX(raw_str); \
284
+ H_ELE_SET(ele, H_ELE_ATTR, raw_str); \
285
+ } \
286
+ } else { \
287
+ if (klass == cDocType) \
288
+ ATTR(ID2SYM(rb_intern("target")), tag); \
289
+ H_ELE_SET(ele, H_ELE_ATTR, attr); \
290
+ if (klass != cProcIns) { \
291
+ tag = Qnil; \
292
+ if (raw != NULL) { \
293
+ tag = rb_str_new(raw, rawlen); \
294
+ ASSOCIATE_INDEX(tag); \
295
+ } \
296
+ } \
297
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
298
+ } \
299
+ } else { \
300
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
301
+ } \
302
+ S->last = ele
303
+
304
+ //
305
+ // the swift, compact parser logic. most of the complicated stuff is done
306
+ // in the lexer. this step just pairs up the start and end tags.
307
+ //
308
+ void
309
+ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr,
310
+ char *raw, int rawlen, int taint
311
+ #ifdef HAVE_RUBY_ENCODING_H
312
+ , int encoding_index
313
+ #endif
314
+ )
315
+ {
316
+ VALUE ele, ec = Qnil;
317
+
318
+ //
319
+ // in html mode, fix up start tags incorrectly formed as empty tags
320
+ //
321
+ if (!S->xml) {
322
+ if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
323
+ ec = rb_hash_aref(S->EC, tag);
324
+ if (NIL_P(ec)) {
325
+ tag = rb_funcall(tag, s_downcase, 0);
326
+ ec = rb_hash_aref(S->EC, tag);
327
+ }
328
+ }
329
+
330
+ if (H_ELE_GET(S->focus, H_ELE_EC) == sym_CDATA &&
331
+ (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
332
+ !(sym == sym_etag && INT2FIX(rb_str_hash(tag)) == H_ELE_GET(S->focus, H_ELE_HASH)))
333
+ {
334
+ sym = sym_text;
335
+ tag = rb_str_new(raw, rawlen);
336
+ ASSOCIATE_INDEX(tag);
337
+ }
338
+
339
+ if (!NIL_P(ec)) {
340
+ if (sym == sym_emptytag) {
341
+ if (ec != sym_EMPTY)
342
+ sym = sym_stag;
343
+ } else if (sym == sym_stag) {
344
+ if (ec == sym_EMPTY)
345
+ sym = sym_emptytag;
346
+ }
347
+ }
348
+ }
349
+
350
+ if (sym == sym_emptytag || sym == sym_stag) {
351
+ VALUE name = INT2FIX(rb_str_hash(tag));
352
+ H_ELE(cElem);
353
+ H_ELE_SET(ele, H_ELE_HASH, name);
354
+
355
+ if (!S->xml) {
356
+ VALUE match = Qnil, e = S->focus;
357
+ while (e != S->doc)
358
+ {
359
+ if (ec == Qnil) {
360
+ // anything can contain unknown elements
361
+ if (match == Qnil)
362
+ match = e;
363
+ } else {
364
+ VALUE hEC = H_ELE_GET(e, H_ELE_EC);
365
+
366
+ if (TYPE(hEC) == T_HASH)
367
+ {
368
+ VALUE has = rb_hash_lookup(hEC, name);
369
+ if (has != Qnil) {
370
+ if (has == Qtrue) {
371
+ if (match == Qnil)
372
+ match = e;
373
+ } else if (has == symAllow) {
374
+ match = S->focus;
375
+ } else if (has == symDeny) {
376
+ match = Qnil;
377
+ }
378
+ }
379
+ } else {
380
+ // Unknown elements can contain anything
381
+ if (match == Qnil)
382
+ match = e;
383
+ }
384
+ }
385
+ e = H_ELE_GET(e, H_ELE_PARENT);
386
+ }
387
+
388
+ if (match == Qnil)
389
+ match = S->focus;
390
+ S->focus = match;
391
+ }
392
+
393
+ rb_hpricot_add(S->focus, ele);
394
+
395
+ //
396
+ // in the case of a start tag that should be empty, just
397
+ // skip the step that focuses the element. focusing moves
398
+ // us deeper into the document.
399
+ //
400
+ if (sym == sym_stag) {
401
+ if (S->xml || ec != sym_EMPTY) {
402
+ S->focus = ele;
403
+ S->last = Qnil;
404
+ }
405
+ }
406
+ } else if (sym == sym_etag) {
407
+ VALUE name, match = Qnil, e = S->focus;
408
+ if (S->strict) {
409
+ if (NIL_P(rb_hash_aref(S->EC, tag))) {
410
+ tag = rb_str_new2("div");
411
+ ASSOCIATE_INDEX(tag);
412
+ }
413
+ }
414
+
415
+ //
416
+ // another optimization will be to improve this very simple
417
+ // O(n) tag search, where n is the depth of the focused tag.
418
+ //
419
+ // (see also: the search above for fixups)
420
+ //
421
+ name = INT2FIX(rb_str_hash(tag));
422
+ while (e != S->doc)
423
+ {
424
+ if (H_ELE_GET(e, H_ELE_HASH) == name)
425
+ {
426
+ match = e;
427
+ break;
428
+ }
429
+
430
+ e = H_ELE_GET(e, H_ELE_PARENT);
431
+ }
432
+
433
+ if (NIL_P(match))
434
+ {
435
+ H_ELE(cBogusETag);
436
+ rb_hpricot_add(S->focus, ele);
437
+ }
438
+ else
439
+ {
440
+ VALUE ele = Qnil;
441
+ if (raw != NULL) {
442
+ ele = rb_str_new(raw, rawlen);
443
+ ASSOCIATE_INDEX(ele);
444
+ }
445
+ H_ELE_SET(match, H_ELE_ETAG, ele);
446
+ S->focus = H_ELE_GET(match, H_ELE_PARENT);
447
+ S->last = Qnil;
448
+ }
449
+ } else if (sym == sym_cdata) {
450
+ H_ELE(cCData);
451
+ rb_hpricot_add(S->focus, ele);
452
+ } else if (sym == sym_comment) {
453
+ H_ELE(cComment);
454
+ rb_hpricot_add(S->focus, ele);
455
+ } else if (sym == sym_doctype) {
456
+ H_ELE(cDocType);
457
+ if (S->strict) {
458
+ VALUE id;
459
+ id = rb_str_new2("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
460
+ ASSOCIATE_INDEX(id);
461
+ rb_hash_aset(attr, ID2SYM(rb_intern("system_id")), id);
462
+ id = rb_str_new2("-//W3C//DTD XHTML 1.0 Strict//EN");
463
+ ASSOCIATE_INDEX(id);
464
+ rb_hash_aset(attr, ID2SYM(rb_intern("public_id")), id);
465
+ }
466
+ rb_hpricot_add(S->focus, ele);
467
+ } else if (sym == sym_procins) {
468
+ VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
469
+ tag = rb_reg_nth_match(1, match);
470
+ attr = rb_reg_nth_match(2, match);
471
+ {
472
+ H_ELE(cProcIns);
473
+ rb_hpricot_add(S->focus, ele);
474
+ }
475
+ } else if (sym == sym_text) {
476
+ // TODO: add raw_string as well?
477
+ if (!NIL_P(S->last) && RTEST(rb_obj_is_instance_of(S->last, cText))) {
478
+ rb_str_append(H_ELE_GET(S->last, H_ELE_TAG), tag);
479
+ } else {
480
+ H_ELE(cText);
481
+ rb_hpricot_add(S->focus, ele);
482
+ }
483
+ } else if (sym == sym_xmldecl) {
484
+ H_ELE(cXMLDecl);
485
+ rb_hpricot_add(S->focus, ele);
486
+ }
487
+ }
488
+
489
+ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
490
+ {
491
+ int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
492
+ char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
493
+
494
+ hpricot_state *S = NULL;
495
+ VALUE port, opts;
496
+ VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
497
+ char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
498
+ int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
499
+ #ifdef HAVE_RUBY_ENCODING_H
500
+ int encoding_index = rb_enc_to_index(rb_default_external_encoding());
501
+ #endif
502
+
503
+ rb_scan_args(argc, argv, "11", &port, &opts);
504
+ taint = OBJ_TAINTED(port);
505
+ io = rb_respond_to(port, s_read);
506
+ if (!io)
507
+ {
508
+ if (rb_respond_to(port, s_to_str))
509
+ {
510
+ port = rb_funcall(port, s_to_str, 0);
511
+ StringValue(port);
512
+ }
513
+ else
514
+ {
515
+ rb_raise(rb_eArgError, "an Hpricot document must be built from an input source (a String or IO object.)");
516
+ }
517
+ }
518
+
519
+ if (TYPE(opts) != T_HASH)
520
+ opts = Qnil;
521
+
522
+ if (!rb_block_given_p())
523
+ {
524
+ S = ALLOC(hpricot_state);
525
+ S->doc = rb_obj_alloc(cDoc);
526
+ rb_gc_register_address(&S->doc);
527
+ S->focus = S->doc;
528
+ S->last = Qnil;
529
+ S->xml = OPT(opts, xml);
530
+ S->strict = OPT(opts, xhtml_strict);
531
+ S->fixup = OPT(opts, fixup_tags);
532
+ if (S->strict) S->fixup = 1;
533
+ rb_ivar_set(S->doc, rb_intern("@options"), opts);
534
+
535
+ S->EC = rb_const_get(mHpricot, s_ElementContent);
536
+ }
537
+
538
+ buffer_size = BUFSIZE;
539
+ if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
540
+ bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
541
+ if (!NIL_P(bufsize)) {
542
+ buffer_size = NUM2INT(bufsize);
543
+ }
544
+ }
545
+
546
+ if (io)
547
+ buf = ALLOC_N(char, buffer_size);
548
+
549
+ %% write init;
550
+
551
+ while (!done) {
552
+ VALUE str;
553
+ char *p, *pe;
554
+ int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
555
+
556
+ if (io)
557
+ {
558
+ if (space == 0) {
559
+ /* We've used up the entire buffer storing an already-parsed token
560
+ * prefix that must be preserved. Likely caused by super-long attributes.
561
+ * Increase buffer size and continue */
562
+ tokstart_diff = ts - buf;
563
+ tokend_diff = te - buf;
564
+ mark_tag_diff = mark_tag - buf;
565
+ mark_akey_diff = mark_akey - buf;
566
+ mark_aval_diff = mark_aval - buf;
567
+
568
+ buffer_size += BUFSIZE;
569
+ REALLOC_N(buf, char, buffer_size);
570
+
571
+ space = buffer_size - have;
572
+
573
+ ts = buf + tokstart_diff;
574
+ te = buf + tokend_diff;
575
+ mark_tag = buf + mark_tag_diff;
576
+ mark_akey = buf + mark_akey_diff;
577
+ mark_aval = buf + mark_aval_diff;
578
+ }
579
+ p = buf + have;
580
+
581
+ str = rb_funcall(port, s_read, 1, INT2FIX(space));
582
+ len = RSTRING_LEN(str);
583
+ memcpy(p, StringValuePtr(str), len);
584
+ }
585
+ else
586
+ {
587
+ p = RSTRING_PTR(port);
588
+ len = RSTRING_LEN(port) + 1;
589
+ done = 1;
590
+ }
591
+
592
+ nread += len;
593
+
594
+ /* If this is the last buffer, tack on an EOF. */
595
+ if (io && len < space) {
596
+ p[len++] = 0;
597
+ done = 1;
598
+ }
599
+
600
+ pe = p + len;
601
+ %% write exec;
602
+
603
+ if (cs == hpricot_scan_error) {
604
+ if (buf != NULL)
605
+ free(buf);
606
+ if (!NIL_P(tag))
607
+ {
608
+ rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
609
+ }
610
+ else
611
+ {
612
+ rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
613
+ }
614
+ }
615
+
616
+ if (done && ele_open)
617
+ {
618
+ ele_open = 0;
619
+ if (ts > 0) {
620
+ mark_tag = ts;
621
+ ts = 0;
622
+ text = 1;
623
+ }
624
+ }
625
+
626
+ if (ts == 0)
627
+ {
628
+ have = 0;
629
+ /* text nodes have no ts because each byte is parsed alone */
630
+ if (mark_tag != NULL && text == 1)
631
+ {
632
+ if (done)
633
+ {
634
+ if (mark_tag < p-1)
635
+ {
636
+ CAT(tag, p-1);
637
+ ELE(text);
638
+ }
639
+ }
640
+ else
641
+ {
642
+ CAT(tag, p);
643
+ }
644
+ }
645
+ if (io)
646
+ mark_tag = buf;
647
+ else
648
+ mark_tag = RSTRING_PTR(port);
649
+ }
650
+ else if (io)
651
+ {
652
+ have = pe - ts;
653
+ memmove(buf, ts, have);
654
+ SLIDE(tag);
655
+ SLIDE(akey);
656
+ SLIDE(aval);
657
+ te = buf + (te - ts);
658
+ ts = buf;
659
+ }
660
+ }
661
+
662
+ if (buf != NULL)
663
+ free(buf);
664
+
665
+ if (S != NULL)
666
+ {
667
+ VALUE doc = S->doc;
668
+ rb_gc_unregister_address(&S->doc);
669
+ free(S);
670
+ return doc;
671
+ }
672
+
673
+ return Qnil;
674
+ }
675
+
676
+ void hstruct_mark(void* ptr) {
677
+ struct hpricot_struct* st = (struct hpricot_struct*)ptr;
678
+ int i;
679
+
680
+ /* it's likely to hit GC when allocating st->ptr.
681
+ * that should be checked to avoid segfault.
682
+ * and simply ignore it.
683
+ */
684
+ if (st->ptr) {
685
+ for(i = 0; i < st->len; i++) {
686
+ rb_gc_mark(st->ptr[i]);
687
+ }
688
+ }
689
+ }
690
+
691
+ void hstruct_free(void* ptr) {
692
+ struct hpricot_struct* st = (struct hpricot_struct*)ptr;
693
+
694
+ free(st->ptr);
695
+ free(st);
696
+ }
697
+
698
+ static VALUE
699
+ alloc_hpricot_struct8(VALUE klass)
700
+ {
701
+ VALUE obj;
702
+ struct hpricot_struct* st;
703
+
704
+ obj = Data_Make_Struct(klass, struct hpricot_struct, hstruct_mark, hstruct_free, st);
705
+
706
+ st->len = 8;
707
+ st->ptr = ALLOC_N(VALUE, 8);
708
+
709
+ rb_mem_clear(st->ptr, 8);
710
+
711
+ return obj;
712
+ }
713
+
714
+ static VALUE
715
+ alloc_hpricot_struct2(VALUE klass)
716
+ {
717
+ VALUE obj;
718
+ struct hpricot_struct* st;
719
+
720
+ obj = Data_Make_Struct(klass, struct hpricot_struct, hstruct_mark, hstruct_free, st);
721
+
722
+ st->len = 2;
723
+ st->ptr = ALLOC_N(VALUE, 2);
724
+
725
+ rb_mem_clear(st->ptr, 2);
726
+
727
+ return obj;
728
+ }
729
+
730
+ static VALUE
731
+ alloc_hpricot_struct3(VALUE klass)
732
+ {
733
+ VALUE obj;
734
+ struct hpricot_struct* st;
735
+
736
+ obj = Data_Make_Struct(klass, struct hpricot_struct, hstruct_mark, hstruct_free, st);
737
+
738
+ st->len = 3;
739
+ st->ptr = ALLOC_N(VALUE, 3);
740
+
741
+ rb_mem_clear(st->ptr, 3);
742
+
743
+ return obj;
744
+ }
745
+
746
+ static VALUE hpricot_struct_ref0(VALUE obj) {return H_ELE_GET(obj, 0);}
747
+ static VALUE hpricot_struct_ref1(VALUE obj) {return H_ELE_GET(obj, 1);}
748
+ static VALUE hpricot_struct_ref2(VALUE obj) {return H_ELE_GET(obj, 2);}
749
+ static VALUE hpricot_struct_ref3(VALUE obj) {return H_ELE_GET(obj, 3);}
750
+ static VALUE hpricot_struct_ref4(VALUE obj) {return H_ELE_GET(obj, 4);}
751
+ static VALUE hpricot_struct_ref5(VALUE obj) {return H_ELE_GET(obj, 5);}
752
+ static VALUE hpricot_struct_ref6(VALUE obj) {return H_ELE_GET(obj, 6);}
753
+ static VALUE hpricot_struct_ref7(VALUE obj) {return H_ELE_GET(obj, 7);}
754
+ static VALUE hpricot_struct_ref8(VALUE obj) {return H_ELE_GET(obj, 8);}
755
+ static VALUE hpricot_struct_ref9(VALUE obj) {return H_ELE_GET(obj, 9);}
756
+
757
+ static VALUE (*ref_func[10])() = {
758
+ hpricot_struct_ref0,
759
+ hpricot_struct_ref1,
760
+ hpricot_struct_ref2,
761
+ hpricot_struct_ref3,
762
+ hpricot_struct_ref4,
763
+ hpricot_struct_ref5,
764
+ hpricot_struct_ref6,
765
+ hpricot_struct_ref7,
766
+ hpricot_struct_ref8,
767
+ hpricot_struct_ref9,
768
+ };
769
+
770
+ static VALUE hpricot_struct_set0(VALUE obj, VALUE val) {return H_ELE_SET(obj, 0, val);}
771
+ static VALUE hpricot_struct_set1(VALUE obj, VALUE val) {return H_ELE_SET(obj, 1, val);}
772
+ static VALUE hpricot_struct_set2(VALUE obj, VALUE val) {return H_ELE_SET(obj, 2, val);}
773
+ static VALUE hpricot_struct_set3(VALUE obj, VALUE val) {return H_ELE_SET(obj, 3, val);}
774
+ static VALUE hpricot_struct_set4(VALUE obj, VALUE val) {return H_ELE_SET(obj, 4, val);}
775
+ static VALUE hpricot_struct_set5(VALUE obj, VALUE val) {return H_ELE_SET(obj, 5, val);}
776
+ static VALUE hpricot_struct_set6(VALUE obj, VALUE val) {return H_ELE_SET(obj, 6, val);}
777
+ static VALUE hpricot_struct_set7(VALUE obj, VALUE val) {return H_ELE_SET(obj, 7, val);}
778
+ static VALUE hpricot_struct_set8(VALUE obj, VALUE val) {return H_ELE_SET(obj, 8, val);}
779
+ static VALUE hpricot_struct_set9(VALUE obj, VALUE val) {return H_ELE_SET(obj, 9, val);}
780
+
781
+ static VALUE (*set_func[10])() = {
782
+ hpricot_struct_set0,
783
+ hpricot_struct_set1,
784
+ hpricot_struct_set2,
785
+ hpricot_struct_set3,
786
+ hpricot_struct_set4,
787
+ hpricot_struct_set5,
788
+ hpricot_struct_set6,
789
+ hpricot_struct_set7,
790
+ hpricot_struct_set8,
791
+ hpricot_struct_set9,
792
+ };
793
+
794
+ static VALUE
795
+ make_hpricot_struct(VALUE members, VALUE (*alloc)(VALUE klass))
796
+ {
797
+ int i = 0;
798
+ char attr_set[128];
799
+
800
+ VALUE klass = rb_class_new(rb_cObject);
801
+ rb_define_alloc_func(klass, alloc);
802
+
803
+ int len = RARRAY_LEN(members);
804
+ assert(len < 10);
805
+
806
+ for (i = 0; i < len; i++) {
807
+ ID id = SYM2ID(rb_ary_entry(members, i));
808
+ const char* name = rb_id2name(id);
809
+ int len = strlen(name);
810
+
811
+ memcpy(attr_set, name, strlen(name));
812
+ attr_set[len] = '=';
813
+ attr_set[len+1] = 0;
814
+
815
+ rb_define_method(klass, name, ref_func[i], 0);
816
+ rb_define_method(klass, attr_set, set_func[i], 1);
817
+ }
818
+ return klass;
819
+ }
820
+
821
+ void Init_hpricot_scan()
822
+ {
823
+ VALUE structElem, structAttr, structBasic;
824
+
825
+ s_ElementContent = rb_intern("ElementContent");
826
+ symAllow = ID2SYM(rb_intern("allow"));
827
+ symDeny = ID2SYM(rb_intern("deny"));
828
+ s_downcase = rb_intern("downcase");
829
+ s_new = rb_intern("new");
830
+ s_parent = rb_intern("parent");
831
+ s_read = rb_intern("read");
832
+ s_to_str = rb_intern("to_str");
833
+ sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
834
+ sym_doctype = ID2SYM(rb_intern("doctype"));
835
+ sym_procins = ID2SYM(rb_intern("procins"));
836
+ sym_stag = ID2SYM(rb_intern("stag"));
837
+ sym_etag = ID2SYM(rb_intern("etag"));
838
+ sym_emptytag = ID2SYM(rb_intern("emptytag"));
839
+ sym_allowed = ID2SYM(rb_intern("allowed"));
840
+ sym_children = ID2SYM(rb_intern("children"));
841
+ sym_comment = ID2SYM(rb_intern("comment"));
842
+ sym_cdata = ID2SYM(rb_intern("cdata"));
843
+ sym_name = ID2SYM(rb_intern("name"));
844
+ sym_parent = ID2SYM(rb_intern("parent"));
845
+ sym_raw_attributes = ID2SYM(rb_intern("raw_attributes"));
846
+ sym_raw_string = ID2SYM(rb_intern("raw_string"));
847
+ sym_tagno = ID2SYM(rb_intern("tagno"));
848
+ sym_text = ID2SYM(rb_intern("text"));
849
+ sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
850
+ sym_CDATA = ID2SYM(rb_intern("CDATA"));
851
+
852
+ mHpricot = rb_define_module("Hpricot");
853
+ rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
854
+ rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
855
+ rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
856
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
857
+
858
+ structElem = make_hpricot_struct(rb_ary_new3(8, sym_name, sym_parent,
859
+ sym_raw_attributes, sym_etag, sym_raw_string, sym_allowed,
860
+ sym_tagno, sym_children), alloc_hpricot_struct8);
861
+ structAttr = make_hpricot_struct(
862
+ rb_ary_new3(3, sym_name, sym_parent, sym_raw_attributes),
863
+ alloc_hpricot_struct3);
864
+ structBasic = make_hpricot_struct(
865
+ rb_ary_new3(2, sym_name, sym_parent),
866
+ alloc_hpricot_struct2);
867
+
868
+ cDoc = rb_define_class_under(mHpricot, "Doc", structElem);
869
+ cCData = rb_define_class_under(mHpricot, "CData", structBasic);
870
+ rb_define_method(cCData, "content", hpricot_ele_get_name, 0);
871
+ rb_define_method(cCData, "content=", hpricot_ele_set_name, 1);
872
+ cComment = rb_define_class_under(mHpricot, "Comment", structBasic);
873
+ rb_define_method(cComment, "content", hpricot_ele_get_name, 0);
874
+ rb_define_method(cComment, "content=", hpricot_ele_set_name, 1);
875
+ cDocType = rb_define_class_under(mHpricot, "DocType", structAttr);
876
+ rb_define_method(cDocType, "raw_string", hpricot_ele_get_name, 0);
877
+ rb_define_method(cDocType, "clear_raw", hpricot_ele_clear_name, 0);
878
+ rb_define_method(cDocType, "target", hpricot_ele_get_target, 0);
879
+ rb_define_method(cDocType, "target=", hpricot_ele_set_target, 1);
880
+ rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
881
+ rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
882
+ rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
883
+ rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
884
+ cElem = rb_define_class_under(mHpricot, "Elem", structElem);
885
+ rb_define_method(cElem, "clear_raw", hpricot_ele_clear_raw, 0);
886
+ cBogusETag = rb_define_class_under(mHpricot, "BogusETag", structAttr);
887
+ rb_define_method(cBogusETag, "raw_string", hpricot_ele_get_attr, 0);
888
+ rb_define_method(cBogusETag, "clear_raw", hpricot_ele_clear_attr, 0);
889
+ cText = rb_define_class_under(mHpricot, "Text", structBasic);
890
+ rb_define_method(cText, "raw_string", hpricot_ele_get_name, 0);
891
+ rb_define_method(cText, "clear_raw", hpricot_ele_clear_name, 0);
892
+ rb_define_method(cText, "content", hpricot_ele_get_name, 0);
893
+ rb_define_method(cText, "content=", hpricot_ele_set_name, 1);
894
+ cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", structAttr);
895
+ rb_define_method(cXMLDecl, "raw_string", hpricot_ele_get_name, 0);
896
+ rb_define_method(cXMLDecl, "clear_raw", hpricot_ele_clear_name, 0);
897
+ rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
898
+ rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
899
+ rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
900
+ rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
901
+ rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
902
+ rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
903
+ cProcIns = rb_define_class_under(mHpricot, "ProcIns", structAttr);
904
+ rb_define_method(cProcIns, "target", hpricot_ele_get_name, 0);
905
+ rb_define_method(cProcIns, "target=", hpricot_ele_set_name, 1);
906
+ rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
907
+ rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
908
+
909
+ rb_const_set(mHpricot, rb_intern("ProcInsParse"),
910
+ reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
911
+ }