hpricot 0.8.2-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. data/CHANGELOG +88 -0
  2. data/COPYING +18 -0
  3. data/README +275 -0
  4. data/Rakefile +272 -0
  5. data/ext/fast_xs/FastXsService.java +1030 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +201 -0
  8. data/ext/hpricot_scan/HpricotCss.java +831 -0
  9. data/ext/hpricot_scan/HpricotScanService.java +2086 -0
  10. data/ext/hpricot_scan/extconf.rb +6 -0
  11. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  12. data/ext/hpricot_scan/hpricot_css.c +3503 -0
  13. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  14. data/ext/hpricot_scan/hpricot_css.rl +115 -0
  15. data/ext/hpricot_scan/hpricot_scan.c +6927 -0
  16. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  17. data/ext/hpricot_scan/hpricot_scan.java.rl +1152 -0
  18. data/ext/hpricot_scan/hpricot_scan.rl +788 -0
  19. data/extras/mingw-rbconfig.rb +176 -0
  20. data/lib/fast_xs.jar +0 -0
  21. data/lib/hpricot.rb +26 -0
  22. data/lib/hpricot/blankslate.rb +63 -0
  23. data/lib/hpricot/builder.rb +216 -0
  24. data/lib/hpricot/elements.rb +510 -0
  25. data/lib/hpricot/htmlinfo.rb +691 -0
  26. data/lib/hpricot/inspect.rb +103 -0
  27. data/lib/hpricot/modules.rb +40 -0
  28. data/lib/hpricot/parse.rb +38 -0
  29. data/lib/hpricot/tag.rb +219 -0
  30. data/lib/hpricot/tags.rb +164 -0
  31. data/lib/hpricot/traverse.rb +839 -0
  32. data/lib/hpricot/xchar.rb +94 -0
  33. data/lib/hpricot_scan.jar +0 -0
  34. data/test/files/basic.xhtml +17 -0
  35. data/test/files/boingboing.html +2266 -0
  36. data/test/files/cy0.html +3653 -0
  37. data/test/files/immob.html +400 -0
  38. data/test/files/pace_application.html +1320 -0
  39. data/test/files/tenderlove.html +16 -0
  40. data/test/files/uswebgen.html +220 -0
  41. data/test/files/utf8.html +1054 -0
  42. data/test/files/week9.html +1723 -0
  43. data/test/files/why.xml +19 -0
  44. data/test/load_files.rb +7 -0
  45. data/test/nokogiri-bench.rb +64 -0
  46. data/test/test_alter.rb +96 -0
  47. data/test/test_builder.rb +37 -0
  48. data/test/test_parser.rb +428 -0
  49. data/test/test_paths.rb +25 -0
  50. data/test/test_preserved.rb +88 -0
  51. data/test/test_xml.rb +28 -0
  52. metadata +112 -0
@@ -0,0 +1,788 @@
1
+ /*
2
+ * hpricot_scan.rl
3
+ *
4
+ * $Author: why $
5
+ * $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
6
+ *
7
+ * Copyright (C) 2006 why the lucky stiff
8
+ */
9
+ #include <ruby.h>
10
+
11
+ #ifndef RARRAY_LEN
12
+ #define RARRAY_LEN(arr) RARRAY(arr)->len
13
+ #define RSTRING_LEN(str) RSTRING(str)->len
14
+ #define RSTRING_PTR(str) RSTRING(str)->ptr
15
+ #endif
16
+
17
+ VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
18
+
19
+ #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
20
+
21
+ static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
22
+ sym_cdata, sym_name, sym_parent, sym_raw_attributes, sym_raw_string, sym_tagno,
23
+ sym_allowed, sym_text, sym_children, sym_EMPTY, sym_CDATA;
24
+ static VALUE mHpricot, rb_eHpricotParseError;
25
+ static VALUE cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cText,
26
+ cXMLDecl, cProcIns, symAllow, symDeny;
27
+ static ID s_ElementContent;
28
+ static ID s_downcase, s_new, s_parent, s_read, s_to_str;
29
+ static VALUE reProcInsParse;
30
+
31
/*
 * Slot indices into the struct-backed Hpricot node objects, followed by the
 * raw slot accessors.  The indices line up with the member order handed to
 * make_hpricot_struct() in Init_hpricot_scan(): name, parent, raw_attributes,
 * etag, raw_string, allowed, tagno, children.
 * H_ELE_GET/H_ELE_SET index RSTRUCT_PTR() directly and do no bounds or type
 * checking.  H_ELE_SET expands to a bare assignment expression.
 */
+ #define H_ELE_TAG 0
32
+ #define H_ELE_PARENT 1
33
+ #define H_ELE_ATTR 2
34
+ #define H_ELE_ETAG 3
35
+ #define H_ELE_RAW 4
36
+ #define H_ELE_EC 5
37
+ #define H_ELE_HASH 6
38
+ #define H_ELE_CHILDREN 7
39
+
40
+ #define H_ELE_GET(ele, idx) RSTRUCT_PTR(ele)[idx]
41
+ #define H_ELE_SET(ele, idx, val) RSTRUCT_PTR(ele)[idx] = val
42
+
43
/* OPT(opts, key): true when opts is not nil and opts[:key] is truthy; `key` is stringified and interned on every use. */
+ #define OPT(opts, key) (!NIL_P(opts) && RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("" # key)))))
44
+
45
/*
 * ELE(N): emit the token that has just been recognised as kind sym_N.
 *
 * Nothing is emitted unless the scanner has a non-empty token (te > ts) or
 * is in text mode (text == 1).  ele_open and text are reset as a side effect.
 * The raw source slice [ts, te) is captured for every kind except cdata,
 * text, procins and comment, and only when ts is set.
 *
 * With a block given to Hpricot.scan the token is yielded through
 * rb_yield_tokens(); the raw slice is passed along as a Ruby String (nil when
 * no slice was captured).  The previous code built that string and then
 * passed Qnil instead, so the raw text never reached the block.
 * Without a block the token is handed to rb_hpricot_token() to build the tree.
 *
 * Expands to a bare `if` statement and relies on these names being in scope
 * at the expansion site: ts, te, text, ele_open, tag, attr, taint, S.
 */
#define ELE(N) \
  if (te > ts || text == 1) { \
    char *raw = NULL; \
    int rawlen = 0; \
    ele_open = 0; text = 0; \
    if (ts != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
      raw = ts; rawlen = te - ts; \
    } \
    if (rb_block_given_p()) { \
      VALUE raw_string = Qnil; \
      if (raw != NULL) raw_string = rb_str_new(raw, rawlen); \
      rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
    } else \
      rb_hpricot_token(S, sym_##N, tag, attr, raw, rawlen, taint); \
  }
60
+
61
/*
 * SET(N, E): assign the Ruby string variable N from the span that starts at
 * mark_N and ends just before E.  N becomes "" when mark_N is NULL or equal
 * to E, and a copy of the bytes [mark_N, E) when E lies past mark_N.  When E
 * lies before mark_N, N is left untouched.
 * Expands to an unbraced if / else-if chain, so wrap it in braces when it is
 * used inside another if/else.
 */
+ #define SET(N, E) \
62
+ if (mark_##N == NULL || E == mark_##N) \
63
+ N = rb_str_new2(""); \
64
+ else if (E > mark_##N) \
65
+ N = rb_str_new(mark_##N, E - mark_##N);
66
+
67
/* CAT(N, E): when N is nil behave like SET(N, E); otherwise append the bytes [mark_N, E) to the existing string N. */
+ #define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
68
+
69
/* SLIDE(N): when mark_N lies past ts, re-express it at the same offset from buf; used in hpricot_scan() after the pending token has been moved to the front of the buffer. */
+ #define SLIDE(N) if (mark_##N > ts) mark_##N = buf + (mark_##N - ts);
70
+
71
/*
 * ATTR(K, V): store K => V in the `attr` hash of the enclosing scope,
 * creating the hash on first use.  Does nothing when K is nil.
 * K is evaluated twice.
 */
+ #define ATTR(K, V) \
72
+ if (!NIL_P(K)) { \
73
+ if (NIL_P(attr)) attr = rb_hash_new(); \
74
+ rb_hash_aset(attr, K, V); \
75
+ }
76
+
77
/*
 * TEXT_PASS(): switch the scanner into text mode if it is not already there.
 * When an element was open (ele_open == 1) it is closed and, if ts is set,
 * the text mark is moved back to ts; otherwise the text mark is placed at
 * the current position p.
 * attr and tag are cleared and text is set to 1.  Does nothing while text is
 * already non-zero.
 */
+ #define TEXT_PASS() \
78
+ if (text == 0) \
79
+ { \
80
+ if (ele_open == 1) { \
81
+ ele_open = 0; \
82
+ if (ts > 0) { \
83
+ mark_tag = ts; \
84
+ } \
85
+ } else { \
86
+ mark_tag = p; \
87
+ } \
88
+ attr = Qnil; \
89
+ tag = Qnil; \
90
+ text = 1; \
91
+ }
92
+
93
/* EBLK(N, T): append the bytes up to p - T + 1 to `tag`, then emit a token of kind N via ELE.  NOTE(review): T looks like the length of the block terminator; confirm against hpricot_common.rl. */
+ #define EBLK(N, T) CAT(tag, p - T + 1); ELE(N);
94
+
95
/*
 * Ragel actions for the hpricot_scan machine.  The grammar itself is pulled
 * in from hpricot_common.rl; the actions below run inside hpricot_scan() and
 * therefore use its locals (p, ts, tag, attr, akey, aval, mark_*, S, opts).
 */
%%{
  machine hpricot_scan;

  # A new element begins: flush any pending text token, then reset the
  # per-element state.
  action newEle {
    if (text == 1) {
      CAT(tag, p);
      ELE(text);
      text = 0;
    }
    attr = Qnil;
    tag = Qnil;
    mark_tag = NULL;
    ele_open = 1;
  }

  # Mark the start of a tag name / attribute value / attribute key.
  action _tag  { mark_tag = p; }
  action _aval { mark_aval = p; }
  action _akey { mark_akey = p; }

  # Capture the marked span into the matching Ruby string.
  action tag  { SET(tag, p); }
  action tagc { SET(tag, p-1); }
  action aval { SET(aval, p); }
  action aunq {
    if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
    else { SET(aval, p); }
  }
  action akey { SET(akey, p); }

  # Fixed-name attributes of the XML declaration and the doctype.
  action xmlver { SET(aval, p); ATTR(ID2SYM(rb_intern("version")), aval); }
  action xmlenc { SET(aval, p); ATTR(ID2SYM(rb_intern("encoding")), aval); }
  action xmlsd  { SET(aval, p); ATTR(ID2SYM(rb_intern("standalone")), aval); }
  action pubid  { SET(aval, p); ATTR(ID2SYM(rb_intern("public_id")), aval); }
  action sysid  { SET(aval, p); ATTR(ID2SYM(rb_intern("system_id")), aval); }

  action new_attr {
    akey = Qnil;
    aval = Qnil;
    mark_akey = NULL;
    mark_aval = NULL;
  }

  # Store the finished attribute.  Keys are downcased unless XML mode is on.
  # S is NULL when Hpricot.scan runs with a block, so the mode is read from
  # the options hash in that case instead of dereferencing S.  A nil key is
  # left alone; ATTR ignores it anyway.
  action save_attr {
    int xml_mode = (S != NULL) ? S->xml : OPT(opts, xml);
    if (!xml_mode && !NIL_P(akey))
      akey = rb_funcall(akey, s_downcase, 0);
    ATTR(akey, aval);
  }

  include hpricot_common "hpricot_common.rl";

}%%
143
+
144
+ %% write data nofinal;
145
+
146
+ #define BUFSIZE 16384
147
+
148
+ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
149
+ {
150
+ VALUE ary;
151
+ if (sym == sym_text) {
152
+ raw = tag;
153
+ }
154
+ ary = rb_ary_new3(4, sym, tag, attr, raw);
155
+ if (taint) {
156
+ OBJ_TAINT(ary);
157
+ OBJ_TAINT(tag);
158
+ OBJ_TAINT(attr);
159
+ OBJ_TAINT(raw);
160
+ }
161
+ rb_yield(ary);
162
+ }
163
+
164
+ #ifndef RHASH_TBL
165
+ /* rb_hash_lookup() is only in Ruby 1.8.7 */
166
+ static VALUE
167
+ our_rb_hash_lookup(VALUE hash, VALUE key)
168
+ {
169
+ VALUE val;
170
+
171
+ if (!st_lookup(RHASH(hash)->tbl, key, &val)) {
172
+ return Qnil; /* without Hash#default */
173
+ }
174
+
175
+ return val;
176
+ }
177
+ #define rb_hash_lookup our_rb_hash_lookup
178
+ #endif
179
+
180
+ static void
181
+ rb_hpricot_add(VALUE focus, VALUE ele)
182
+ {
183
+ VALUE children = H_ELE_GET(focus, H_ELE_CHILDREN);
184
+ if (NIL_P(children))
185
+ H_ELE_SET(focus, H_ELE_CHILDREN, (children = rb_ary_new2(1)));
186
+ rb_ary_push(children, ele);
187
+ H_ELE_SET(ele, H_ELE_PARENT, focus);
188
+ }
189
+
190
+ typedef struct {
191
+ VALUE doc;
192
+ VALUE focus;
193
+ VALUE last;
194
+ VALUE EC;
195
+ unsigned char xml, strict, fixup;
196
+ } hpricot_state;
197
+
198
/*
 * H_PROP(prop, idx): define three static accessors for struct slot `idx`:
 *   hpricot_ele_set_<prop>(self, x)  - stores x in the slot, returns self
 *   hpricot_ele_clear_<prop>(self)   - stores nil in the slot, returns true
 *   hpricot_ele_get_<prop>(self)     - returns the slot value
 */
+ #define H_PROP(prop, idx) \
199
+ static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
200
+ H_ELE_SET(self, idx, x); \
201
+ return self; \
202
+ } \
203
+ static VALUE hpricot_ele_clear_##prop(VALUE self) { \
204
+ H_ELE_SET(self, idx, Qnil); \
205
+ return Qtrue; \
206
+ } \
207
+ static VALUE hpricot_ele_get_##prop(VALUE self) { \
208
+ return H_ELE_GET(self, idx); \
209
+ }
210
+
211
/*
 * H_ATTR(prop): define a static getter and setter for the entry :<prop> of
 * the hash kept in the node's attribute slot (H_ELE_ATTR).
 *   hpricot_ele_set_<prop>(self, x) - stores x under :<prop>, returns self
 *   hpricot_ele_get_<prop>(self)    - returns the value under :<prop>
 *
 * A freshly allocated node has nil in every slot, so the attribute slot may
 * not hold a hash yet.  The setter creates the hash on demand and the getter
 * answers nil, instead of handing nil to rb_hash_aset()/rb_hash_aref().
 */
#define H_ATTR(prop) \
  static VALUE hpricot_ele_set_##prop(VALUE self, VALUE x) { \
    VALUE attrs = H_ELE_GET(self, H_ELE_ATTR); \
    if (NIL_P(attrs)) { \
      attrs = rb_hash_new(); \
      H_ELE_SET(self, H_ELE_ATTR, attrs); \
    } \
    rb_hash_aset(attrs, ID2SYM(rb_intern("" # prop)), x); \
    return self; \
  } \
  static VALUE hpricot_ele_get_##prop(VALUE self) { \
    VALUE attrs = H_ELE_GET(self, H_ELE_ATTR); \
    if (NIL_P(attrs)) \
      return Qnil; \
    return rb_hash_aref(attrs, ID2SYM(rb_intern("" # prop))); \
  }
219
+
220
/*
 * Accessor instantiations.  Each H_PROP line generates set/clear/get
 * functions for one struct slot; each H_ATTR line generates get/set functions
 * for one key of the attribute hash.  Init_hpricot_scan() binds a subset of
 * them to Ruby methods.
 */
+ H_PROP(name, H_ELE_TAG);
221
+ H_PROP(raw, H_ELE_RAW);
222
+ H_PROP(parent, H_ELE_PARENT);
223
+ H_PROP(attr, H_ELE_ATTR);
224
+ H_PROP(etag, H_ELE_ETAG);
225
+ H_PROP(children, H_ELE_CHILDREN);
226
+ H_ATTR(target);
227
+ H_ATTR(encoding);
228
+ H_ATTR(version);
229
+ H_ATTR(standalone);
230
+ H_ATTR(system_id);
231
+ H_ATTR(public_id);
232
+
233
/*
 * H_ELE(klass): allocate a node of `klass` into the local `ele`, fill its
 * slots from the current token, and remember it in S->last.
 *
 *   cElem      - name, attributes and element-content entry are stored; the
 *                raw source text is stored too when `raw` is set and the
 *                token is an emptytag, stag or doctype.
 *   cBogusETag - name is stored; the raw source text (if any) goes into the
 *                attribute slot.
 *   cDocType, cXMLDecl, cProcIns
 *              - for cDocType the tag is first recorded under :target in
 *                `attr`; `attr` is stored; for everything but cProcIns the
 *                name slot receives the raw source text (nil when `raw` is
 *                NULL) instead of the tag.
 *   any other  - only the name slot is stored.
 *
 * Uses and may reassign these locals of the expansion site: ele, tag, attr,
 * ec, raw, rawlen, sym, S.  The expansion is several statements with no
 * trailing semicolon, so it must be used as `H_ELE(x);` inside a braced block.
 */
+ #define H_ELE(klass) \
234
+ ele = rb_obj_alloc(klass); \
235
+ if (klass == cElem) { \
236
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
237
+ H_ELE_SET(ele, H_ELE_ATTR, attr); \
238
+ H_ELE_SET(ele, H_ELE_EC, ec); \
239
+ if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_doctype)) { \
240
+ H_ELE_SET(ele, H_ELE_RAW, rb_str_new(raw, rawlen)); \
241
+ } \
242
+ } else if (klass == cDocType || klass == cProcIns || klass == cXMLDecl || klass == cBogusETag) { \
243
+ if (klass == cBogusETag) { \
244
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
245
+ if (raw != NULL) \
246
+ H_ELE_SET(ele, H_ELE_ATTR, rb_str_new(raw, rawlen)); \
247
+ } else { \
248
+ if (klass == cDocType) \
249
+ ATTR(ID2SYM(rb_intern("target")), tag); \
250
+ H_ELE_SET(ele, H_ELE_ATTR, attr); \
251
+ if (klass != cProcIns) { \
252
+ tag = Qnil; \
253
+ if (raw != NULL) tag = rb_str_new(raw, rawlen); \
254
+ } \
255
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
256
+ } \
257
+ } else { \
258
+ H_ELE_SET(ele, H_ELE_TAG, tag); \
259
+ } \
260
+ S->last = ele
261
+
262
+ //
263
+ // the swift, compact parser logic. most of the complicated stuff is done
264
+ // in the lexer. this step just pairs up the start and end tags.
265
+ //
266
+ void
267
+ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw, int rawlen, int taint)
268
+ {
269
+ VALUE ele, ec = Qnil;
270
+
271
+ //
272
+ // in html mode, fix up start tags incorrectly formed as empty tags
273
+ //
274
+ if (!S->xml) {
275
+ if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
276
+ ec = rb_hash_aref(S->EC, tag);
277
+ if (NIL_P(ec)) {
278
+ tag = rb_funcall(tag, s_downcase, 0);
279
+ ec = rb_hash_aref(S->EC, tag);
280
+ }
281
+ }
282
+
283
+ if (H_ELE_GET(S->focus, H_ELE_EC) == sym_CDATA &&
284
+ (sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
285
+ !(sym == sym_etag && INT2FIX(rb_str_hash(tag)) == H_ELE_GET(S->focus, H_ELE_HASH)))
286
+ {
287
+ sym = sym_text;
288
+ tag = rb_str_new(raw, rawlen);
289
+ }
290
+
291
+ if (!NIL_P(ec)) {
292
+ if (sym == sym_emptytag) {
293
+ if (ec != sym_EMPTY)
294
+ sym = sym_stag;
295
+ } else if (sym == sym_stag) {
296
+ if (ec == sym_EMPTY)
297
+ sym = sym_emptytag;
298
+ }
299
+ }
300
+ }
301
+
302
+ if (sym == sym_emptytag || sym == sym_stag) {
303
+ VALUE name = INT2FIX(rb_str_hash(tag));
304
+ H_ELE(cElem);
305
+ H_ELE_SET(ele, H_ELE_HASH, name);
306
+
307
+ if (!S->xml) {
308
+ VALUE match = Qnil, e = S->focus;
309
+ while (e != S->doc)
310
+ {
311
+ VALUE hEC = H_ELE_GET(e, H_ELE_EC);
312
+
313
+ if (TYPE(hEC) == T_HASH)
314
+ {
315
+ VALUE has = rb_hash_lookup(hEC, name);
316
+ if (has != Qnil) {
317
+ if (has == Qtrue) {
318
+ if (match == Qnil)
319
+ match = e;
320
+ } else if (has == symAllow) {
321
+ match = S->focus;
322
+ } else if (has == symDeny) {
323
+ match = Qnil;
324
+ }
325
+ }
326
+ }
327
+
328
+ e = H_ELE_GET(e, H_ELE_PARENT);
329
+ }
330
+
331
+ if (match == Qnil)
332
+ match = S->focus;
333
+ S->focus = match;
334
+ }
335
+
336
+ rb_hpricot_add(S->focus, ele);
337
+
338
+ //
339
+ // in the case of a start tag that should be empty, just
340
+ // skip the step that focuses the element. focusing moves
341
+ // us deeper into the document.
342
+ //
343
+ if (sym == sym_stag) {
344
+ if (S->xml || ec != sym_EMPTY) {
345
+ S->focus = ele;
346
+ S->last = Qnil;
347
+ }
348
+ }
349
+ } else if (sym == sym_etag) {
350
+ VALUE name, match = Qnil, e = S->focus;
351
+ if (S->strict) {
352
+ if (NIL_P(rb_hash_aref(S->EC, tag))) {
353
+ tag = rb_str_new2("div");
354
+ }
355
+ }
356
+
357
+ //
358
+ // another optimization will be to improve this very simple
359
+ // O(n) tag search, where n is the depth of the focused tag.
360
+ //
361
+ // (see also: the search above for fixups)
362
+ //
363
+ name = INT2FIX(rb_str_hash(tag));
364
+ while (e != S->doc)
365
+ {
366
+ if (H_ELE_GET(e, H_ELE_HASH) == name)
367
+ {
368
+ match = e;
369
+ break;
370
+ }
371
+
372
+ e = H_ELE_GET(e, H_ELE_PARENT);
373
+ }
374
+
375
+ if (NIL_P(match))
376
+ {
377
+ H_ELE(cBogusETag);
378
+ rb_hpricot_add(S->focus, ele);
379
+ }
380
+ else
381
+ {
382
+ VALUE ele = Qnil;
383
+ if (raw != NULL)
384
+ ele = rb_str_new(raw, rawlen);
385
+ H_ELE_SET(match, H_ELE_ETAG, ele);
386
+ S->focus = H_ELE_GET(match, H_ELE_PARENT);
387
+ S->last = Qnil;
388
+ }
389
+ } else if (sym == sym_cdata) {
390
+ H_ELE(cCData);
391
+ rb_hpricot_add(S->focus, ele);
392
+ } else if (sym == sym_comment) {
393
+ H_ELE(cComment);
394
+ rb_hpricot_add(S->focus, ele);
395
+ } else if (sym == sym_doctype) {
396
+ H_ELE(cDocType);
397
+ if (S->strict) {
398
+ rb_hash_aset(attr, ID2SYM(rb_intern("system_id")), rb_str_new2("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"));
399
+ rb_hash_aset(attr, ID2SYM(rb_intern("public_id")), rb_str_new2("-//W3C//DTD XHTML 1.0 Strict//EN"));
400
+ }
401
+ rb_hpricot_add(S->focus, ele);
402
+ } else if (sym == sym_procins) {
403
+ VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
404
+ tag = rb_reg_nth_match(1, match);
405
+ attr = rb_reg_nth_match(2, match);
406
+ {
407
+ H_ELE(cProcIns);
408
+ rb_hpricot_add(S->focus, ele);
409
+ }
410
+ } else if (sym == sym_text) {
411
+ // TODO: add raw_string as well?
412
+ if (!NIL_P(S->last) && RBASIC(S->last)->klass == cText) {
413
+ rb_str_append(H_ELE_GET(S->last, H_ELE_TAG), tag);
414
+ } else {
415
+ H_ELE(cText);
416
+ rb_hpricot_add(S->focus, ele);
417
+ }
418
+ } else if (sym == sym_xmldecl) {
419
+ H_ELE(cXMLDecl);
420
+ rb_hpricot_add(S->focus, ele);
421
+ }
422
+ }
423
+
424
+ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
425
+ {
426
+ int cs, act, have = 0, nread = 0, curline = 1, text = 0, io = 0;
427
+ char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
428
+
429
+ hpricot_state *S = NULL;
430
+ VALUE port, opts;
431
+ VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
432
+ char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
433
+ int done = 0, ele_open = 0, buffer_size = 0, taint = 0;
434
+
435
+ rb_scan_args(argc, argv, "11", &port, &opts);
436
+ taint = OBJ_TAINTED(port);
437
+ io = rb_respond_to(port, s_read);
438
+ if (!io)
439
+ {
440
+ if (rb_respond_to(port, s_to_str))
441
+ {
442
+ port = rb_funcall(port, s_to_str, 0);
443
+ StringValue(port);
444
+ }
445
+ else
446
+ {
447
+ rb_raise(rb_eArgError, "an Hpricot document must be built from an input source (a String or IO object.)");
448
+ }
449
+ }
450
+
451
+ if (TYPE(opts) != T_HASH)
452
+ opts = Qnil;
453
+
454
+ if (!rb_block_given_p())
455
+ {
456
+ S = ALLOC(hpricot_state);
457
+ S->doc = rb_obj_alloc(cDoc);
458
+ rb_gc_register_address(&S->doc);
459
+ S->focus = S->doc;
460
+ S->last = Qnil;
461
+ S->xml = OPT(opts, xml);
462
+ S->strict = OPT(opts, xhtml_strict);
463
+ S->fixup = OPT(opts, fixup_tags);
464
+ if (S->strict) S->fixup = 1;
465
+ rb_ivar_set(S->doc, rb_intern("@options"), opts);
466
+
467
+ S->EC = rb_const_get(mHpricot, s_ElementContent);
468
+ }
469
+
470
+ buffer_size = BUFSIZE;
471
+ if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
472
+ bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
473
+ if (!NIL_P(bufsize)) {
474
+ buffer_size = NUM2INT(bufsize);
475
+ }
476
+ }
477
+
478
+ if (io)
479
+ buf = ALLOC_N(char, buffer_size);
480
+
481
+ %% write init;
482
+
483
+ while (!done) {
484
+ VALUE str;
485
+ char *p, *pe;
486
+ int len, space = buffer_size - have, tokstart_diff, tokend_diff, mark_tag_diff, mark_akey_diff, mark_aval_diff;
487
+
488
+ if (io)
489
+ {
490
+ if (space == 0) {
491
+ /* We've used up the entire buffer storing an already-parsed token
492
+ * prefix that must be preserved. Likely caused by super-long attributes.
493
+ * Increase buffer size and continue */
494
+ tokstart_diff = ts - buf;
495
+ tokend_diff = te - buf;
496
+ mark_tag_diff = mark_tag - buf;
497
+ mark_akey_diff = mark_akey - buf;
498
+ mark_aval_diff = mark_aval - buf;
499
+
500
+ buffer_size += BUFSIZE;
501
+ REALLOC_N(buf, char, buffer_size);
502
+
503
+ space = buffer_size - have;
504
+
505
+ ts = buf + tokstart_diff;
506
+ te = buf + tokend_diff;
507
+ mark_tag = buf + mark_tag_diff;
508
+ mark_akey = buf + mark_akey_diff;
509
+ mark_aval = buf + mark_aval_diff;
510
+ }
511
+ p = buf + have;
512
+
513
+ str = rb_funcall(port, s_read, 1, INT2FIX(space));
514
+ len = RSTRING_LEN(str);
515
+ memcpy(p, StringValuePtr(str), len);
516
+ }
517
+ else
518
+ {
519
+ p = RSTRING_PTR(port);
520
+ len = RSTRING_LEN(port) + 1;
521
+ done = 1;
522
+ }
523
+
524
+ nread += len;
525
+
526
+ /* If this is the last buffer, tack on an EOF. */
527
+ if (io && len < space) {
528
+ p[len++] = 0;
529
+ done = 1;
530
+ }
531
+
532
+ pe = p + len;
533
+ %% write exec;
534
+
535
+ if (cs == hpricot_scan_error) {
536
+ if (buf != NULL)
537
+ free(buf);
538
+ if (!NIL_P(tag))
539
+ {
540
+ rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
541
+ }
542
+ else
543
+ {
544
+ rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
545
+ }
546
+ }
547
+
548
+ if (done && ele_open)
549
+ {
550
+ ele_open = 0;
551
+ if (ts > 0) {
552
+ mark_tag = ts;
553
+ ts = 0;
554
+ text = 1;
555
+ }
556
+ }
557
+
558
+ if (ts == 0)
559
+ {
560
+ have = 0;
561
+ /* text nodes have no ts because each byte is parsed alone */
562
+ if (mark_tag != NULL && text == 1)
563
+ {
564
+ if (done)
565
+ {
566
+ if (mark_tag < p-1)
567
+ {
568
+ CAT(tag, p-1);
569
+ ELE(text);
570
+ }
571
+ }
572
+ else
573
+ {
574
+ CAT(tag, p);
575
+ }
576
+ }
577
+ if (io)
578
+ mark_tag = buf;
579
+ else
580
+ mark_tag = RSTRING_PTR(port);
581
+ }
582
+ else if (io)
583
+ {
584
+ have = pe - ts;
585
+ memmove(buf, ts, have);
586
+ SLIDE(tag);
587
+ SLIDE(akey);
588
+ SLIDE(aval);
589
+ te = buf + (te - ts);
590
+ ts = buf;
591
+ }
592
+ }
593
+
594
+ if (buf != NULL)
595
+ free(buf);
596
+
597
+ if (S != NULL)
598
+ {
599
+ VALUE doc = S->doc;
600
+ rb_gc_unregister_address(&S->doc);
601
+ free(S);
602
+ return doc;
603
+ }
604
+
605
+ return Qnil;
606
+ }
607
+
608
+ static VALUE
609
+ alloc_hpricot_struct(VALUE klass)
610
+ {
611
+ VALUE size;
612
+ long n;
613
+ NEWOBJ(st, struct RStruct);
614
+ OBJSETUP(st, klass, T_STRUCT);
615
+
616
+ size = rb_struct_iv_get(klass, "__size__");
617
+ n = FIX2LONG(size);
618
+
619
+ #ifndef RSTRUCT_EMBED_LEN_MAX
620
+ st->ptr = ALLOC_N(VALUE, n);
621
+ rb_mem_clear(st->ptr, n);
622
+ st->len = n;
623
+ #else
624
+ if (0 < n && n <= RSTRUCT_EMBED_LEN_MAX) {
625
+ RBASIC(st)->flags &= ~RSTRUCT_EMBED_LEN_MASK;
626
+ RBASIC(st)->flags |= n << RSTRUCT_EMBED_LEN_SHIFT;
627
+ rb_mem_clear(st->as.ary, n);
628
+ } else {
629
+ st->as.heap.ptr = ALLOC_N(VALUE, n);
630
+ rb_mem_clear(st->as.heap.ptr, n);
631
+ st->as.heap.len = n;
632
+ }
633
+ #endif
634
+
635
+ return (VALUE)st;
636
+ }
637
+
638
+ static VALUE hpricot_struct_ref0(VALUE obj) {return H_ELE_GET(obj, 0);}
639
+ static VALUE hpricot_struct_ref1(VALUE obj) {return H_ELE_GET(obj, 1);}
640
+ static VALUE hpricot_struct_ref2(VALUE obj) {return H_ELE_GET(obj, 2);}
641
+ static VALUE hpricot_struct_ref3(VALUE obj) {return H_ELE_GET(obj, 3);}
642
+ static VALUE hpricot_struct_ref4(VALUE obj) {return H_ELE_GET(obj, 4);}
643
+ static VALUE hpricot_struct_ref5(VALUE obj) {return H_ELE_GET(obj, 5);}
644
+ static VALUE hpricot_struct_ref6(VALUE obj) {return H_ELE_GET(obj, 6);}
645
+ static VALUE hpricot_struct_ref7(VALUE obj) {return H_ELE_GET(obj, 7);}
646
+ static VALUE hpricot_struct_ref8(VALUE obj) {return H_ELE_GET(obj, 8);}
647
+ static VALUE hpricot_struct_ref9(VALUE obj) {return H_ELE_GET(obj, 9);}
648
+
649
+ static VALUE (*ref_func[10])() = {
650
+ hpricot_struct_ref0,
651
+ hpricot_struct_ref1,
652
+ hpricot_struct_ref2,
653
+ hpricot_struct_ref3,
654
+ hpricot_struct_ref4,
655
+ hpricot_struct_ref5,
656
+ hpricot_struct_ref6,
657
+ hpricot_struct_ref7,
658
+ hpricot_struct_ref8,
659
+ hpricot_struct_ref9,
660
+ };
661
+
662
+ static VALUE hpricot_struct_set0(VALUE obj, VALUE val) {return H_ELE_SET(obj, 0, val);}
663
+ static VALUE hpricot_struct_set1(VALUE obj, VALUE val) {return H_ELE_SET(obj, 1, val);}
664
+ static VALUE hpricot_struct_set2(VALUE obj, VALUE val) {return H_ELE_SET(obj, 2, val);}
665
+ static VALUE hpricot_struct_set3(VALUE obj, VALUE val) {return H_ELE_SET(obj, 3, val);}
666
+ static VALUE hpricot_struct_set4(VALUE obj, VALUE val) {return H_ELE_SET(obj, 4, val);}
667
+ static VALUE hpricot_struct_set5(VALUE obj, VALUE val) {return H_ELE_SET(obj, 5, val);}
668
+ static VALUE hpricot_struct_set6(VALUE obj, VALUE val) {return H_ELE_SET(obj, 6, val);}
669
+ static VALUE hpricot_struct_set7(VALUE obj, VALUE val) {return H_ELE_SET(obj, 7, val);}
670
+ static VALUE hpricot_struct_set8(VALUE obj, VALUE val) {return H_ELE_SET(obj, 8, val);}
671
+ static VALUE hpricot_struct_set9(VALUE obj, VALUE val) {return H_ELE_SET(obj, 9, val);}
672
+
673
+ static VALUE (*set_func[10])() = {
674
+ hpricot_struct_set0,
675
+ hpricot_struct_set1,
676
+ hpricot_struct_set2,
677
+ hpricot_struct_set3,
678
+ hpricot_struct_set4,
679
+ hpricot_struct_set5,
680
+ hpricot_struct_set6,
681
+ hpricot_struct_set7,
682
+ hpricot_struct_set8,
683
+ hpricot_struct_set9,
684
+ };
685
+
686
+ static VALUE
687
+ make_hpricot_struct(VALUE members)
688
+ {
689
+ int i = 0;
690
+ VALUE klass = rb_class_new(rb_cObject);
691
+ rb_iv_set(klass, "__size__", INT2NUM(RARRAY_LEN(members)));
692
+ rb_define_alloc_func(klass, alloc_hpricot_struct);
693
+ rb_define_singleton_method(klass, "new", rb_class_new_instance, -1);
694
+ for (i = 0; i < RARRAY_LEN(members); i++) {
695
+ ID id = SYM2ID(RARRAY_PTR(members)[i]);
696
+ rb_define_method_id(klass, id, ref_func[i], 0);
697
+ rb_define_method_id(klass, rb_id_attrset(id), set_func[i], 1);
698
+ }
699
+ return klass;
700
+ }
701
+
702
+ void Init_hpricot_scan()
703
+ {
704
+ VALUE structElem, structAttr, structBasic;
705
+
706
+ s_ElementContent = rb_intern("ElementContent");
707
+ symAllow = ID2SYM(rb_intern("allow"));
708
+ symDeny = ID2SYM(rb_intern("deny"));
709
+ s_downcase = rb_intern("downcase");
710
+ s_new = rb_intern("new");
711
+ s_parent = rb_intern("parent");
712
+ s_read = rb_intern("read");
713
+ s_to_str = rb_intern("to_str");
714
+ sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
715
+ sym_doctype = ID2SYM(rb_intern("doctype"));
716
+ sym_procins = ID2SYM(rb_intern("procins"));
717
+ sym_stag = ID2SYM(rb_intern("stag"));
718
+ sym_etag = ID2SYM(rb_intern("etag"));
719
+ sym_emptytag = ID2SYM(rb_intern("emptytag"));
720
+ sym_allowed = ID2SYM(rb_intern("allowed"));
721
+ sym_children = ID2SYM(rb_intern("children"));
722
+ sym_comment = ID2SYM(rb_intern("comment"));
723
+ sym_cdata = ID2SYM(rb_intern("cdata"));
724
+ sym_name = ID2SYM(rb_intern("name"));
725
+ sym_parent = ID2SYM(rb_intern("parent"));
726
+ sym_raw_attributes = ID2SYM(rb_intern("raw_attributes"));
727
+ sym_raw_string = ID2SYM(rb_intern("raw_string"));
728
+ sym_tagno = ID2SYM(rb_intern("tagno"));
729
+ sym_text = ID2SYM(rb_intern("text"));
730
+ sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
731
+ sym_CDATA = ID2SYM(rb_intern("CDATA"));
732
+
733
+ mHpricot = rb_define_module("Hpricot");
734
+ rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
735
+ rb_define_singleton_method(mHpricot, "scan", hpricot_scan, -1);
736
+ rb_define_singleton_method(mHpricot, "css", hpricot_css, 3);
737
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
738
+
739
+ structElem = make_hpricot_struct(rb_ary_new3(8, sym_name, sym_parent,
740
+ sym_raw_attributes, sym_etag, sym_raw_string, sym_allowed,
741
+ sym_tagno, sym_children));
742
+ structAttr = make_hpricot_struct(rb_ary_new3(3, sym_name, sym_parent, sym_raw_attributes));
743
+ structBasic = make_hpricot_struct(rb_ary_new3(2, sym_name, sym_parent));
744
+
745
+ cDoc = rb_define_class_under(mHpricot, "Doc", structElem);
746
+ cCData = rb_define_class_under(mHpricot, "CData", structBasic);
747
+ rb_define_method(cCData, "content", hpricot_ele_get_name, 0);
748
+ rb_define_method(cCData, "content=", hpricot_ele_set_name, 1);
749
+ cComment = rb_define_class_under(mHpricot, "Comment", structBasic);
750
+ rb_define_method(cComment, "content", hpricot_ele_get_name, 0);
751
+ rb_define_method(cComment, "content=", hpricot_ele_set_name, 1);
752
+ cDocType = rb_define_class_under(mHpricot, "DocType", structAttr);
753
+ rb_define_method(cDocType, "raw_string", hpricot_ele_get_name, 0);
754
+ rb_define_method(cDocType, "clear_raw", hpricot_ele_clear_name, 0);
755
+ rb_define_method(cDocType, "target", hpricot_ele_get_target, 0);
756
+ rb_define_method(cDocType, "target=", hpricot_ele_set_target, 1);
757
+ rb_define_method(cDocType, "public_id", hpricot_ele_get_public_id, 0);
758
+ rb_define_method(cDocType, "public_id=", hpricot_ele_set_public_id, 1);
759
+ rb_define_method(cDocType, "system_id", hpricot_ele_get_system_id, 0);
760
+ rb_define_method(cDocType, "system_id=", hpricot_ele_set_system_id, 1);
761
+ cElem = rb_define_class_under(mHpricot, "Elem", structElem);
762
+ rb_define_method(cElem, "clear_raw", hpricot_ele_clear_raw, 0);
763
+ cBogusETag = rb_define_class_under(mHpricot, "BogusETag", structAttr);
764
+ rb_define_method(cBogusETag, "raw_string", hpricot_ele_get_attr, 0);
765
+ rb_define_method(cBogusETag, "clear_raw", hpricot_ele_clear_attr, 0);
766
+ cText = rb_define_class_under(mHpricot, "Text", structBasic);
767
+ rb_define_method(cText, "raw_string", hpricot_ele_get_name, 0);
768
+ rb_define_method(cText, "clear_raw", hpricot_ele_clear_name, 0);
769
+ rb_define_method(cText, "content", hpricot_ele_get_name, 0);
770
+ rb_define_method(cText, "content=", hpricot_ele_set_name, 1);
771
+ cXMLDecl = rb_define_class_under(mHpricot, "XMLDecl", structAttr);
772
+ rb_define_method(cXMLDecl, "raw_string", hpricot_ele_get_name, 0);
773
+ rb_define_method(cXMLDecl, "clear_raw", hpricot_ele_clear_name, 0);
774
+ rb_define_method(cXMLDecl, "encoding", hpricot_ele_get_encoding, 0);
775
+ rb_define_method(cXMLDecl, "encoding=", hpricot_ele_set_encoding, 1);
776
+ rb_define_method(cXMLDecl, "standalone", hpricot_ele_get_standalone, 0);
777
+ rb_define_method(cXMLDecl, "standalone=", hpricot_ele_set_standalone, 1);
778
+ rb_define_method(cXMLDecl, "version", hpricot_ele_get_version, 0);
779
+ rb_define_method(cXMLDecl, "version=", hpricot_ele_set_version, 1);
780
+ cProcIns = rb_define_class_under(mHpricot, "ProcIns", structAttr);
781
+ rb_define_method(cProcIns, "target", hpricot_ele_get_name, 0);
782
+ rb_define_method(cProcIns, "target=", hpricot_ele_set_name, 1);
783
+ rb_define_method(cProcIns, "content", hpricot_ele_get_attr, 0);
784
+ rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
785
+
786
+ rb_const_set(mHpricot, rb_intern("ProcInsParse"),
787
+ reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
788
+ }