hpricot 0.6-jruby

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/CHANGELOG +62 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +211 -0
  5. data/ext/hpricot_scan/HpricotScanService.java +1340 -0
  6. data/ext/hpricot_scan/extconf.rb +6 -0
  7. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  8. data/ext/hpricot_scan/hpricot_scan.c +5976 -0
  9. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  10. data/ext/hpricot_scan/hpricot_scan.java.rl +363 -0
  11. data/ext/hpricot_scan/hpricot_scan.rl +273 -0
  12. data/extras/mingw-rbconfig.rb +176 -0
  13. data/lib/hpricot.rb +26 -0
  14. data/lib/hpricot/blankslate.rb +63 -0
  15. data/lib/hpricot/builder.rb +200 -0
  16. data/lib/hpricot/elements.rb +510 -0
  17. data/lib/hpricot/htmlinfo.rb +672 -0
  18. data/lib/hpricot/inspect.rb +107 -0
  19. data/lib/hpricot/modules.rb +37 -0
  20. data/lib/hpricot/parse.rb +297 -0
  21. data/lib/hpricot/tag.rb +228 -0
  22. data/lib/hpricot/tags.rb +164 -0
  23. data/lib/hpricot/traverse.rb +821 -0
  24. data/lib/hpricot/xchar.rb +94 -0
  25. data/lib/i686-linux/hpricot_scan.jar +0 -0
  26. data/test/files/basic.xhtml +17 -0
  27. data/test/files/boingboing.html +2266 -0
  28. data/test/files/cy0.html +3653 -0
  29. data/test/files/immob.html +400 -0
  30. data/test/files/pace_application.html +1320 -0
  31. data/test/files/tenderlove.html +16 -0
  32. data/test/files/uswebgen.html +220 -0
  33. data/test/files/utf8.html +1054 -0
  34. data/test/files/week9.html +1723 -0
  35. data/test/files/why.xml +19 -0
  36. data/test/load_files.rb +7 -0
  37. data/test/test_alter.rb +65 -0
  38. data/test/test_builder.rb +24 -0
  39. data/test/test_parser.rb +379 -0
  40. data/test/test_paths.rb +16 -0
  41. data/test/test_preserved.rb +66 -0
  42. data/test/test_xml.rb +28 -0
  43. metadata +98 -0
@@ -0,0 +1,79 @@
1
+ /*
2
+ * hpricot_scan.h
3
+ *
4
+ * $Author: why $
5
+ * $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
6
+ *
7
+ * Copyright (C) 2006 why the lucky stiff
8
+ * You can redistribute it and/or modify it under the same terms as Ruby.
9
+ */
10
+
11
+ #ifndef hpricot_scan_h
12
+ #define hpricot_scan_h
13
+
14
+ #include <sys/types.h>
15
+
16
+ #if defined(_WIN32)
17
+ #include <stddef.h>
18
+ #endif
19
+
20
+ /*
21
+ * Memory Allocation
22
+ */
23
+ #if defined(HAVE_ALLOCA_H) && !defined(__GNUC__)
24
+ #include <alloca.h>
25
+ #endif
26
+
27
#ifndef NULL
# define NULL ((void *)0)
#endif

/* Default scanner buffer size. */
#define BUFSIZE 16384

/*
 * Typed allocation helpers built on malloc/realloc/free.
 *
 * Each expression macro is fully parenthesized so it can be used inside a
 * larger expression.  S_REALLOC_N assigns realloc's result straight back to
 * `var`; on failure `var` becomes NULL and the old block is not freed by
 * this macro.
 */
#define S_ALLOC_N(type,n) ((type*)malloc(sizeof(type)*(n)))
#define S_ALLOC(type) ((type*)malloc(sizeof(type)))
#define S_REALLOC_N(var,type,n) ((var)=(type*)realloc((char*)(var),sizeof(type)*(n)))

/*
 * Frees `n` and resets it to NULL.  Wrapped in do/while(0) so that
 * `if (c) S_FREE(p); else ...` parses as a single statement; callers
 * terminate the invocation with a semicolon.
 */
#define S_FREE(n) do { free(n); (n) = NULL; } while (0)

/* Stack allocation of `n` elements of `type` via alloca. */
#define S_ALLOCA_N(type,n) ((type*)alloca(sizeof(type)*(n)))

/* Typed wrappers around memset/memcpy/memmove/memcmp; `n` counts elements. */
#define S_MEMZERO(p,type,n) memset((p), 0, sizeof(type)*(n))
#define S_MEMCPY(p1,p2,type,n) memcpy((p1), (p2), sizeof(type)*(n))
#define S_MEMMOVE(p1,p2,type,n) memmove((p1), (p2), sizeof(type)*(n))
#define S_MEMCMP(p1,p2,type,n) memcmp((p1), (p2), sizeof(type)*(n))
44
+
45
/* A scanned element handed to callbacks: two opaque pointers. */
typedef struct {
  void *name;
  void *attributes;
} hpricot_element;

/* Callback signature: receives the user `data` pointer and the element. */
typedef void (*hpricot_element_cb)(void *data, hpricot_element *token);

/*
 * Scanner state plus one callback slot per token kind.
 * Note: the struct tag is `hpricot_scan` but the typedef name is `http_scan`.
 */
typedef struct hpricot_scan {
  int lineno, cs;
  size_t nread, mark;

  void *data;

  hpricot_element_cb xmldecl, doctype, xmlprocins, starttag,
                     endtag, emptytag, comment, cdata;

} http_scan;
70
+
71
+ // int hpricot_scan_init(hpricot_scan *scan);
72
+ // int hpricot_scan_finish(hpricot_scan *scan);
73
+ // size_t hpricot_scan_execute(hpricot_scan *scan, const char *data, size_t len, size_t off);
74
+ // int hpricot_scan_has_error(hpricot_scan *scan);
75
+ // int hpricot_scan_is_finished(hpricot_scan *scan);
76
+ //
77
+ // #define hpricot_scan_nread(scan) (scan)->nread
78
+
79
+ #endif
@@ -0,0 +1,363 @@
1
+
2
+ import java.io.IOException;
3
+
4
+ import org.jruby.Ruby;
5
+ import org.jruby.RubyClass;
6
+ import org.jruby.RubyHash;
7
+ import org.jruby.RubyModule;
8
+ import org.jruby.RubyNumeric;
9
+ import org.jruby.RubyString;
10
+ import org.jruby.runtime.Block;
11
+ import org.jruby.runtime.CallbackFactory;
12
+ import org.jruby.runtime.builtin.IRubyObject;
13
+ import org.jruby.exceptions.RaiseException;
14
+ import org.jruby.runtime.load.BasicLibraryService;
15
+
16
+ public class HpricotScanService implements BasicLibraryService {
17
+ public static String NO_WAY_SERIOUSLY="*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!";
18
+
19
+ public void ELE(IRubyObject N) {
20
+ if (tokend > tokstart || text) {
21
+ IRubyObject raw_string = runtime.getNil();
22
+ ele_open = false; text = false;
23
+ if (tokstart != -1 && N != cdata && N != sym_text && N != procins && N != comment) {
24
+ raw_string = runtime.newString(new String(buf,tokstart,tokend-tokstart));
25
+ }
26
+ rb_yield_tokens(N, tag[0], attr, raw_string, taint);
27
+ }
28
+ }
29
+
30
+ public void SET(IRubyObject[] N, int E) {
31
+ int mark = 0;
32
+ if(N == tag) {
33
+ if(mark_tag == -1 || E == mark_tag) {
34
+ tag[0] = runtime.newString("");
35
+ } else if(E > mark_tag) {
36
+ tag[0] = runtime.newString(new String(buf,mark_tag, E-mark_tag));
37
+ }
38
+ } else if(N == akey) {
39
+ if(mark_akey == -1 || E == mark_akey) {
40
+ akey[0] = runtime.newString("");
41
+ } else if(E > mark_akey) {
42
+ akey[0] = runtime.newString(new String(buf,mark_akey, E-mark_akey));
43
+ }
44
+ } else if(N == aval) {
45
+ if(mark_aval == -1 || E == mark_aval) {
46
+ aval[0] = runtime.newString("");
47
+ } else if(E > mark_aval) {
48
+ aval[0] = runtime.newString(new String(buf,mark_aval, E-mark_aval));
49
+ }
50
+ }
51
+ }
52
+
53
+ public void CAT(IRubyObject[] N, int E) {
54
+ if(N[0].isNil()) {
55
+ SET(N,E);
56
+ } else {
57
+ int mark = 0;
58
+ if(N == tag) {
59
+ mark = mark_tag;
60
+ } else if(N == akey) {
61
+ mark = mark_akey;
62
+ } else if(N == aval) {
63
+ mark = mark_aval;
64
+ }
65
+ ((RubyString)(N[0])).append(runtime.newString(new String(buf, mark, E-mark)));
66
+ }
67
+ }
68
+
69
+ public void SLIDE(Object N) {
70
+ int mark = 0;
71
+ if(N == tag) {
72
+ mark = mark_tag;
73
+ } else if(N == akey) {
74
+ mark = mark_akey;
75
+ } else if(N == aval) {
76
+ mark = mark_aval;
77
+ }
78
+ if(mark > tokstart) {
79
+ if(N == tag) {
80
+ mark_tag -= tokstart;
81
+ } else if(N == akey) {
82
+ mark_akey -= tokstart;
83
+ } else if(N == aval) {
84
+ mark_aval -= tokstart;
85
+ }
86
+ }
87
+ }
88
+
89
+ public void ATTR(IRubyObject K, IRubyObject V) {
90
+ if(!K.isNil()) {
91
+ if(attr.isNil()) {
92
+ attr = RubyHash.newHash(runtime);
93
+ }
94
+ ((RubyHash)attr).aset(K,V);
95
+ }
96
+ }
97
+
98
+ public void ATTR(IRubyObject[] K, IRubyObject V) {
99
+ ATTR(K[0],V);
100
+ }
101
+
102
+ public void ATTR(IRubyObject K, IRubyObject[] V) {
103
+ ATTR(K,V[0]);
104
+ }
105
+
106
+ public void ATTR(IRubyObject[] K, IRubyObject[] V) {
107
+ ATTR(K[0],V[0]);
108
+ }
109
+
110
+ public void TEXT_PASS() {
111
+ if(!text) {
112
+ if(ele_open) {
113
+ ele_open = false;
114
+ if(tokstart > -1) {
115
+ mark_tag = tokstart;
116
+ }
117
+ } else {
118
+ mark_tag = p;
119
+ }
120
+ attr = runtime.getNil();
121
+ tag[0] = runtime.getNil();
122
+ text = true;
123
+ }
124
+ }
125
+
126
+ public void EBLK(IRubyObject N, int T) {
127
+ CAT(tag, p - T + 1);
128
+ ELE(N);
129
+ }
130
+
131
+
132
+ public void rb_raise(RubyClass error, String message) {
133
+ throw new RaiseException(runtime, error, message, true);
134
+ }
135
+
136
+ public IRubyObject rb_str_new2(String s) {
137
+ return runtime.newString(s);
138
+ }
139
+
140
+ %%{
141
+ machine hpricot_scan;
142
+
143
+ action newEle {
144
+ if (text) {
145
+ CAT(tag, p);
146
+ ELE(sym_text);
147
+ text = false;
148
+ }
149
+ attr = runtime.getNil();
150
+ tag[0] = runtime.getNil();
151
+ mark_tag = -1;
152
+ ele_open = true;
153
+ }
154
+
155
+ action _tag { mark_tag = p; }
156
+ action _aval { mark_aval = p; }
157
+ action _akey { mark_akey = p; }
158
+ action tag { SET(tag, p); }
159
+ action tagc { SET(tag, p-1); }
160
+ action aval { SET(aval, p); }
161
+ action aunq {
162
+ if (buf[p-1] == '"' || buf[p-1] == '\'') { SET(aval, p-1); }
163
+ else { SET(aval, p); }
164
+ }
165
+ action akey { SET(akey, p); }
166
+ action xmlver { SET(aval, p); ATTR(rb_str_new2("version"), aval); }
167
+ action xmlenc { SET(aval, p); ATTR(rb_str_new2("encoding"), aval); }
168
+ action xmlsd { SET(aval, p); ATTR(rb_str_new2("standalone"), aval); }
169
+ action pubid { SET(aval, p); ATTR(rb_str_new2("public_id"), aval); }
170
+ action sysid { SET(aval, p); ATTR(rb_str_new2("system_id"), aval); }
171
+
172
+ action new_attr {
173
+ akey[0] = runtime.getNil();
174
+ aval[0] = runtime.getNil();
175
+ mark_akey = -1;
176
+ mark_aval = -1;
177
+ }
178
+
179
+ action save_attr {
180
+ ATTR(akey, aval);
181
+ }
182
+
183
+ include hpricot_common "ext/hpricot_scan/hpricot_common.rl";
184
+
185
+ }%%
186
+
187
+ %% write data nofinal;
188
+
189
+ public final static int BUFSIZE=16384;
190
+
191
+ private void rb_yield_tokens(IRubyObject sym, IRubyObject tag, IRubyObject attr, IRubyObject raw, boolean taint) {
192
+ IRubyObject ary;
193
+ if (sym == runtime.newSymbol("text")) {
194
+ raw = tag;
195
+ }
196
+ ary = runtime.newArray(new IRubyObject[]{sym, tag, attr, raw});
197
+ if (taint) {
198
+ ary.setTaint(true);
199
+ tag.setTaint(true);
200
+ attr.setTaint(true);
201
+ raw.setTaint(true);
202
+ }
203
+ block.yield(runtime.getCurrentContext(), ary, null, null, false);
204
+ }
205
+
206
+
207
+ int cs, act, have = 0, nread = 0, curline = 1, p=-1;
208
+ boolean text = false;
209
+ int tokstart=-1, tokend;
210
+ char[] buf;
211
+ Ruby runtime;
212
+ IRubyObject attr, bufsize;
213
+ IRubyObject[] tag, akey, aval;
214
+ int mark_tag, mark_akey, mark_aval;
215
+ boolean done = false, ele_open = false;
216
+ int buffer_size = 0;
217
+ boolean taint = false;
218
+ Block block = null;
219
+
220
+
221
+ IRubyObject xmldecl, doctype, procins, stag, etag, emptytag, comment,
222
+ cdata, sym_text;
223
+
224
+ IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
225
+ attr = bufsize = runtime.getNil();
226
+ tag = new IRubyObject[]{runtime.getNil()};
227
+ akey = new IRubyObject[]{runtime.getNil()};
228
+ aval = new IRubyObject[]{runtime.getNil()};
229
+
230
+ RubyClass rb_eHpricotParseError = runtime.getModule("Hpricot").getClass("ParseError");
231
+
232
+ taint = port.isTaint();
233
+ if ( !port.respondsTo("read")) {
234
+ if ( port.respondsTo("to_str")) {
235
+ port = port.callMethod(runtime.getCurrentContext(),"to_str");
236
+ } else {
237
+ throw runtime.newArgumentError("bad Hpricot argument, String or IO only please.");
238
+ }
239
+ }
240
+
241
+ buffer_size = BUFSIZE;
242
+ if (recv.getInstanceVariable("@buffer_size") != null) {
243
+ bufsize = recv.getInstanceVariable("@buffer_size");
244
+ if (!bufsize.isNil()) {
245
+ buffer_size = RubyNumeric.fix2int(bufsize);
246
+ }
247
+ }
248
+ buf = new char[buffer_size];
249
+
250
+ %% write init;
251
+
252
+ while( !done ) {
253
+ IRubyObject str;
254
+ p = have;
255
+ int pe;
256
+ int len, space = buffer_size - have;
257
+
258
+ if ( space == 0 ) {
259
+ /* We've used up the entire buffer storing an already-parsed token
260
+ * prefix that must be preserved. Likely caused by super-long attributes.
261
+ * See ticket #13. */
262
+ rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <" + tag.toString() + ">, starting on line "+curline+".");
263
+ }
264
+
265
+ if (port.respondsTo("read")) {
266
+ str = port.callMethod(runtime.getCurrentContext(),"read",runtime.newFixnum(space));
267
+ } else {
268
+ str = ((RubyString)port).substr(nread,space);
269
+ }
270
+
271
+ str = str.convertToString();
272
+ String sss = str.toString();
273
+ char[] chars = sss.toCharArray();
274
+ System.arraycopy(chars,0,buf,p,chars.length);
275
+
276
+ len = sss.length();
277
+ nread += len;
278
+
279
+ if ( len < space ) {
280
+ len++;
281
+ done = true;
282
+ }
283
+
284
+ pe = p + len;
285
+ char[] data = buf;
286
+
287
+ %% write exec;
288
+
289
+ if ( cs == hpricot_scan_error ) {
290
+ if(!tag[0].isNil()) {
291
+ rb_raise(rb_eHpricotParseError, "parse error on element <"+tag.toString()+">, starting on line "+curline+".\n" + NO_WAY_SERIOUSLY);
292
+ } else {
293
+ rb_raise(rb_eHpricotParseError, "parse error on line "+curline+".\n" + NO_WAY_SERIOUSLY);
294
+ }
295
+ }
296
+
297
+ if ( done && ele_open ) {
298
+ ele_open = false;
299
+ if(tokstart > -1) {
300
+ mark_tag = tokstart;
301
+ tokstart = -1;
302
+ text = true;
303
+ }
304
+ }
305
+
306
+ if(tokstart == -1) {
307
+ have = 0;
308
+ /* text nodes have no tokstart because each byte is parsed alone */
309
+ if(mark_tag != -1 && text) {
310
+ if (done) {
311
+ if(mark_tag < p-1) {
312
+ CAT(tag, p-1);
313
+ ELE(sym_text);
314
+ }
315
+ } else {
316
+ CAT(tag, p);
317
+ }
318
+ }
319
+ mark_tag = 0;
320
+ } else {
321
+ have = pe - tokstart;
322
+ System.arraycopy(buf,tokstart,buf,0,have);
323
+ SLIDE(tag);
324
+ SLIDE(akey);
325
+ SLIDE(aval);
326
+ tokend = (tokend - tokstart);
327
+ tokstart = 0;
328
+ }
329
+ }
330
+ return runtime.getNil();
331
+ }
332
+
333
+ public static IRubyObject __hpricot_scan(IRubyObject recv, IRubyObject port, Block block) {
334
+ Ruby runtime = recv.getRuntime();
335
+ HpricotScanService service = new HpricotScanService();
336
+ service.runtime = runtime;
337
+ service.xmldecl = runtime.newSymbol("xmldecl");
338
+ service.doctype = runtime.newSymbol("doctype");
339
+ service.procins = runtime.newSymbol("procins");
340
+ service.stag = runtime.newSymbol("stag");
341
+ service.etag = runtime.newSymbol("etag");
342
+ service.emptytag = runtime.newSymbol("emptytag");
343
+ service.comment = runtime.newSymbol("comment");
344
+ service.cdata = runtime.newSymbol("cdata");
345
+ service.sym_text = runtime.newSymbol("text");
346
+ service.block = block;
347
+ return service.hpricot_scan(recv, port);
348
+ }
349
+
350
+
351
+ public boolean basicLoad(final Ruby runtime) throws IOException {
352
+ Init_hpricot_scan(runtime);
353
+ return true;
354
+ }
355
+
356
+ public static void Init_hpricot_scan(Ruby runtime) {
357
+ RubyModule mHpricot = runtime.defineModule("Hpricot");
358
+ mHpricot.getMetaClass().attr_accessor(new IRubyObject[]{runtime.newSymbol("buffer_size")});
359
+ CallbackFactory fact = runtime.callbackFactory(HpricotScanService.class);
360
+ mHpricot.getMetaClass().defineMethod("scan",fact.getSingletonMethod("__hpricot_scan",IRubyObject.class));
361
+ mHpricot.defineClassUnder("ParseError",runtime.getClass("Exception"),runtime.getClass("Exception").getAllocator());
362
+ }
363
+ }
@@ -0,0 +1,273 @@
1
+ /*
2
+ * hpricot_scan.rl
3
+ *
4
+ * $Author: why $
5
+ * $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
6
+ *
7
+ * Copyright (C) 2006 why the lucky stiff
8
+ */
9
+ #include <ruby.h>
10
+
11
+ #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
12
+
13
+ static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
14
+ sym_cdata, sym_text;
15
+ static VALUE rb_eHpricotParseError;
16
+ static ID s_read, s_to_str;
17
+
18
/*
 * Scanner helper macros.  They read and write locals of hpricot_scan()
 * (tokstart, tokend, text, ele_open, tag, attr, taint, buf, p, mark_*), so
 * they are only usable inside that function and the Ragel actions it hosts.
 *
 * Every macro is a single do { ... } while (0) statement and must be
 * followed by a semicolon at the call site; this keeps them safe under
 * unbraced if/else.  N is pasted into mark_##N / sym_##N and therefore must
 * be a bare identifier.
 */

/* Yield the current token as kind sym_##N if there is anything to emit. */
#define ELE(N) \
  do { \
    if (tokend > tokstart || text == 1) { \
      VALUE raw_string = Qnil; \
      ele_open = 0; text = 0; \
      if (tokstart != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
        raw_string = rb_str_new(tokstart, tokend-tokstart); \
      } \
      rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
    } \
  } while (0)

/* N = buffer slice [mark_N, E); "" when the mark is unset or the span is empty. */
#define SET(N, E) \
  do { \
    if (mark_##N == NULL || (E) == mark_##N) \
      N = rb_str_new2(""); \
    else if ((E) > mark_##N) \
      N = rb_str_new(mark_##N, (E) - mark_##N); \
  } while (0)

/* Append [mark_N, E) to N, or SET it when N is still nil. */
#define CAT(N, E) \
  do { \
    if (NIL_P(N)) { SET(N, E); } \
    else { rb_str_cat(N, mark_##N, (E) - mark_##N); } \
  } while (0)

/* Rebase mark_N after the pending token has been moved to the buffer start. */
#define SLIDE(N) \
  do { \
    if ( mark_##N > tokstart ) mark_##N = buf + (mark_##N - tokstart); \
  } while (0)

/* attr[K] = V unless K is nil.  K is evaluated once, so passing
 * rb_str_new2("...") allocates a single string. */
#define ATTR(K, V) \
  do { \
    VALUE hpricot_attr_key_ = (K); \
    if (!NIL_P(hpricot_attr_key_)) { \
      if (NIL_P(attr)) attr = rb_hash_new(); \
      rb_hash_aset(attr, hpricot_attr_key_, (V)); \
    } \
  } while (0)

/* Switch into text mode, choosing where the text run starts. */
#define TEXT_PASS() \
  do { \
    if (text == 0) \
    { \
      if (ele_open == 1) { \
        ele_open = 0; \
        if (tokstart > 0) { \
          mark_tag = tokstart; \
        } \
      } else { \
        mark_tag = p; \
      } \
      attr = Qnil; \
      tag = Qnil; \
      text = 1; \
    } \
  } while (0)

/* Close a block token of kind N whose terminator is T characters long. */
#define EBLK(N, T) \
  do { \
    CAT(tag, p - (T) + 1); \
    ELE(N); \
  } while (0)
61
+
62
+ %%{
63
+ machine hpricot_scan;
64
+
65
+ action newEle {
66
+ if (text == 1) {
67
+ CAT(tag, p);
68
+ ELE(text);
69
+ text = 0;
70
+ }
71
+ attr = Qnil;
72
+ tag = Qnil;
73
+ mark_tag = NULL;
74
+ ele_open = 1;
75
+ }
76
+
77
+ action _tag { mark_tag = p; }
78
+ action _aval { mark_aval = p; }
79
+ action _akey { mark_akey = p; }
80
+ action tag { SET(tag, p); }
81
+ action tagc { SET(tag, p-1); }
82
+ action aval { SET(aval, p); }
83
+ action aunq {
84
+ if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
85
+ else { SET(aval, p); }
86
+ }
87
+ action akey { SET(akey, p); }
88
+ action xmlver { SET(aval, p); ATTR(rb_str_new2("version"), aval); }
89
+ action xmlenc { SET(aval, p); ATTR(rb_str_new2("encoding"), aval); }
90
+ action xmlsd { SET(aval, p); ATTR(rb_str_new2("standalone"), aval); }
91
+ action pubid { SET(aval, p); ATTR(rb_str_new2("public_id"), aval); }
92
+ action sysid { SET(aval, p); ATTR(rb_str_new2("system_id"), aval); }
93
+
94
+ action new_attr {
95
+ akey = Qnil;
96
+ aval = Qnil;
97
+ mark_akey = NULL;
98
+ mark_aval = NULL;
99
+ }
100
+
101
+ action save_attr {
102
+ ATTR(akey, aval);
103
+ }
104
+
105
+ include hpricot_common "ext/hpricot_scan/hpricot_common.rl";
106
+
107
+ }%%
108
+
109
+ %% write data nofinal;
110
+
111
+ #define BUFSIZE 16384
112
+
113
+ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
114
+ {
115
+ VALUE ary;
116
+ if (sym == sym_text) {
117
+ raw = tag;
118
+ }
119
+ ary = rb_ary_new3(4, sym, tag, attr, raw);
120
+ if (taint) {
121
+ OBJ_TAINT(ary);
122
+ OBJ_TAINT(tag);
123
+ OBJ_TAINT(attr);
124
+ OBJ_TAINT(raw);
125
+ }
126
+ rb_yield(ary);
127
+ }
128
+
129
+ VALUE hpricot_scan(VALUE self, VALUE port)
130
+ {
131
+ int cs, act, have = 0, nread = 0, curline = 1, text = 0;
132
+ char *tokstart = 0, *tokend = 0, *buf = NULL;
133
+
134
+ VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
135
+ char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
136
+ int done = 0, ele_open = 0, buffer_size = 0;
137
+
138
+ int taint = OBJ_TAINTED( port );
139
+ if ( !rb_respond_to( port, s_read ) )
140
+ {
141
+ if ( rb_respond_to( port, s_to_str ) )
142
+ {
143
+ port = rb_funcall( port, s_to_str, 0 );
144
+ StringValue(port);
145
+ }
146
+ else
147
+ {
148
+ rb_raise( rb_eArgError, "bad Hpricot argument, String or IO only please." );
149
+ }
150
+ }
151
+
152
+ buffer_size = BUFSIZE;
153
+ if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
154
+ bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
155
+ if (!NIL_P(bufsize)) {
156
+ buffer_size = NUM2INT(bufsize);
157
+ }
158
+ }
159
+ buf = ALLOC_N(char, buffer_size);
160
+
161
+ %% write init;
162
+
163
+ while ( !done ) {
164
+ VALUE str;
165
+ char *p = buf + have, *pe;
166
+ int len, space = buffer_size - have;
167
+
168
+ if ( space == 0 ) {
169
+ /* We've used up the entire buffer storing an already-parsed token
170
+ * prefix that must be preserved. Likely caused by super-long attributes.
171
+ * See ticket #13. */
172
+ rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING(tag)->ptr, curline);
173
+ }
174
+
175
+ if ( rb_respond_to( port, s_read ) )
176
+ {
177
+ str = rb_funcall( port, s_read, 1, INT2FIX(space) );
178
+ }
179
+ else
180
+ {
181
+ str = rb_str_substr( port, nread, space );
182
+ }
183
+
184
+ StringValue(str);
185
+ memcpy( p, RSTRING(str)->ptr, RSTRING(str)->len );
186
+ len = RSTRING(str)->len;
187
+ nread += len;
188
+
189
+ /* If this is the last buffer, tack on an EOF. */
190
+ if ( len < space ) {
191
+ p[len++] = 0;
192
+ done = 1;
193
+ }
194
+
195
+ pe = p + len;
196
+ %% write exec;
197
+
198
+ if ( cs == hpricot_scan_error ) {
199
+ free(buf);
200
+ if ( !NIL_P(tag) )
201
+ {
202
+ rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING(tag)->ptr, curline);
203
+ }
204
+ else
205
+ {
206
+ rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
207
+ }
208
+ }
209
+
210
+ if ( done && ele_open )
211
+ {
212
+ ele_open = 0;
213
+ if (tokstart > 0) {
214
+ mark_tag = tokstart;
215
+ tokstart = 0;
216
+ text = 1;
217
+ }
218
+ }
219
+
220
+ if ( tokstart == 0 )
221
+ {
222
+ have = 0;
223
+ /* text nodes have no tokstart because each byte is parsed alone */
224
+ if ( mark_tag != NULL && text == 1 )
225
+ {
226
+ if (done)
227
+ {
228
+ if (mark_tag < p-1)
229
+ {
230
+ CAT(tag, p-1);
231
+ ELE(text);
232
+ }
233
+ }
234
+ else
235
+ {
236
+ CAT(tag, p);
237
+ }
238
+ }
239
+ mark_tag = buf;
240
+ }
241
+ else
242
+ {
243
+ have = pe - tokstart;
244
+ memmove( buf, tokstart, have );
245
+ SLIDE(tag);
246
+ SLIDE(akey);
247
+ SLIDE(aval);
248
+ tokend = buf + (tokend - tokstart);
249
+ tokstart = buf;
250
+ }
251
+ }
252
+ free(buf);
253
+ }
254
+
255
+ void Init_hpricot_scan()
256
+ {
257
+ VALUE mHpricot = rb_define_module("Hpricot");
258
+ rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
259
+ rb_define_singleton_method(mHpricot, "scan", hpricot_scan, 1);
260
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eException);
261
+
262
+ s_read = rb_intern("read");
263
+ s_to_str = rb_intern("to_str");
264
+ sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
265
+ sym_doctype = ID2SYM(rb_intern("doctype"));
266
+ sym_procins = ID2SYM(rb_intern("procins"));
267
+ sym_stag = ID2SYM(rb_intern("stag"));
268
+ sym_etag = ID2SYM(rb_intern("etag"));
269
+ sym_emptytag = ID2SYM(rb_intern("emptytag"));
270
+ sym_comment = ID2SYM(rb_intern("comment"));
271
+ sym_cdata = ID2SYM(rb_intern("cdata"));
272
+ sym_text = ID2SYM(rb_intern("text"));
273
+ }