hpricot 0.6-jruby

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. data/CHANGELOG +62 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +211 -0
  5. data/ext/hpricot_scan/HpricotScanService.java +1340 -0
  6. data/ext/hpricot_scan/extconf.rb +6 -0
  7. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  8. data/ext/hpricot_scan/hpricot_scan.c +5976 -0
  9. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  10. data/ext/hpricot_scan/hpricot_scan.java.rl +363 -0
  11. data/ext/hpricot_scan/hpricot_scan.rl +273 -0
  12. data/extras/mingw-rbconfig.rb +176 -0
  13. data/lib/hpricot.rb +26 -0
  14. data/lib/hpricot/blankslate.rb +63 -0
  15. data/lib/hpricot/builder.rb +200 -0
  16. data/lib/hpricot/elements.rb +510 -0
  17. data/lib/hpricot/htmlinfo.rb +672 -0
  18. data/lib/hpricot/inspect.rb +107 -0
  19. data/lib/hpricot/modules.rb +37 -0
  20. data/lib/hpricot/parse.rb +297 -0
  21. data/lib/hpricot/tag.rb +228 -0
  22. data/lib/hpricot/tags.rb +164 -0
  23. data/lib/hpricot/traverse.rb +821 -0
  24. data/lib/hpricot/xchar.rb +94 -0
  25. data/lib/i686-linux/hpricot_scan.jar +0 -0
  26. data/test/files/basic.xhtml +17 -0
  27. data/test/files/boingboing.html +2266 -0
  28. data/test/files/cy0.html +3653 -0
  29. data/test/files/immob.html +400 -0
  30. data/test/files/pace_application.html +1320 -0
  31. data/test/files/tenderlove.html +16 -0
  32. data/test/files/uswebgen.html +220 -0
  33. data/test/files/utf8.html +1054 -0
  34. data/test/files/week9.html +1723 -0
  35. data/test/files/why.xml +19 -0
  36. data/test/load_files.rb +7 -0
  37. data/test/test_alter.rb +65 -0
  38. data/test/test_builder.rb +24 -0
  39. data/test/test_parser.rb +379 -0
  40. data/test/test_paths.rb +16 -0
  41. data/test/test_preserved.rb +66 -0
  42. data/test/test_xml.rb +28 -0
  43. metadata +98 -0
@@ -0,0 +1,79 @@
1
+ /*
2
+ * hpricot_scan.h
3
+ *
4
+ * $Author: why $
5
+ * $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
6
+ *
7
+ * Copyright (C) 2006 why the lucky stiff
8
+ * You can redistribute it and/or modify it under the same terms as Ruby.
9
+ */
10
+
11
+ #ifndef hpricot_scan_h
12
+ #define hpricot_scan_h
13
+
14
+ #include <sys/types.h>
15
+
16
+ #if defined(_WIN32)
17
+ #include <stddef.h>
18
+ #endif
19
+
20
+ /*
21
+ * Memory Allocation
22
+ */
23
+ #if defined(HAVE_ALLOCA_H) && !defined(__GNUC__)
24
+ #include <alloca.h>
25
+ #endif
26
+
27
+ #ifndef NULL
28
+ # define NULL (void *)0
29
+ #endif
30
+
31
+ #define BUFSIZE 16384
32
+
33
/*
 * Heap allocation helpers built on malloc/realloc/free (the including file
 * must provide <stdlib.h>; S_ALLOCA_N additionally needs alloca).
 *
 * Every expression-like macro is wrapped in parentheses so that the cast
 * binds to the call and not to whatever follows the macro (for example
 * S_ALLOC_N(int, 4)[0]).
 *
 * S_REALLOC_N assigns the realloc() result straight back to `var`; on
 * failure `var` becomes NULL and the old block is not released.
 *
 * S_FREE is a single statement: it frees `n` and then resets it to NULL.
 * The do/while(0) wrapper keeps both steps together under an unbraced
 * `if` and lets the macro be followed by `else`.
 */
#define S_ALLOC_N(type,n) ((type*)malloc(sizeof(type)*(n)))
#define S_ALLOC(type) ((type*)malloc(sizeof(type)))
#define S_REALLOC_N(var,type,n) ((var)=(type*)realloc((char*)(var),sizeof(type)*(n)))
#define S_FREE(n) do { free(n); (n) = NULL; } while (0)

/* Stack allocation of n objects of the given type via alloca(). */
#define S_ALLOCA_N(type,n) ((type*)alloca(sizeof(type)*(n)))
39
+
40
+ #define S_MEMZERO(p,type,n) memset((p), 0, sizeof(type)*(n))
41
+ #define S_MEMCPY(p1,p2,type,n) memcpy((p1), (p2), sizeof(type)*(n))
42
+ #define S_MEMMOVE(p1,p2,type,n) memmove((p1), (p2), sizeof(type)*(n))
43
+ #define S_MEMCMP(p1,p2,type,n) memcmp((p1), (p2), sizeof(type)*(n))
44
+
45
+ typedef struct {
46
+ void *name;
47
+ void *attributes;
48
+ } hpricot_element;
49
+
50
+ typedef void (*hpricot_element_cb)(void *data, hpricot_element *token);
51
+
52
+ typedef struct hpricot_scan {
53
+ int lineno;
54
+ int cs;
55
+ size_t nread;
56
+ size_t mark;
57
+
58
+ void *data;
59
+
60
+ hpricot_element_cb xmldecl;
61
+ hpricot_element_cb doctype;
62
+ hpricot_element_cb xmlprocins;
63
+ hpricot_element_cb starttag;
64
+ hpricot_element_cb endtag;
65
+ hpricot_element_cb emptytag;
66
+ hpricot_element_cb comment;
67
+ hpricot_element_cb cdata;
68
+
69
+ } http_scan;
70
+
71
+ // int hpricot_scan_init(hpricot_scan *scan);
72
+ // int hpricot_scan_finish(hpricot_scan *scan);
73
+ // size_t hpricot_scan_execute(hpricot_scan *scan, const char *data, size_t len, size_t off);
74
+ // int hpricot_scan_has_error(hpricot_scan *scan);
75
+ // int hpricot_scan_is_finished(hpricot_scan *scan);
76
+ //
77
+ // #define hpricot_scan_nread(scan) (scan)->nread
78
+
79
+ #endif
@@ -0,0 +1,363 @@
1
+
2
+ import java.io.IOException;
3
+
4
+ import org.jruby.Ruby;
5
+ import org.jruby.RubyClass;
6
+ import org.jruby.RubyHash;
7
+ import org.jruby.RubyModule;
8
+ import org.jruby.RubyNumeric;
9
+ import org.jruby.RubyString;
10
+ import org.jruby.runtime.Block;
11
+ import org.jruby.runtime.CallbackFactory;
12
+ import org.jruby.runtime.builtin.IRubyObject;
13
+ import org.jruby.exceptions.RaiseException;
14
+ import org.jruby.runtime.load.BasicLibraryService;
15
+
16
+ public class HpricotScanService implements BasicLibraryService {
17
+ public static String NO_WAY_SERIOUSLY="*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!";
18
+
19
+ public void ELE(IRubyObject N) {
20
+ if (tokend > tokstart || text) {
21
+ IRubyObject raw_string = runtime.getNil();
22
+ ele_open = false; text = false;
23
+ if (tokstart != -1 && N != cdata && N != sym_text && N != procins && N != comment) {
24
+ raw_string = runtime.newString(new String(buf,tokstart,tokend-tokstart));
25
+ }
26
+ rb_yield_tokens(N, tag[0], attr, raw_string, taint);
27
+ }
28
+ }
29
+
30
+ public void SET(IRubyObject[] N, int E) {
31
+ int mark = 0;
32
+ if(N == tag) {
33
+ if(mark_tag == -1 || E == mark_tag) {
34
+ tag[0] = runtime.newString("");
35
+ } else if(E > mark_tag) {
36
+ tag[0] = runtime.newString(new String(buf,mark_tag, E-mark_tag));
37
+ }
38
+ } else if(N == akey) {
39
+ if(mark_akey == -1 || E == mark_akey) {
40
+ akey[0] = runtime.newString("");
41
+ } else if(E > mark_akey) {
42
+ akey[0] = runtime.newString(new String(buf,mark_akey, E-mark_akey));
43
+ }
44
+ } else if(N == aval) {
45
+ if(mark_aval == -1 || E == mark_aval) {
46
+ aval[0] = runtime.newString("");
47
+ } else if(E > mark_aval) {
48
+ aval[0] = runtime.newString(new String(buf,mark_aval, E-mark_aval));
49
+ }
50
+ }
51
+ }
52
+
53
+ public void CAT(IRubyObject[] N, int E) {
54
+ if(N[0].isNil()) {
55
+ SET(N,E);
56
+ } else {
57
+ int mark = 0;
58
+ if(N == tag) {
59
+ mark = mark_tag;
60
+ } else if(N == akey) {
61
+ mark = mark_akey;
62
+ } else if(N == aval) {
63
+ mark = mark_aval;
64
+ }
65
+ ((RubyString)(N[0])).append(runtime.newString(new String(buf, mark, E-mark)));
66
+ }
67
+ }
68
+
69
+ public void SLIDE(Object N) {
70
+ int mark = 0;
71
+ if(N == tag) {
72
+ mark = mark_tag;
73
+ } else if(N == akey) {
74
+ mark = mark_akey;
75
+ } else if(N == aval) {
76
+ mark = mark_aval;
77
+ }
78
+ if(mark > tokstart) {
79
+ if(N == tag) {
80
+ mark_tag -= tokstart;
81
+ } else if(N == akey) {
82
+ mark_akey -= tokstart;
83
+ } else if(N == aval) {
84
+ mark_aval -= tokstart;
85
+ }
86
+ }
87
+ }
88
+
89
+ public void ATTR(IRubyObject K, IRubyObject V) {
90
+ if(!K.isNil()) {
91
+ if(attr.isNil()) {
92
+ attr = RubyHash.newHash(runtime);
93
+ }
94
+ ((RubyHash)attr).aset(K,V);
95
+ }
96
+ }
97
+
98
+ public void ATTR(IRubyObject[] K, IRubyObject V) {
99
+ ATTR(K[0],V);
100
+ }
101
+
102
+ public void ATTR(IRubyObject K, IRubyObject[] V) {
103
+ ATTR(K,V[0]);
104
+ }
105
+
106
+ public void ATTR(IRubyObject[] K, IRubyObject[] V) {
107
+ ATTR(K[0],V[0]);
108
+ }
109
+
110
+ public void TEXT_PASS() {
111
+ if(!text) {
112
+ if(ele_open) {
113
+ ele_open = false;
114
+ if(tokstart > -1) {
115
+ mark_tag = tokstart;
116
+ }
117
+ } else {
118
+ mark_tag = p;
119
+ }
120
+ attr = runtime.getNil();
121
+ tag[0] = runtime.getNil();
122
+ text = true;
123
+ }
124
+ }
125
+
126
+ public void EBLK(IRubyObject N, int T) {
127
+ CAT(tag, p - T + 1);
128
+ ELE(N);
129
+ }
130
+
131
+
132
+ public void rb_raise(RubyClass error, String message) {
133
+ throw new RaiseException(runtime, error, message, true);
134
+ }
135
+
136
+ public IRubyObject rb_str_new2(String s) {
137
+ return runtime.newString(s);
138
+ }
139
+
140
+ %%{
141
+ machine hpricot_scan;
142
+
143
+ action newEle {
144
+ if (text) {
145
+ CAT(tag, p);
146
+ ELE(sym_text);
147
+ text = false;
148
+ }
149
+ attr = runtime.getNil();
150
+ tag[0] = runtime.getNil();
151
+ mark_tag = -1;
152
+ ele_open = true;
153
+ }
154
+
155
+ action _tag { mark_tag = p; }
156
+ action _aval { mark_aval = p; }
157
+ action _akey { mark_akey = p; }
158
+ action tag { SET(tag, p); }
159
+ action tagc { SET(tag, p-1); }
160
+ action aval { SET(aval, p); }
161
+ action aunq {
162
+ if (buf[p-1] == '"' || buf[p-1] == '\'') { SET(aval, p-1); }
163
+ else { SET(aval, p); }
164
+ }
165
+ action akey { SET(akey, p); }
166
+ action xmlver { SET(aval, p); ATTR(rb_str_new2("version"), aval); }
167
+ action xmlenc { SET(aval, p); ATTR(rb_str_new2("encoding"), aval); }
168
+ action xmlsd { SET(aval, p); ATTR(rb_str_new2("standalone"), aval); }
169
+ action pubid { SET(aval, p); ATTR(rb_str_new2("public_id"), aval); }
170
+ action sysid { SET(aval, p); ATTR(rb_str_new2("system_id"), aval); }
171
+
172
+ action new_attr {
173
+ akey[0] = runtime.getNil();
174
+ aval[0] = runtime.getNil();
175
+ mark_akey = -1;
176
+ mark_aval = -1;
177
+ }
178
+
179
+ action save_attr {
180
+ ATTR(akey, aval);
181
+ }
182
+
183
+ include hpricot_common "ext/hpricot_scan/hpricot_common.rl";
184
+
185
+ }%%
186
+
187
+ %% write data nofinal;
188
+
189
+ public final static int BUFSIZE=16384;
190
+
191
+ private void rb_yield_tokens(IRubyObject sym, IRubyObject tag, IRubyObject attr, IRubyObject raw, boolean taint) {
192
+ IRubyObject ary;
193
+ if (sym == runtime.newSymbol("text")) {
194
+ raw = tag;
195
+ }
196
+ ary = runtime.newArray(new IRubyObject[]{sym, tag, attr, raw});
197
+ if (taint) {
198
+ ary.setTaint(true);
199
+ tag.setTaint(true);
200
+ attr.setTaint(true);
201
+ raw.setTaint(true);
202
+ }
203
+ block.yield(runtime.getCurrentContext(), ary, null, null, false);
204
+ }
205
+
206
+
207
+ int cs, act, have = 0, nread = 0, curline = 1, p=-1;
208
+ boolean text = false;
209
+ int tokstart=-1, tokend;
210
+ char[] buf;
211
+ Ruby runtime;
212
+ IRubyObject attr, bufsize;
213
+ IRubyObject[] tag, akey, aval;
214
+ int mark_tag, mark_akey, mark_aval;
215
+ boolean done = false, ele_open = false;
216
+ int buffer_size = 0;
217
+ boolean taint = false;
218
+ Block block = null;
219
+
220
+
221
+ IRubyObject xmldecl, doctype, procins, stag, etag, emptytag, comment,
222
+ cdata, sym_text;
223
+
224
+ IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
225
+ attr = bufsize = runtime.getNil();
226
+ tag = new IRubyObject[]{runtime.getNil()};
227
+ akey = new IRubyObject[]{runtime.getNil()};
228
+ aval = new IRubyObject[]{runtime.getNil()};
229
+
230
+ RubyClass rb_eHpricotParseError = runtime.getModule("Hpricot").getClass("ParseError");
231
+
232
+ taint = port.isTaint();
233
+ if ( !port.respondsTo("read")) {
234
+ if ( port.respondsTo("to_str")) {
235
+ port = port.callMethod(runtime.getCurrentContext(),"to_str");
236
+ } else {
237
+ throw runtime.newArgumentError("bad Hpricot argument, String or IO only please.");
238
+ }
239
+ }
240
+
241
+ buffer_size = BUFSIZE;
242
+ if (recv.getInstanceVariable("@buffer_size") != null) {
243
+ bufsize = recv.getInstanceVariable("@buffer_size");
244
+ if (!bufsize.isNil()) {
245
+ buffer_size = RubyNumeric.fix2int(bufsize);
246
+ }
247
+ }
248
+ buf = new char[buffer_size];
249
+
250
+ %% write init;
251
+
252
+ while( !done ) {
253
+ IRubyObject str;
254
+ p = have;
255
+ int pe;
256
+ int len, space = buffer_size - have;
257
+
258
+ if ( space == 0 ) {
259
+ /* We've used up the entire buffer storing an already-parsed token
260
+ * prefix that must be preserved. Likely caused by super-long attributes.
261
+ * See ticket #13. */
262
+ rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <" + tag.toString() + ">, starting on line "+curline+".");
263
+ }
264
+
265
+ if (port.respondsTo("read")) {
266
+ str = port.callMethod(runtime.getCurrentContext(),"read",runtime.newFixnum(space));
267
+ } else {
268
+ str = ((RubyString)port).substr(nread,space);
269
+ }
270
+
271
+ str = str.convertToString();
272
+ String sss = str.toString();
273
+ char[] chars = sss.toCharArray();
274
+ System.arraycopy(chars,0,buf,p,chars.length);
275
+
276
+ len = sss.length();
277
+ nread += len;
278
+
279
+ if ( len < space ) {
280
+ len++;
281
+ done = true;
282
+ }
283
+
284
+ pe = p + len;
285
+ char[] data = buf;
286
+
287
+ %% write exec;
288
+
289
+ if ( cs == hpricot_scan_error ) {
290
+ if(!tag[0].isNil()) {
291
+ rb_raise(rb_eHpricotParseError, "parse error on element <"+tag.toString()+">, starting on line "+curline+".\n" + NO_WAY_SERIOUSLY);
292
+ } else {
293
+ rb_raise(rb_eHpricotParseError, "parse error on line "+curline+".\n" + NO_WAY_SERIOUSLY);
294
+ }
295
+ }
296
+
297
+ if ( done && ele_open ) {
298
+ ele_open = false;
299
+ if(tokstart > -1) {
300
+ mark_tag = tokstart;
301
+ tokstart = -1;
302
+ text = true;
303
+ }
304
+ }
305
+
306
+ if(tokstart == -1) {
307
+ have = 0;
308
+ /* text nodes have no tokstart because each byte is parsed alone */
309
+ if(mark_tag != -1 && text) {
310
+ if (done) {
311
+ if(mark_tag < p-1) {
312
+ CAT(tag, p-1);
313
+ ELE(sym_text);
314
+ }
315
+ } else {
316
+ CAT(tag, p);
317
+ }
318
+ }
319
+ mark_tag = 0;
320
+ } else {
321
+ have = pe - tokstart;
322
+ System.arraycopy(buf,tokstart,buf,0,have);
323
+ SLIDE(tag);
324
+ SLIDE(akey);
325
+ SLIDE(aval);
326
+ tokend = (tokend - tokstart);
327
+ tokstart = 0;
328
+ }
329
+ }
330
+ return runtime.getNil();
331
+ }
332
+
333
+ public static IRubyObject __hpricot_scan(IRubyObject recv, IRubyObject port, Block block) {
334
+ Ruby runtime = recv.getRuntime();
335
+ HpricotScanService service = new HpricotScanService();
336
+ service.runtime = runtime;
337
+ service.xmldecl = runtime.newSymbol("xmldecl");
338
+ service.doctype = runtime.newSymbol("doctype");
339
+ service.procins = runtime.newSymbol("procins");
340
+ service.stag = runtime.newSymbol("stag");
341
+ service.etag = runtime.newSymbol("etag");
342
+ service.emptytag = runtime.newSymbol("emptytag");
343
+ service.comment = runtime.newSymbol("comment");
344
+ service.cdata = runtime.newSymbol("cdata");
345
+ service.sym_text = runtime.newSymbol("text");
346
+ service.block = block;
347
+ return service.hpricot_scan(recv, port);
348
+ }
349
+
350
+
351
+ public boolean basicLoad(final Ruby runtime) throws IOException {
352
+ Init_hpricot_scan(runtime);
353
+ return true;
354
+ }
355
+
356
+ public static void Init_hpricot_scan(Ruby runtime) {
357
+ RubyModule mHpricot = runtime.defineModule("Hpricot");
358
+ mHpricot.getMetaClass().attr_accessor(new IRubyObject[]{runtime.newSymbol("buffer_size")});
359
+ CallbackFactory fact = runtime.callbackFactory(HpricotScanService.class);
360
+ mHpricot.getMetaClass().defineMethod("scan",fact.getSingletonMethod("__hpricot_scan",IRubyObject.class));
361
+ mHpricot.defineClassUnder("ParseError",runtime.getClass("Exception"),runtime.getClass("Exception").getAllocator());
362
+ }
363
+ }
@@ -0,0 +1,273 @@
1
+ /*
2
+ * hpricot_scan.rl
3
+ *
4
+ * $Author: why $
5
+ * $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
6
+ *
7
+ * Copyright (C) 2006 why the lucky stiff
8
+ */
9
+ #include <ruby.h>
10
+
11
+ #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
12
+
13
+ static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
14
+ sym_cdata, sym_text;
15
+ static VALUE rb_eHpricotParseError;
16
+ static ID s_read, s_to_str;
17
+
18
+ #define ELE(N) \
19
+ if (tokend > tokstart || text == 1) { \
20
+ VALUE raw_string = Qnil; \
21
+ ele_open = 0; text = 0; \
22
+ if (tokstart != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
23
+ raw_string = rb_str_new(tokstart, tokend-tokstart); \
24
+ } \
25
+ rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
26
+ }
27
+
28
+ #define SET(N, E) \
29
+ if (mark_##N == NULL || E == mark_##N) \
30
+ N = rb_str_new2(""); \
31
+ else if (E > mark_##N) \
32
+ N = rb_str_new(mark_##N, E - mark_##N);
33
+
34
+ #define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
35
+
36
+ #define SLIDE(N) if ( mark_##N > tokstart ) mark_##N = buf + (mark_##N - tokstart);
37
+
38
+ #define ATTR(K, V) \
39
+ if (!NIL_P(K)) { \
40
+ if (NIL_P(attr)) attr = rb_hash_new(); \
41
+ rb_hash_aset(attr, K, V); \
42
+ }
43
+
44
+ #define TEXT_PASS() \
45
+ if (text == 0) \
46
+ { \
47
+ if (ele_open == 1) { \
48
+ ele_open = 0; \
49
+ if (tokstart > 0) { \
50
+ mark_tag = tokstart; \
51
+ } \
52
+ } else { \
53
+ mark_tag = p; \
54
+ } \
55
+ attr = Qnil; \
56
+ tag = Qnil; \
57
+ text = 1; \
58
+ }
59
+
60
+ #define EBLK(N, T) CAT(tag, p - T + 1); ELE(N);
61
+
62
+ %%{
63
+ machine hpricot_scan;
64
+
65
+ action newEle {
66
+ if (text == 1) {
67
+ CAT(tag, p);
68
+ ELE(text);
69
+ text = 0;
70
+ }
71
+ attr = Qnil;
72
+ tag = Qnil;
73
+ mark_tag = NULL;
74
+ ele_open = 1;
75
+ }
76
+
77
+ action _tag { mark_tag = p; }
78
+ action _aval { mark_aval = p; }
79
+ action _akey { mark_akey = p; }
80
+ action tag { SET(tag, p); }
81
+ action tagc { SET(tag, p-1); }
82
+ action aval { SET(aval, p); }
83
+ action aunq {
84
+ if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
85
+ else { SET(aval, p); }
86
+ }
87
+ action akey { SET(akey, p); }
88
+ action xmlver { SET(aval, p); ATTR(rb_str_new2("version"), aval); }
89
+ action xmlenc { SET(aval, p); ATTR(rb_str_new2("encoding"), aval); }
90
+ action xmlsd { SET(aval, p); ATTR(rb_str_new2("standalone"), aval); }
91
+ action pubid { SET(aval, p); ATTR(rb_str_new2("public_id"), aval); }
92
+ action sysid { SET(aval, p); ATTR(rb_str_new2("system_id"), aval); }
93
+
94
+ action new_attr {
95
+ akey = Qnil;
96
+ aval = Qnil;
97
+ mark_akey = NULL;
98
+ mark_aval = NULL;
99
+ }
100
+
101
+ action save_attr {
102
+ ATTR(akey, aval);
103
+ }
104
+
105
+ include hpricot_common "ext/hpricot_scan/hpricot_common.rl";
106
+
107
+ }%%
108
+
109
+ %% write data nofinal;
110
+
111
+ #define BUFSIZE 16384
112
+
113
+ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
114
+ {
115
+ VALUE ary;
116
+ if (sym == sym_text) {
117
+ raw = tag;
118
+ }
119
+ ary = rb_ary_new3(4, sym, tag, attr, raw);
120
+ if (taint) {
121
+ OBJ_TAINT(ary);
122
+ OBJ_TAINT(tag);
123
+ OBJ_TAINT(attr);
124
+ OBJ_TAINT(raw);
125
+ }
126
+ rb_yield(ary);
127
+ }
128
+
129
+ VALUE hpricot_scan(VALUE self, VALUE port)
130
+ {
131
+ int cs, act, have = 0, nread = 0, curline = 1, text = 0;
132
+ char *tokstart = 0, *tokend = 0, *buf = NULL;
133
+
134
+ VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
135
+ char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
136
+ int done = 0, ele_open = 0, buffer_size = 0;
137
+
138
+ int taint = OBJ_TAINTED( port );
139
+ if ( !rb_respond_to( port, s_read ) )
140
+ {
141
+ if ( rb_respond_to( port, s_to_str ) )
142
+ {
143
+ port = rb_funcall( port, s_to_str, 0 );
144
+ StringValue(port);
145
+ }
146
+ else
147
+ {
148
+ rb_raise( rb_eArgError, "bad Hpricot argument, String or IO only please." );
149
+ }
150
+ }
151
+
152
+ buffer_size = BUFSIZE;
153
+ if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
154
+ bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
155
+ if (!NIL_P(bufsize)) {
156
+ buffer_size = NUM2INT(bufsize);
157
+ }
158
+ }
159
+ buf = ALLOC_N(char, buffer_size);
160
+
161
+ %% write init;
162
+
163
+ while ( !done ) {
164
+ VALUE str;
165
+ char *p = buf + have, *pe;
166
+ int len, space = buffer_size - have;
167
+
168
+ if ( space == 0 ) {
169
+ /* We've used up the entire buffer storing an already-parsed token
170
+ * prefix that must be preserved. Likely caused by super-long attributes.
171
+ * See ticket #13. */
172
+ rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING(tag)->ptr, curline);
173
+ }
174
+
175
+ if ( rb_respond_to( port, s_read ) )
176
+ {
177
+ str = rb_funcall( port, s_read, 1, INT2FIX(space) );
178
+ }
179
+ else
180
+ {
181
+ str = rb_str_substr( port, nread, space );
182
+ }
183
+
184
+ StringValue(str);
185
+ memcpy( p, RSTRING(str)->ptr, RSTRING(str)->len );
186
+ len = RSTRING(str)->len;
187
+ nread += len;
188
+
189
+ /* If this is the last buffer, tack on an EOF. */
190
+ if ( len < space ) {
191
+ p[len++] = 0;
192
+ done = 1;
193
+ }
194
+
195
+ pe = p + len;
196
+ %% write exec;
197
+
198
+ if ( cs == hpricot_scan_error ) {
199
+ free(buf);
200
+ if ( !NIL_P(tag) )
201
+ {
202
+ rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING(tag)->ptr, curline);
203
+ }
204
+ else
205
+ {
206
+ rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
207
+ }
208
+ }
209
+
210
+ if ( done && ele_open )
211
+ {
212
+ ele_open = 0;
213
+ if (tokstart > 0) {
214
+ mark_tag = tokstart;
215
+ tokstart = 0;
216
+ text = 1;
217
+ }
218
+ }
219
+
220
+ if ( tokstart == 0 )
221
+ {
222
+ have = 0;
223
+ /* text nodes have no tokstart because each byte is parsed alone */
224
+ if ( mark_tag != NULL && text == 1 )
225
+ {
226
+ if (done)
227
+ {
228
+ if (mark_tag < p-1)
229
+ {
230
+ CAT(tag, p-1);
231
+ ELE(text);
232
+ }
233
+ }
234
+ else
235
+ {
236
+ CAT(tag, p);
237
+ }
238
+ }
239
+ mark_tag = buf;
240
+ }
241
+ else
242
+ {
243
+ have = pe - tokstart;
244
+ memmove( buf, tokstart, have );
245
+ SLIDE(tag);
246
+ SLIDE(akey);
247
+ SLIDE(aval);
248
+ tokend = buf + (tokend - tokstart);
249
+ tokstart = buf;
250
+ }
251
+ }
252
+ free(buf);
253
+ }
254
+
255
+ void Init_hpricot_scan()
256
+ {
257
+ VALUE mHpricot = rb_define_module("Hpricot");
258
+ rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
259
+ rb_define_singleton_method(mHpricot, "scan", hpricot_scan, 1);
260
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eException);
261
+
262
+ s_read = rb_intern("read");
263
+ s_to_str = rb_intern("to_str");
264
+ sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
265
+ sym_doctype = ID2SYM(rb_intern("doctype"));
266
+ sym_procins = ID2SYM(rb_intern("procins"));
267
+ sym_stag = ID2SYM(rb_intern("stag"));
268
+ sym_etag = ID2SYM(rb_intern("etag"));
269
+ sym_emptytag = ID2SYM(rb_intern("emptytag"));
270
+ sym_comment = ID2SYM(rb_intern("comment"));
271
+ sym_cdata = ID2SYM(rb_intern("cdata"));
272
+ sym_text = ID2SYM(rb_intern("text"));
273
+ }