hpricot 0.6-jruby
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +62 -0
- data/COPYING +18 -0
- data/README +284 -0
- data/Rakefile +211 -0
- data/ext/hpricot_scan/HpricotScanService.java +1340 -0
- data/ext/hpricot_scan/extconf.rb +6 -0
- data/ext/hpricot_scan/hpricot_common.rl +76 -0
- data/ext/hpricot_scan/hpricot_scan.c +5976 -0
- data/ext/hpricot_scan/hpricot_scan.h +79 -0
- data/ext/hpricot_scan/hpricot_scan.java.rl +363 -0
- data/ext/hpricot_scan/hpricot_scan.rl +273 -0
- data/extras/mingw-rbconfig.rb +176 -0
- data/lib/hpricot.rb +26 -0
- data/lib/hpricot/blankslate.rb +63 -0
- data/lib/hpricot/builder.rb +200 -0
- data/lib/hpricot/elements.rb +510 -0
- data/lib/hpricot/htmlinfo.rb +672 -0
- data/lib/hpricot/inspect.rb +107 -0
- data/lib/hpricot/modules.rb +37 -0
- data/lib/hpricot/parse.rb +297 -0
- data/lib/hpricot/tag.rb +228 -0
- data/lib/hpricot/tags.rb +164 -0
- data/lib/hpricot/traverse.rb +821 -0
- data/lib/hpricot/xchar.rb +94 -0
- data/lib/i686-linux/hpricot_scan.jar +0 -0
- data/test/files/basic.xhtml +17 -0
- data/test/files/boingboing.html +2266 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/immob.html +400 -0
- data/test/files/pace_application.html +1320 -0
- data/test/files/tenderlove.html +16 -0
- data/test/files/uswebgen.html +220 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/files/why.xml +19 -0
- data/test/load_files.rb +7 -0
- data/test/test_alter.rb +65 -0
- data/test/test_builder.rb +24 -0
- data/test/test_parser.rb +379 -0
- data/test/test_paths.rb +16 -0
- data/test/test_preserved.rb +66 -0
- data/test/test_xml.rb +28 -0
- metadata +98 -0
@@ -0,0 +1,79 @@
|
|
1
|
+
/*
|
2
|
+
* hpricot_scan.h
|
3
|
+
*
|
4
|
+
* $Author: why $
|
5
|
+
* $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
|
6
|
+
*
|
7
|
+
* Copyright (C) 2006 why the lucky stiff
|
8
|
+
* You can redistribute it and/or modify it under the same terms as Ruby.
|
9
|
+
*/
|
10
|
+
|
11
|
+
#ifndef hpricot_scan_h
|
12
|
+
#define hpricot_scan_h
|
13
|
+
|
14
|
+
#include <sys/types.h>
|
15
|
+
|
16
|
+
#if defined(_WIN32)
|
17
|
+
#include <stddef.h>
|
18
|
+
#endif
|
19
|
+
|
20
|
+
/*
|
21
|
+
* Memory Allocation
|
22
|
+
*/
|
23
|
+
#if defined(HAVE_ALLOCA_H) && !defined(__GNUC__)
|
24
|
+
#include <alloca.h>
|
25
|
+
#endif
|
26
|
+
|
27
|
+
#ifndef NULL
# define NULL ((void *)0)
#endif

/* Default size, in bytes, of a scan buffer. */
#define BUFSIZE 16384

/*
 * Typed wrappers around the C allocator.  Every expansion is fully
 * parenthesised so it can be used inside a larger expression.
 */
#define S_ALLOC_N(type,n) ((type*)malloc(sizeof(type)*(n)))
#define S_ALLOC(type) ((type*)malloc(sizeof(type)))
/*
 * Resizes `var` in place to hold `n` elements of `type`.  As in the original
 * macro the result is assigned straight back to `var`, so on failure `var`
 * becomes NULL and the old allocation is no longer reachable through it.
 */
#define S_REALLOC_N(var,type,n) ((var)=(type*)realloc((char*)(var),sizeof(type)*(n)))
/*
 * Frees `n` and clears it.  Wrapped in do/while(0) so both statements stay
 * together when the macro is the body of an unbraced if/else.
 */
#define S_FREE(n) do { free(n); (n) = NULL; } while (0)

/* Stack allocation of `n` elements of `type`; released when the caller returns. */
#define S_ALLOCA_N(type,n) ((type*)alloca(sizeof(type)*(n)))

/* Typed memory helpers: the byte count is always sizeof(type) * n. */
#define S_MEMZERO(p,type,n) memset((p), 0, sizeof(type)*(n))
#define S_MEMCPY(p1,p2,type,n) memcpy((p1), (p2), sizeof(type)*(n))
#define S_MEMMOVE(p1,p2,type,n) memmove((p1), (p2), sizeof(type)*(n))
#define S_MEMCMP(p1,p2,type,n) memcmp((p1), (p2), sizeof(type)*(n))
|
44
|
+
|
45
|
+
/* One scanned element: two opaque pointers supplied by the scanner. */
typedef struct {
  void *name;
  void *attributes;
} hpricot_element;

/* Callback signature: receives the scanner's user data and the element. */
typedef void (*hpricot_element_cb)(void *data, hpricot_element *token);

/*
 * Scanner state plus one callback slot per token kind.
 * Nothing in this header reads or invokes these members.
 */
struct hpricot_scan {
  int lineno;
  int cs;
  size_t nread;
  size_t mark;

  /* Opaque pointer; matches the first parameter of hpricot_element_cb. */
  void *data;

  hpricot_element_cb xmldecl;
  hpricot_element_cb doctype;
  hpricot_element_cb xmlprocins;
  hpricot_element_cb starttag;
  hpricot_element_cb endtag;
  hpricot_element_cb emptytag;
  hpricot_element_cb comment;
  hpricot_element_cb cdata;
};

/* The public typedef name is http_scan (the struct tag is hpricot_scan). */
typedef struct hpricot_scan http_scan;
|
70
|
+
|
71
|
+
// int hpricot_scan_init(hpricot_scan *scan);
|
72
|
+
// int hpricot_scan_finish(hpricot_scan *scan);
|
73
|
+
// size_t hpricot_scan_execute(hpricot_scan *scan, const char *data, size_t len, size_t off);
|
74
|
+
// int hpricot_scan_has_error(hpricot_scan *scan);
|
75
|
+
// int hpricot_scan_is_finished(hpricot_scan *scan);
|
76
|
+
//
|
77
|
+
// #define hpricot_scan_nread(scan) (scan)->nread
|
78
|
+
|
79
|
+
#endif
|
@@ -0,0 +1,363 @@
|
|
1
|
+
|
2
|
+
import java.io.IOException;
|
3
|
+
|
4
|
+
import org.jruby.Ruby;
|
5
|
+
import org.jruby.RubyClass;
|
6
|
+
import org.jruby.RubyHash;
|
7
|
+
import org.jruby.RubyModule;
|
8
|
+
import org.jruby.RubyNumeric;
|
9
|
+
import org.jruby.RubyString;
|
10
|
+
import org.jruby.runtime.Block;
|
11
|
+
import org.jruby.runtime.CallbackFactory;
|
12
|
+
import org.jruby.runtime.builtin.IRubyObject;
|
13
|
+
import org.jruby.exceptions.RaiseException;
|
14
|
+
import org.jruby.runtime.load.BasicLibraryService;
|
15
|
+
|
16
|
+
public class HpricotScanService implements BasicLibraryService {
|
17
|
+
// Suffix appended to the parse-error messages raised by hpricot_scan().
public static String NO_WAY_SERIOUSLY="*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!";
|
18
|
+
|
19
|
+
public void ELE(IRubyObject N) {
|
20
|
+
if (tokend > tokstart || text) {
|
21
|
+
IRubyObject raw_string = runtime.getNil();
|
22
|
+
ele_open = false; text = false;
|
23
|
+
if (tokstart != -1 && N != cdata && N != sym_text && N != procins && N != comment) {
|
24
|
+
raw_string = runtime.newString(new String(buf,tokstart,tokend-tokstart));
|
25
|
+
}
|
26
|
+
rb_yield_tokens(N, tag[0], attr, raw_string, taint);
|
27
|
+
}
|
28
|
+
}
|
29
|
+
|
30
|
+
public void SET(IRubyObject[] N, int E) {
|
31
|
+
int mark = 0;
|
32
|
+
if(N == tag) {
|
33
|
+
if(mark_tag == -1 || E == mark_tag) {
|
34
|
+
tag[0] = runtime.newString("");
|
35
|
+
} else if(E > mark_tag) {
|
36
|
+
tag[0] = runtime.newString(new String(buf,mark_tag, E-mark_tag));
|
37
|
+
}
|
38
|
+
} else if(N == akey) {
|
39
|
+
if(mark_akey == -1 || E == mark_akey) {
|
40
|
+
akey[0] = runtime.newString("");
|
41
|
+
} else if(E > mark_akey) {
|
42
|
+
akey[0] = runtime.newString(new String(buf,mark_akey, E-mark_akey));
|
43
|
+
}
|
44
|
+
} else if(N == aval) {
|
45
|
+
if(mark_aval == -1 || E == mark_aval) {
|
46
|
+
aval[0] = runtime.newString("");
|
47
|
+
} else if(E > mark_aval) {
|
48
|
+
aval[0] = runtime.newString(new String(buf,mark_aval, E-mark_aval));
|
49
|
+
}
|
50
|
+
}
|
51
|
+
}
|
52
|
+
|
53
|
+
public void CAT(IRubyObject[] N, int E) {
|
54
|
+
if(N[0].isNil()) {
|
55
|
+
SET(N,E);
|
56
|
+
} else {
|
57
|
+
int mark = 0;
|
58
|
+
if(N == tag) {
|
59
|
+
mark = mark_tag;
|
60
|
+
} else if(N == akey) {
|
61
|
+
mark = mark_akey;
|
62
|
+
} else if(N == aval) {
|
63
|
+
mark = mark_aval;
|
64
|
+
}
|
65
|
+
((RubyString)(N[0])).append(runtime.newString(new String(buf, mark, E-mark)));
|
66
|
+
}
|
67
|
+
}
|
68
|
+
|
69
|
+
public void SLIDE(Object N) {
|
70
|
+
int mark = 0;
|
71
|
+
if(N == tag) {
|
72
|
+
mark = mark_tag;
|
73
|
+
} else if(N == akey) {
|
74
|
+
mark = mark_akey;
|
75
|
+
} else if(N == aval) {
|
76
|
+
mark = mark_aval;
|
77
|
+
}
|
78
|
+
if(mark > tokstart) {
|
79
|
+
if(N == tag) {
|
80
|
+
mark_tag -= tokstart;
|
81
|
+
} else if(N == akey) {
|
82
|
+
mark_akey -= tokstart;
|
83
|
+
} else if(N == aval) {
|
84
|
+
mark_aval -= tokstart;
|
85
|
+
}
|
86
|
+
}
|
87
|
+
}
|
88
|
+
|
89
|
+
public void ATTR(IRubyObject K, IRubyObject V) {
|
90
|
+
if(!K.isNil()) {
|
91
|
+
if(attr.isNil()) {
|
92
|
+
attr = RubyHash.newHash(runtime);
|
93
|
+
}
|
94
|
+
((RubyHash)attr).aset(K,V);
|
95
|
+
}
|
96
|
+
}
|
97
|
+
|
98
|
+
public void ATTR(IRubyObject[] K, IRubyObject V) {
|
99
|
+
ATTR(K[0],V);
|
100
|
+
}
|
101
|
+
|
102
|
+
public void ATTR(IRubyObject K, IRubyObject[] V) {
|
103
|
+
ATTR(K,V[0]);
|
104
|
+
}
|
105
|
+
|
106
|
+
public void ATTR(IRubyObject[] K, IRubyObject[] V) {
|
107
|
+
ATTR(K[0],V[0]);
|
108
|
+
}
|
109
|
+
|
110
|
+
public void TEXT_PASS() {
|
111
|
+
if(!text) {
|
112
|
+
if(ele_open) {
|
113
|
+
ele_open = false;
|
114
|
+
if(tokstart > -1) {
|
115
|
+
mark_tag = tokstart;
|
116
|
+
}
|
117
|
+
} else {
|
118
|
+
mark_tag = p;
|
119
|
+
}
|
120
|
+
attr = runtime.getNil();
|
121
|
+
tag[0] = runtime.getNil();
|
122
|
+
text = true;
|
123
|
+
}
|
124
|
+
}
|
125
|
+
|
126
|
+
public void EBLK(IRubyObject N, int T) {
|
127
|
+
CAT(tag, p - T + 1);
|
128
|
+
ELE(N);
|
129
|
+
}
|
130
|
+
|
131
|
+
|
132
|
+
public void rb_raise(RubyClass error, String message) {
|
133
|
+
throw new RaiseException(runtime, error, message, true);
|
134
|
+
}
|
135
|
+
|
136
|
+
public IRubyObject rb_str_new2(String s) {
|
137
|
+
return runtime.newString(s);
|
138
|
+
}
|
139
|
+
|
140
|
+
%%{
  machine hpricot_scan;

  # A new element begins: flush any pending text token, then reset the
  # per-element state and note that an element is open.
  action newEle {
    if (text) {
      CAT(tag, p);
      ELE(sym_text);
      text = false;
    }
    attr = runtime.getNil();
    tag[0] = runtime.getNil();
    mark_tag = -1;
    ele_open = true;
  }

  # Mark actions: remember the current position as the start of a span.
  action _tag  { mark_tag = p; }
  action _aval { mark_aval = p; }
  action _akey { mark_akey = p; }

  # Capture actions: store the span from the matching mark up to p
  # (tagc stops one character earlier).
  action tag  { SET(tag, p); }
  action tagc { SET(tag, p-1); }
  action aval { SET(aval, p); }

  # Attribute value: exclude the previous character when it is a quote.
  action aunq {
    if (buf[p-1] == '"' || buf[p-1] == '\'') { SET(aval, p-1); }
    else { SET(aval, p); }
  }
  action akey { SET(akey, p); }

  # Capture a value and store it under a fixed attribute name.
  action xmlver { SET(aval, p); ATTR(rb_str_new2("version"), aval); }
  action xmlenc { SET(aval, p); ATTR(rb_str_new2("encoding"), aval); }
  action xmlsd  { SET(aval, p); ATTR(rb_str_new2("standalone"), aval); }
  action pubid  { SET(aval, p); ATTR(rb_str_new2("public_id"), aval); }
  action sysid  { SET(aval, p); ATTR(rb_str_new2("system_id"), aval); }

  # Reset key/value state before scanning the next attribute.
  action new_attr {
    akey[0] = runtime.getNil();
    aval[0] = runtime.getNil();
    mark_akey = -1;
    mark_aval = -1;
  }

  # Commit the scanned key/value pair to the attribute hash.
  action save_attr {
    ATTR(akey, aval);
  }

  include hpricot_common "ext/hpricot_scan/hpricot_common.rl";

}%%

%% write data nofinal;
|
188
|
+
|
189
|
+
// Default scan buffer length; hpricot_scan() uses it unless @buffer_size is set on the receiver.
public final static int BUFSIZE=16384;
|
190
|
+
|
191
|
+
private void rb_yield_tokens(IRubyObject sym, IRubyObject tag, IRubyObject attr, IRubyObject raw, boolean taint) {
|
192
|
+
IRubyObject ary;
|
193
|
+
if (sym == runtime.newSymbol("text")) {
|
194
|
+
raw = tag;
|
195
|
+
}
|
196
|
+
ary = runtime.newArray(new IRubyObject[]{sym, tag, attr, raw});
|
197
|
+
if (taint) {
|
198
|
+
ary.setTaint(true);
|
199
|
+
tag.setTaint(true);
|
200
|
+
attr.setTaint(true);
|
201
|
+
raw.setTaint(true);
|
202
|
+
}
|
203
|
+
block.yield(runtime.getCurrentContext(), ary, null, null, false);
|
204
|
+
}
|
205
|
+
|
206
|
+
|
207
|
+
int cs, act, have = 0, nread = 0, curline = 1, p=-1;
|
208
|
+
boolean text = false;
|
209
|
+
int tokstart=-1, tokend;
|
210
|
+
char[] buf;
|
211
|
+
Ruby runtime;
|
212
|
+
IRubyObject attr, bufsize;
|
213
|
+
IRubyObject[] tag, akey, aval;
|
214
|
+
int mark_tag, mark_akey, mark_aval;
|
215
|
+
boolean done = false, ele_open = false;
|
216
|
+
int buffer_size = 0;
|
217
|
+
boolean taint = false;
|
218
|
+
Block block = null;
|
219
|
+
|
220
|
+
|
221
|
+
IRubyObject xmldecl, doctype, procins, stag, etag, emptytag, comment,
|
222
|
+
cdata, sym_text;
|
223
|
+
|
224
|
+
/**
 * Scans `port` and yields one token array per element to the block.
 *
 * `port` is read in chunks through its `read` method when it has one;
 * otherwise it is converted with `to_str` and sliced.  The buffer length
 * is BUFSIZE unless the receiver has a non-nil @buffer_size.
 *
 * @param recv receiver of the Ruby call (consulted for @buffer_size)
 * @param port IO-like or String-like input
 * @return nil
 * Raises ArgumentError when port is neither readable nor string-like, and
 * Hpricot::ParseError on a scanner error or when a single token does not
 * fit in the buffer.
 */
IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
  attr = bufsize = runtime.getNil();
  tag = new IRubyObject[]{runtime.getNil()};
  akey = new IRubyObject[]{runtime.getNil()};
  aval = new IRubyObject[]{runtime.getNil()};

  RubyClass rb_eHpricotParseError = runtime.getModule("Hpricot").getClass("ParseError");

  taint = port.isTaint();
  if ( !port.respondsTo("read")) {
    if ( port.respondsTo("to_str")) {
      port = port.callMethod(runtime.getCurrentContext(),"to_str");
    } else {
      throw runtime.newArgumentError("bad Hpricot argument, String or IO only please.");
    }
  }

  buffer_size = BUFSIZE;
  if (recv.getInstanceVariable("@buffer_size") != null) {
    bufsize = recv.getInstanceVariable("@buffer_size");
    if (!bufsize.isNil()) {
      buffer_size = RubyNumeric.fix2int(bufsize);
    }
  }
  buf = new char[buffer_size];

  %% write init;

  while( !done ) {
    IRubyObject str;
    p = have;
    int pe;
    int len, space = buffer_size - have;

    if ( space == 0 ) {
      /* We've used up the entire buffer storing an already-parsed token
       * prefix that must be preserved.  Likely caused by super-long attributes.
       * See ticket #13. */
      /* tag is a one-element holder: report its content, not the array. */
      rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <" + tag[0].toString() + ">, starting on line "+curline+".");
    }

    if (port.respondsTo("read")) {
      str = port.callMethod(runtime.getCurrentContext(),"read",runtime.newFixnum(space));
    } else {
      str = ((RubyString)port).substr(nread,space);
    }

    /* IO#read(n) answers nil at end of input; treat that as an empty chunk
     * so the scan finishes instead of failing the string conversion. */
    if (str.isNil()) {
      str = runtime.newString("");
    }

    str = str.convertToString();
    String sss = str.toString();
    char[] chars = sss.toCharArray();
    System.arraycopy(chars,0,buf,p,chars.length);

    len = sss.length();
    nread += len;

    /* If this is the last buffer, tack on an EOF.  The slot must be cleared
     * explicitly: after an earlier pass it can still hold old input. */
    if ( len < space ) {
      buf[p + len] = 0;
      len++;
      done = true;
    }

    pe = p + len;
    char[] data = buf;

    %% write exec;

    if ( cs == hpricot_scan_error ) {
      if(!tag[0].isNil()) {
        rb_raise(rb_eHpricotParseError, "parse error on element <"+tag[0].toString()+">, starting on line "+curline+".\n" + NO_WAY_SERIOUSLY);
      } else {
        rb_raise(rb_eHpricotParseError, "parse error on line "+curline+".\n" + NO_WAY_SERIOUSLY);
      }
    }

    if ( done && ele_open ) {
      /* Input ended inside an element: hand what was scanned over as text. */
      ele_open = false;
      if(tokstart > -1) {
        mark_tag = tokstart;
        tokstart = -1;
        text = true;
      }
    }

    if(tokstart == -1) {
      have = 0;
      /* text nodes have no tokstart because each byte is parsed alone */
      if(mark_tag != -1 && text) {
        if (done) {
          if(mark_tag < p-1) {
            CAT(tag, p-1);
            ELE(sym_text);
          }
        } else {
          CAT(tag, p);
        }
      }
      mark_tag = 0;
    } else {
      /* Keep the unfinished token: move it to the front of buf and rebase
       * every position that pointed into it. */
      have = pe - tokstart;
      System.arraycopy(buf,tokstart,buf,0,have);
      SLIDE(tag);
      SLIDE(akey);
      SLIDE(aval);
      tokend = (tokend - tokstart);
      tokstart = 0;
    }
  }
  return runtime.getNil();
}
|
332
|
+
|
333
|
+
public static IRubyObject __hpricot_scan(IRubyObject recv, IRubyObject port, Block block) {
|
334
|
+
Ruby runtime = recv.getRuntime();
|
335
|
+
HpricotScanService service = new HpricotScanService();
|
336
|
+
service.runtime = runtime;
|
337
|
+
service.xmldecl = runtime.newSymbol("xmldecl");
|
338
|
+
service.doctype = runtime.newSymbol("doctype");
|
339
|
+
service.procins = runtime.newSymbol("procins");
|
340
|
+
service.stag = runtime.newSymbol("stag");
|
341
|
+
service.etag = runtime.newSymbol("etag");
|
342
|
+
service.emptytag = runtime.newSymbol("emptytag");
|
343
|
+
service.comment = runtime.newSymbol("comment");
|
344
|
+
service.cdata = runtime.newSymbol("cdata");
|
345
|
+
service.sym_text = runtime.newSymbol("text");
|
346
|
+
service.block = block;
|
347
|
+
return service.hpricot_scan(recv, port);
|
348
|
+
}
|
349
|
+
|
350
|
+
|
351
|
+
public boolean basicLoad(final Ruby runtime) throws IOException {
|
352
|
+
Init_hpricot_scan(runtime);
|
353
|
+
return true;
|
354
|
+
}
|
355
|
+
|
356
|
+
public static void Init_hpricot_scan(Ruby runtime) {
|
357
|
+
RubyModule mHpricot = runtime.defineModule("Hpricot");
|
358
|
+
mHpricot.getMetaClass().attr_accessor(new IRubyObject[]{runtime.newSymbol("buffer_size")});
|
359
|
+
CallbackFactory fact = runtime.callbackFactory(HpricotScanService.class);
|
360
|
+
mHpricot.getMetaClass().defineMethod("scan",fact.getSingletonMethod("__hpricot_scan",IRubyObject.class));
|
361
|
+
mHpricot.defineClassUnder("ParseError",runtime.getClass("Exception"),runtime.getClass("Exception").getAllocator());
|
362
|
+
}
|
363
|
+
}
|
@@ -0,0 +1,273 @@
|
|
1
|
+
/*
|
2
|
+
* hpricot_scan.rl
|
3
|
+
*
|
4
|
+
* $Author: why $
|
5
|
+
* $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
|
6
|
+
*
|
7
|
+
* Copyright (C) 2006 why the lucky stiff
|
8
|
+
*/
|
9
|
+
#include <ruby.h>
|
10
|
+
|
11
|
+
#define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
|
12
|
+
|
13
|
+
static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
|
14
|
+
sym_cdata, sym_text;
|
15
|
+
static VALUE rb_eHpricotParseError;
|
16
|
+
static ID s_read, s_to_str;
|
17
|
+
|
18
|
+
/*
 * Scanner helper macros.  They expand inside hpricot_scan() and use its
 * locals by name (tokstart, tokend, text, ele_open, tag, attr, taint, buf,
 * p and the mark_* pointers).  Each one is wrapped in do { } while (0) so
 * that it behaves as a single statement; invoke them with a trailing
 * semicolon.
 */

/* Yield the current token as kind sym_<N>, if there is anything to yield. */
#define ELE(N) \
  do { \
    if (tokend > tokstart || text == 1) { \
      VALUE raw_string = Qnil; \
      ele_open = 0; text = 0; \
      if (tokstart != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
        raw_string = rb_str_new(tokstart, tokend-tokstart); \
      } \
      rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
    } \
  } while (0)

/* Store in N the text between mark_<N> and E; "" when unset or empty. */
#define SET(N, E) \
  do { \
    if (mark_##N == NULL || (E) == mark_##N) \
      N = rb_str_new2(""); \
    else if ((E) > mark_##N) \
      N = rb_str_new(mark_##N, (E) - mark_##N); \
  } while (0)

/* Append the text between mark_<N> and E to N; same as SET when N is nil. */
#define CAT(N, E) \
  do { \
    if (NIL_P(N)) { SET(N, E); } \
    else { rb_str_cat(N, mark_##N, (E) - mark_##N); } \
  } while (0)

/* Rebase mark_<N> after the unfinished token was moved to the start of buf. */
#define SLIDE(N) \
  do { \
    if ( mark_##N > tokstart ) mark_##N = buf + (mark_##N - tokstart); \
  } while (0)

/* Record K => V in attr, creating the hash on first use.  K is evaluated
 * once, so passing an allocating expression builds only one key object. */
#define ATTR(K, V) \
  do { \
    VALUE attr_key_ = (K); \
    if (!NIL_P(attr_key_)) { \
      if (NIL_P(attr)) attr = rb_hash_new(); \
      rb_hash_aset(attr, attr_key_, (V)); \
    } \
  } while (0)

/* Enter text mode unless already in it, choosing where the text starts. */
#define TEXT_PASS() \
  do { \
    if (text == 0) \
    { \
      if (ele_open == 1) { \
        ele_open = 0; \
        if (tokstart > 0) { \
          mark_tag = tokstart; \
        } \
      } else { \
        mark_tag = p; \
      } \
      attr = Qnil; \
      tag = Qnil; \
      text = 1; \
    } \
  } while (0)

/* Finish a block token: append text up to (p - T + 1), then yield it. */
#define EBLK(N, T) \
  do { \
    CAT(tag, p - (T) + 1); \
    ELE(N); \
  } while (0)
|
61
|
+
|
62
|
+
%%{
  machine hpricot_scan;

  # A new element begins: flush any pending text token, then reset the
  # per-element state and note that an element is open.
  action newEle {
    if (text == 1) {
      CAT(tag, p);
      ELE(text);
      text = 0;
    }
    attr = Qnil;
    tag = Qnil;
    mark_tag = NULL;
    ele_open = 1;
  }

  # Mark actions: remember the current position as the start of a span.
  action _tag  { mark_tag = p; }
  action _aval { mark_aval = p; }
  action _akey { mark_akey = p; }

  # Capture actions: store the span from the matching mark up to p
  # (tagc stops one character earlier).
  action tag  { SET(tag, p); }
  action tagc { SET(tag, p-1); }
  action aval { SET(aval, p); }

  # Attribute value: exclude the previous character when it is a quote.
  action aunq {
    if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
    else { SET(aval, p); }
  }
  action akey { SET(akey, p); }

  # Capture a value and store it under a fixed attribute name.
  action xmlver { SET(aval, p); ATTR(rb_str_new2("version"), aval); }
  action xmlenc { SET(aval, p); ATTR(rb_str_new2("encoding"), aval); }
  action xmlsd  { SET(aval, p); ATTR(rb_str_new2("standalone"), aval); }
  action pubid  { SET(aval, p); ATTR(rb_str_new2("public_id"), aval); }
  action sysid  { SET(aval, p); ATTR(rb_str_new2("system_id"), aval); }

  # Reset key/value state before scanning the next attribute.
  action new_attr {
    akey = Qnil;
    aval = Qnil;
    mark_akey = NULL;
    mark_aval = NULL;
  }

  # Commit the scanned key/value pair to the attribute hash.
  action save_attr {
    ATTR(akey, aval);
  }

  include hpricot_common "ext/hpricot_scan/hpricot_common.rl";

}%%

%% write data nofinal;
|
110
|
+
|
111
|
+
/* Default scan buffer size; hpricot_scan() uses it unless @buffer_size is set on the receiver. */
#define BUFSIZE 16384
|
112
|
+
|
113
|
+
/*
 * Yields one token to the current block as the array [sym, tag, attr, raw].
 * For text tokens the raw slot carries the tag value itself.  When `taint`
 * is non-zero the array and its tag, attr and raw members are tainted.
 */
void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
{
  VALUE token;

  if (sym == sym_text)
    raw = tag;

  token = rb_ary_new3(4, sym, tag, attr, raw);

  if (taint != 0) {
    OBJ_TAINT(token);
    OBJ_TAINT(tag);
    OBJ_TAINT(attr);
    OBJ_TAINT(raw);
  }

  rb_yield(token);
}
|
128
|
+
|
129
|
+
/*
 * Hpricot.scan(port) { |token| ... }
 *
 * Scans `port` and yields one token array per element to the block.
 * `port` is read in chunks through its `read` method when it has one;
 * otherwise it is converted with `to_str` and sliced.  The buffer size is
 * BUFSIZE unless the receiver has a non-nil @buffer_size.
 *
 * Returns nil.  Raises ArgumentError when port is neither readable nor
 * string-like, and Hpricot::ParseError on a scanner error, when a single
 * token does not fit in the buffer, or when `read` hands back more data
 * than was asked for.
 *
 * NOTE(review): buf is released on the error paths below and on normal
 * return, but not if the yielded block raises or breaks out of the scan.
 */
VALUE hpricot_scan(VALUE self, VALUE port)
{
  int cs, act, have = 0, nread = 0, curline = 1, text = 0;
  char *tokstart = 0, *tokend = 0, *buf = NULL;

  VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
  char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
  int done = 0, ele_open = 0, buffer_size = 0;

  int taint = OBJ_TAINTED( port );
  if ( !rb_respond_to( port, s_read ) )
  {
    if ( rb_respond_to( port, s_to_str ) )
    {
      port = rb_funcall( port, s_to_str, 0 );
      StringValue(port);
    }
    else
    {
      rb_raise( rb_eArgError, "bad Hpricot argument, String or IO only please." );
    }
  }

  buffer_size = BUFSIZE;
  if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
    bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
    if (!NIL_P(bufsize)) {
      buffer_size = NUM2INT(bufsize);
    }
  }
  buf = ALLOC_N(char, buffer_size);

  %% write init;

  while ( !done ) {
    VALUE str;
    char *p = buf + have, *pe;
    int len, space = buffer_size - have;

    if ( space == 0 ) {
      /* We've used up the entire buffer storing an already-parsed token
       * prefix that must be preserved.  Likely caused by super-long attributes.
       * See ticket #13. */
      /* rb_raise does not return, so release the buffer first; tag may
       * still be nil here and must not be dereferenced. */
      free(buf);
      if ( !NIL_P(tag) )
      {
        rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING(tag)->ptr, curline);
      }
      else
      {
        rb_raise(rb_eHpricotParseError, "ran out of buffer space on line %d.", curline);
      }
    }

    if ( rb_respond_to( port, s_read ) )
    {
      str = rb_funcall( port, s_read, 1, INT2FIX(space) );
    }
    else
    {
      str = rb_str_substr( port, nread, space );
    }

    /* IO#read(n) answers nil at end of input; treat that as an empty chunk
     * so the scan finishes instead of failing the string conversion. */
    if ( NIL_P(str) )
    {
      str = rb_str_new2("");
    }

    StringValue(str);
    len = RSTRING(str)->len;

    /* A `read` that ignores the requested size must not overrun buf. */
    if ( len > space ) {
      free(buf);
      rb_raise(rb_eHpricotParseError, "read returned %d bytes but only %d were requested.", len, space);
    }

    memcpy( p, RSTRING(str)->ptr, len );
    nread += len;

    /* If this is the last buffer, tack on an EOF. */
    if ( len < space ) {
      p[len++] = 0;
      done = 1;
    }

    pe = p + len;
    %% write exec;

    if ( cs == hpricot_scan_error ) {
      free(buf);
      if ( !NIL_P(tag) )
      {
        rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING(tag)->ptr, curline);
      }
      else
      {
        rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
      }
    }

    if ( done && ele_open )
    {
      /* Input ended inside an element: hand what was scanned over as text. */
      ele_open = 0;
      if (tokstart > 0) {
        mark_tag = tokstart;
        tokstart = 0;
        text = 1;
      }
    }

    if ( tokstart == 0 )
    {
      have = 0;
      /* text nodes have no tokstart because each byte is parsed alone */
      if ( mark_tag != NULL && text == 1 )
      {
        if (done)
        {
          if (mark_tag < p-1)
          {
            CAT(tag, p-1);
            ELE(text);
          }
        }
        else
        {
          CAT(tag, p);
        }
      }
      mark_tag = buf;
    }
    else
    {
      /* Keep the unfinished token: move it to the front of buf and rebase
       * every pointer that referred to it. */
      have = pe - tokstart;
      memmove( buf, tokstart, have );
      SLIDE(tag);
      SLIDE(akey);
      SLIDE(aval);
      tokend = buf + (tokend - tokstart);
      tokstart = buf;
    }
  }
  free(buf);

  /* The function is registered as a Ruby method and must return a VALUE. */
  return Qnil;
}
|
254
|
+
|
255
|
+
void Init_hpricot_scan()
|
256
|
+
{
|
257
|
+
VALUE mHpricot = rb_define_module("Hpricot");
|
258
|
+
rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
|
259
|
+
rb_define_singleton_method(mHpricot, "scan", hpricot_scan, 1);
|
260
|
+
rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eException);
|
261
|
+
|
262
|
+
s_read = rb_intern("read");
|
263
|
+
s_to_str = rb_intern("to_str");
|
264
|
+
sym_xmldecl = ID2SYM(rb_intern("xmldecl"));
|
265
|
+
sym_doctype = ID2SYM(rb_intern("doctype"));
|
266
|
+
sym_procins = ID2SYM(rb_intern("procins"));
|
267
|
+
sym_stag = ID2SYM(rb_intern("stag"));
|
268
|
+
sym_etag = ID2SYM(rb_intern("etag"));
|
269
|
+
sym_emptytag = ID2SYM(rb_intern("emptytag"));
|
270
|
+
sym_comment = ID2SYM(rb_intern("comment"));
|
271
|
+
sym_cdata = ID2SYM(rb_intern("cdata"));
|
272
|
+
sym_text = ID2SYM(rb_intern("text"));
|
273
|
+
}
|