hpricot 0.7-x86-mswin32

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/CHANGELOG +68 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +260 -0
  5. data/ext/fast_xs/FastXsService.java +1018 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +200 -0
  8. data/ext/hpricot_scan/HpricotScanService.java +1305 -0
  9. data/ext/hpricot_scan/extconf.rb +6 -0
  10. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  11. data/ext/hpricot_scan/hpricot_css.c +3502 -0
  12. data/ext/hpricot_scan/hpricot_css.rl +115 -0
  13. data/ext/hpricot_scan/hpricot_scan.c +6704 -0
  14. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  15. data/ext/hpricot_scan/hpricot_scan.java.rl +373 -0
  16. data/ext/hpricot_scan/hpricot_scan.rl +722 -0
  17. data/ext/hpricot_scan/test.rb +4 -0
  18. data/extras/mingw-rbconfig.rb +176 -0
  19. data/lib/fast_xs.so +0 -0
  20. data/lib/hpricot.rb +26 -0
  21. data/lib/hpricot/blankslate.rb +63 -0
  22. data/lib/hpricot/builder.rb +216 -0
  23. data/lib/hpricot/elements.rb +510 -0
  24. data/lib/hpricot/htmlinfo.rb +691 -0
  25. data/lib/hpricot/inspect.rb +103 -0
  26. data/lib/hpricot/modules.rb +38 -0
  27. data/lib/hpricot/parse.rb +38 -0
  28. data/lib/hpricot/tag.rb +198 -0
  29. data/lib/hpricot/tags.rb +164 -0
  30. data/lib/hpricot/traverse.rb +838 -0
  31. data/lib/hpricot/xchar.rb +94 -0
  32. data/lib/hpricot_scan.so +0 -0
  33. data/test/files/basic.xhtml +17 -0
  34. data/test/files/boingboing.html +2266 -0
  35. data/test/files/cy0.html +3653 -0
  36. data/test/files/immob.html +400 -0
  37. data/test/files/pace_application.html +1320 -0
  38. data/test/files/tenderlove.html +16 -0
  39. data/test/files/uswebgen.html +220 -0
  40. data/test/files/utf8.html +1054 -0
  41. data/test/files/week9.html +1723 -0
  42. data/test/files/why.xml +19 -0
  43. data/test/load_files.rb +7 -0
  44. data/test/nokogiri-bench.rb +64 -0
  45. data/test/test_alter.rb +77 -0
  46. data/test/test_builder.rb +37 -0
  47. data/test/test_parser.rb +409 -0
  48. data/test/test_paths.rb +25 -0
  49. data/test/test_preserved.rb +70 -0
  50. data/test/test_xml.rb +28 -0
  51. metadata +111 -0
@@ -0,0 +1,79 @@
1
+ /*
2
+ * hpricot_scan.h
3
+ *
4
+ * $Author: why $
5
+ * $Date: 2006-05-08 22:03:50 -0600 (Mon, 08 May 2006) $
6
+ *
7
+ * Copyright (C) 2006 why the lucky stiff
8
+ * You can redistribute it and/or modify it under the same terms as Ruby.
9
+ */
10
+
11
+ #ifndef hpricot_scan_h
12
+ #define hpricot_scan_h
13
+
14
+ #include <sys/types.h>
15
+
16
+ #if defined(_WIN32)
17
+ #include <stddef.h>
18
+ #endif
19
+
20
+ /*
21
+ * Memory Allocation
22
+ */
23
+ #if defined(HAVE_ALLOCA_H) && !defined(__GNUC__)
24
+ #include <alloca.h>
25
+ #endif
26
+
27
+ #ifndef NULL
28
+ # define NULL (void *)0
29
+ #endif
30
+
31
/* Buffer size constant (presumably the default scan buffer size -- confirm
 * against the scanner source that includes this header). */
#define BUFSIZE 16384

/*
 * Allocation helpers.
 *
 * Every expression-like expansion is wrapped in parentheses so that it can be
 * embedded in a larger expression without operator-precedence surprises.
 * The including file is expected to provide malloc/realloc/free, the mem*
 * functions and (for S_ALLOCA_N) alloca.
 */
#define S_ALLOC_N(type,n) ((type*)malloc(sizeof(type)*(n)))
#define S_ALLOC(type) ((type*)malloc(sizeof(type)))
#define S_REALLOC_N(var,type,n) ((var)=(type*)realloc((char*)(var),sizeof(type)*(n)))
/*
 * Frees n and resets it to NULL. Wrapped in do/while(0) so the macro is a
 * single statement and stays correct inside an unbraced if/else; invoke it
 * with a trailing semicolon: S_FREE(ptr);
 */
#define S_FREE(n) do { free(n); (n) = NULL; } while (0)

#define S_ALLOCA_N(type,n) ((type*)alloca(sizeof(type)*(n)))

/* Typed wrappers around the mem* functions; n is an element count, not bytes. */
#define S_MEMZERO(p,type,n) memset((p), 0, sizeof(type)*(n))
#define S_MEMCPY(p1,p2,type,n) memcpy((p1), (p2), sizeof(type)*(n))
#define S_MEMMOVE(p1,p2,type,n) memmove((p1), (p2), sizeof(type)*(n))
#define S_MEMCMP(p1,p2,type,n) memcmp((p1), (p2), sizeof(type)*(n))
44
+
45
/* Element passed to the scanner callbacks: two opaque pointers, one for the
 * element name and one for its attributes. */
typedef struct {
  void *name;
  void *attributes;
} hpricot_element;

/* Callback signature shared by every event slot in the scanner struct below. */
typedef void (*hpricot_element_cb)(void *data, hpricot_element *token);

/*
 * Scanner state plus one callback slot per event kind.
 *
 * The type was originally exported only as `http_scan`, although the struct
 * tag and the (commented-out) prototypes in this header all say
 * `hpricot_scan`. Both typedef names are provided: `hpricot_scan` is the
 * intended one, `http_scan` is kept so existing users still compile.
 */
typedef struct hpricot_scan {
  int lineno;
  int cs;
  size_t nread;
  size_t mark;

  void *data;

  hpricot_element_cb xmldecl;
  hpricot_element_cb doctype;
  hpricot_element_cb xmlprocins;
  hpricot_element_cb starttag;
  hpricot_element_cb endtag;
  hpricot_element_cb emptytag;
  hpricot_element_cb comment;
  hpricot_element_cb cdata;

} hpricot_scan, http_scan;
70
+
71
+ // int hpricot_scan_init(hpricot_scan *scan);
72
+ // int hpricot_scan_finish(hpricot_scan *scan);
73
+ // size_t hpricot_scan_execute(hpricot_scan *scan, const char *data, size_t len, size_t off);
74
+ // int hpricot_scan_has_error(hpricot_scan *scan);
75
+ // int hpricot_scan_is_finished(hpricot_scan *scan);
76
+ //
77
+ // #define hpricot_scan_nread(scan) (scan)->nread
78
+
79
+ #endif
@@ -0,0 +1,373 @@
1
+
2
+ import java.io.IOException;
3
+
4
+ import org.jruby.Ruby;
5
+ import org.jruby.RubyClass;
6
+ import org.jruby.RubyHash;
7
+ import org.jruby.RubyModule;
8
+ import org.jruby.RubyNumeric;
9
+ import org.jruby.RubyObjectAdapter;
10
+ import org.jruby.RubyString;
11
+ import org.jruby.javasupport.JavaEmbedUtils;
12
+ import org.jruby.runtime.Block;
13
+ import org.jruby.runtime.CallbackFactory;
14
+ import org.jruby.runtime.builtin.IRubyObject;
15
+ import org.jruby.exceptions.RaiseException;
16
+ import org.jruby.runtime.load.BasicLibraryService;
17
+
18
+ public class HpricotScanService implements BasicLibraryService {
19
// Bug-report request appended to every ParseError message built in hpricot_scan().
+ public static String NO_WAY_SERIOUSLY="*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!";
20
// Object adapter assigned in Init_hpricot_scan(); hpricot_scan() uses it to read the receiver's @buffer_size.
+ private static RubyObjectAdapter rubyApi;
21
+
22
+ public void ELE(IRubyObject N) {
23
+ if (te > ts || text) {
24
+ IRubyObject raw_string = runtime.getNil();
25
+ ele_open = false; text = false;
26
+ if (ts != -1 && N != cdata && N != sym_text && N != procins && N != comment) {
27
+ raw_string = runtime.newString(new String(buf,ts,te-ts));
28
+ }
29
+ rb_yield_tokens(N, tag[0], attr, raw_string, taint);
30
+ }
31
+ }
32
+
33
+ public void SET(IRubyObject[] N, int E) {
34
+ int mark = 0;
35
+ if(N == tag) {
36
+ if(mark_tag == -1 || E == mark_tag) {
37
+ tag[0] = runtime.newString("");
38
+ } else if(E > mark_tag) {
39
+ tag[0] = runtime.newString(new String(buf,mark_tag, E-mark_tag));
40
+ }
41
+ } else if(N == akey) {
42
+ if(mark_akey == -1 || E == mark_akey) {
43
+ akey[0] = runtime.newString("");
44
+ } else if(E > mark_akey) {
45
+ akey[0] = runtime.newString(new String(buf,mark_akey, E-mark_akey));
46
+ }
47
+ } else if(N == aval) {
48
+ if(mark_aval == -1 || E == mark_aval) {
49
+ aval[0] = runtime.newString("");
50
+ } else if(E > mark_aval) {
51
+ aval[0] = runtime.newString(new String(buf,mark_aval, E-mark_aval));
52
+ }
53
+ }
54
+ }
55
+
56
+ public void CAT(IRubyObject[] N, int E) {
57
+ if(N[0].isNil()) {
58
+ SET(N,E);
59
+ } else {
60
+ int mark = 0;
61
+ if(N == tag) {
62
+ mark = mark_tag;
63
+ } else if(N == akey) {
64
+ mark = mark_akey;
65
+ } else if(N == aval) {
66
+ mark = mark_aval;
67
+ }
68
+ ((RubyString)(N[0])).append(runtime.newString(new String(buf, mark, E-mark)));
69
+ }
70
+ }
71
+
72
+ public void SLIDE(Object N) {
73
+ int mark = 0;
74
+ if(N == tag) {
75
+ mark = mark_tag;
76
+ } else if(N == akey) {
77
+ mark = mark_akey;
78
+ } else if(N == aval) {
79
+ mark = mark_aval;
80
+ }
81
+ if(mark > ts) {
82
+ if(N == tag) {
83
+ mark_tag -= ts;
84
+ } else if(N == akey) {
85
+ mark_akey -= ts;
86
+ } else if(N == aval) {
87
+ mark_aval -= ts;
88
+ }
89
+ }
90
+ }
91
+
92
+ public void ATTR(IRubyObject K, IRubyObject V) {
93
+ if(!K.isNil()) {
94
+ if(attr.isNil()) {
95
+ attr = RubyHash.newHash(runtime);
96
+ }
97
+ ((RubyHash)attr).op_aset(runtime.getCurrentContext(),K,V);
98
+ // ((RubyHash)attr).aset(K,V);
99
+ }
100
+ }
101
+
102
+ public void ATTR(IRubyObject[] K, IRubyObject V) {
103
+ ATTR(K[0],V);
104
+ }
105
+
106
+ public void ATTR(IRubyObject K, IRubyObject[] V) {
107
+ ATTR(K,V[0]);
108
+ }
109
+
110
+ public void ATTR(IRubyObject[] K, IRubyObject[] V) {
111
+ ATTR(K[0],V[0]);
112
+ }
113
+
114
+ public void TEXT_PASS() {
115
+ if(!text) {
116
+ if(ele_open) {
117
+ ele_open = false;
118
+ if(ts > -1) {
119
+ mark_tag = ts;
120
+ }
121
+ } else {
122
+ mark_tag = p;
123
+ }
124
+ attr = runtime.getNil();
125
+ tag[0] = runtime.getNil();
126
+ text = true;
127
+ }
128
+ }
129
+
130
+ public void EBLK(IRubyObject N, int T) {
131
+ CAT(tag, p - T + 1);
132
+ ELE(N);
133
+ }
134
+
135
+
136
+ public void rb_raise(RubyClass error, String message) {
137
+ throw new RaiseException(runtime, error, message, true);
138
+ }
139
+
140
+ public IRubyObject rb_str_new2(String s) {
141
+ return runtime.newString(s);
142
+ }
143
+
144
+ %%{
# Ragel machine specification: the actions below are the Java glue invoked by
# the grammar that is pulled in from hpricot_common.rl at the end of this block.
145
+ machine hpricot_scan;
146
+
147
# newEle: flush any pending text run as a text token, then reset the tag and
# attribute state and mark a new element as open.
+ action newEle {
148
+ if (text) {
149
+ CAT(tag, p);
150
+ ELE(sym_text);
151
+ text = false;
152
+ }
153
+ attr = runtime.getNil();
154
+ tag[0] = runtime.getNil();
155
+ mark_tag = -1;
156
+ ele_open = true;
157
+ }
158
+
159
# _tag/_aval/_akey record the current position as the start mark of a slot;
# tag/aval/akey capture the slice from that mark up to the current position.
+ action _tag { mark_tag = p; }
160
+ action _aval { mark_aval = p; }
161
+ action _akey { mark_akey = p; }
162
+ action tag { SET(tag, p); }
163
# tagc: like tag, but the slice ends one character before the current position.
+ action tagc { SET(tag, p-1); }
164
+ action aval { SET(aval, p); }
165
# aunq: capture an attribute value, leaving out a trailing quote character
# when the previous character is a double or single quote.
+ action aunq {
166
+ if (buf[p-1] == '"' || buf[p-1] == '\'') { SET(aval, p-1); }
167
+ else { SET(aval, p); }
168
+ }
169
+ action akey { SET(akey, p); }
170
# The following actions capture a value and store it under a fixed attribute name.
+ action xmlver { SET(aval, p); ATTR(rb_str_new2("version"), aval); }
171
+ action xmlenc { SET(aval, p); ATTR(rb_str_new2("encoding"), aval); }
172
+ action xmlsd { SET(aval, p); ATTR(rb_str_new2("standalone"), aval); }
173
+ action pubid { SET(aval, p); ATTR(rb_str_new2("public_id"), aval); }
174
+ action sysid { SET(aval, p); ATTR(rb_str_new2("system_id"), aval); }
175
+
176
# new_attr: clear the key/value slots and their marks before a new attribute.
+ action new_attr {
177
+ akey[0] = runtime.getNil();
178
+ aval[0] = runtime.getNil();
179
+ mark_akey = -1;
180
+ mark_aval = -1;
181
+ }
182
+
183
# save_attr: store the captured key/value pair in the attribute hash.
+ action save_attr {
184
+ ATTR(akey, aval);
185
+ }
186
+
187
+ include hpricot_common "hpricot_common.rl";
188
+
189
+ }%%
190
+
191
// Ragel directive: the generated state-machine data is emitted at this point.
+ %% write data nofinal;
192
+
193
// Default read-buffer size; hpricot_scan() also grows a full buffer by this amount.
+ public final static int BUFSIZE=16384;
194
+
195
+ private void rb_yield_tokens(IRubyObject sym, IRubyObject tag, IRubyObject attr, IRubyObject raw, boolean taint) {
196
+ IRubyObject ary;
197
+ if (sym == runtime.newSymbol("text")) {
198
+ raw = tag;
199
+ }
200
+ ary = runtime.newArray(new IRubyObject[]{sym, tag, attr, raw});
201
+ if (taint) {
202
+ ary.setTaint(true);
203
+ tag.setTaint(true);
204
+ attr.setTaint(true);
205
+ raw.setTaint(true);
206
+ }
207
+ block.yield(runtime.getCurrentContext(), ary, null, null, false);
208
+ }
209
+
210
+
211
// Scanner state. cs is compared against hpricot_scan_error after each run of the
// machine; have is the number of chars carried over at the start of buf; nread
// counts chars consumed from the port; curline is reported in parse errors; p is
// the current position in buf. (act is presumably used by the generated code.)
+ int cs, act, have = 0, nread = 0, curline = 1, p=-1;
212
// True while a text run is being accumulated.
+ boolean text = false;
213
// Start/end of the current token in buf; ts == -1 means no token is in progress.
+ int ts=-1, te;
214
+ int eof=-1;
215
// Read buffer, allocated (and grown) in hpricot_scan().
+ char[] buf;
216
+ Ruby runtime;
217
// attr: attribute hash of the current element (nil until the first attribute).
+ IRubyObject attr, bufsize;
218
// One-element holders so helpers can identify a slot by array identity and update it.
+ IRubyObject[] tag, akey, aval;
219
// Start positions in buf of the slices for the slots above; -1 means unset.
+ int mark_tag, mark_akey, mark_aval;
220
+ boolean done = false, ele_open = false;
221
+ int buffer_size = 0;
222
// Copied from the input port; when set, yielded tokens are tainted.
+ boolean taint = false;
223
// Block that receives every token.
+ Block block = null;
224
+
225
+
226
// Token-kind symbols, assigned in __hpricot_scan().
+ IRubyObject xmldecl, doctype, procins, stag, etag, emptytag, comment,
227
+ cdata, sym_text;
228
+
229
+ IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
230
+ attr = bufsize = runtime.getNil();
231
+ tag = new IRubyObject[]{runtime.getNil()};
232
+ akey = new IRubyObject[]{runtime.getNil()};
233
+ aval = new IRubyObject[]{runtime.getNil()};
234
+
235
+ RubyClass rb_eHpricotParseError = runtime.getModule("Hpricot").getClass("ParseError");
236
+
237
+ taint = port.isTaint();
238
+ if ( !port.respondsTo("read")) {
239
+ if ( port.respondsTo("to_str")) {
240
+ port = port.callMethod(runtime.getCurrentContext(),"to_str");
241
+ } else {
242
+ throw runtime.newArgumentError("bad Hpricot argument, String or IO only please.");
243
+ }
244
+ }
245
+
246
+ buffer_size = BUFSIZE;
247
+ if (rubyApi.getInstanceVariable(recv, "@buffer_size") != null) {
248
+ bufsize = rubyApi.getInstanceVariable(recv, "@buffer_size");
249
+ if (!bufsize.isNil()) {
250
+ buffer_size = RubyNumeric.fix2int(bufsize);
251
+ }
252
+ }
253
+ buf = new char[buffer_size];
254
+
255
+ %% write init;
256
+
257
+ while( !done ) {
258
+ IRubyObject str;
259
+ p = have;
260
+ int pe;
261
+ int len, space = buffer_size - have;
262
+
263
+ if ( space == 0 ) {
264
+ /* We've used up the entire buffer storing an already-parsed token
265
+ * prefix that must be preserved. Likely caused by super-long attributes.
266
+ * See ticket #13. */
267
+ buffer_size += BUFSIZE;
268
+ char[] new_buf = new char[buffer_size];
269
+ System.arraycopy(buf, 0, new_buf, 0, buf.length);
270
+ buf = new_buf;
271
+ space = buffer_size - have;
272
+ }
273
+
274
+ if (port.respondsTo("read")) {
275
+ str = port.callMethod(runtime.getCurrentContext(),"read",runtime.newFixnum(space));
276
+ } else {
277
+ str = ((RubyString)port).substr(nread,space);
278
+ }
279
+
280
+ str = str.convertToString();
281
+ String sss = str.toString();
282
+ char[] chars = sss.toCharArray();
283
+ System.arraycopy(chars,0,buf,p,chars.length);
284
+
285
+ len = sss.length();
286
+ nread += len;
287
+
288
+ if ( len < space ) {
289
+ len++;
290
+ done = true;
291
+ }
292
+
293
+ pe = p + len;
294
+ char[] data = buf;
295
+
296
+ %% write exec;
297
+
298
+ if ( cs == hpricot_scan_error ) {
299
+ if(!tag[0].isNil()) {
300
+ rb_raise(rb_eHpricotParseError, "parse error on element <"+tag.toString()+">, starting on line "+curline+".\n" + NO_WAY_SERIOUSLY);
301
+ } else {
302
+ rb_raise(rb_eHpricotParseError, "parse error on line "+curline+".\n" + NO_WAY_SERIOUSLY);
303
+ }
304
+ }
305
+
306
+ if ( done && ele_open ) {
307
+ ele_open = false;
308
+ if(ts > -1) {
309
+ mark_tag = ts;
310
+ ts = -1;
311
+ text = true;
312
+ }
313
+ }
314
+
315
+ if(ts == -1) {
316
+ have = 0;
317
+ /* text nodes have no ts because each byte is parsed alone */
318
+ if(mark_tag != -1 && text) {
319
+ if (done) {
320
+ if(mark_tag < p-1) {
321
+ CAT(tag, p-1);
322
+ ELE(sym_text);
323
+ }
324
+ } else {
325
+ CAT(tag, p);
326
+ }
327
+ }
328
+ mark_tag = 0;
329
+ } else {
330
+ have = pe - ts;
331
+ System.arraycopy(buf,ts,buf,0,have);
332
+ SLIDE(tag);
333
+ SLIDE(akey);
334
+ SLIDE(aval);
335
+ te = (te - ts);
336
+ ts = 0;
337
+ }
338
+ }
339
+ return runtime.getNil();
340
+ }
341
+
342
+ public static IRubyObject __hpricot_scan(IRubyObject recv, IRubyObject port, Block block) {
343
+ Ruby runtime = recv.getRuntime();
344
+ HpricotScanService service = new HpricotScanService();
345
+ service.runtime = runtime;
346
+ service.xmldecl = runtime.newSymbol("xmldecl");
347
+ service.doctype = runtime.newSymbol("doctype");
348
+ service.procins = runtime.newSymbol("procins");
349
+ service.stag = runtime.newSymbol("stag");
350
+ service.etag = runtime.newSymbol("etag");
351
+ service.emptytag = runtime.newSymbol("emptytag");
352
+ service.comment = runtime.newSymbol("comment");
353
+ service.cdata = runtime.newSymbol("cdata");
354
+ service.sym_text = runtime.newSymbol("text");
355
+ service.block = block;
356
+ return service.hpricot_scan(recv, port);
357
+ }
358
+
359
+
360
+ public boolean basicLoad(final Ruby runtime) throws IOException {
361
+ Init_hpricot_scan(runtime);
362
+ return true;
363
+ }
364
+
365
+ public static void Init_hpricot_scan(Ruby runtime) {
366
+ RubyModule mHpricot = runtime.defineModule("Hpricot");
367
+ mHpricot.getMetaClass().attr_accessor(runtime.getCurrentContext(),new IRubyObject[]{runtime.newSymbol("buffer_size")});
368
+ CallbackFactory fact = runtime.callbackFactory(HpricotScanService.class);
369
+ mHpricot.getMetaClass().defineMethod("scan",fact.getSingletonMethod("__hpricot_scan",IRubyObject.class));
370
+ mHpricot.defineClassUnder("ParseError",runtime.getClass("StandardError"),runtime.getClass("StandardError").getAllocator());
371
+ rubyApi = JavaEmbedUtils.newObjectAdapter();
372
+ }
373
+ }