hpricot 0.5 → 0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,363 @@
1
+
2
+ import java.io.IOException;
3
+
4
+ import org.jruby.Ruby;
5
+ import org.jruby.RubyClass;
6
+ import org.jruby.RubyHash;
7
+ import org.jruby.RubyModule;
8
+ import org.jruby.RubyNumeric;
9
+ import org.jruby.RubyString;
10
+ import org.jruby.runtime.Block;
11
+ import org.jruby.runtime.CallbackFactory;
12
+ import org.jruby.runtime.builtin.IRubyObject;
13
+ import org.jruby.exceptions.RaiseException;
14
+ import org.jruby.runtime.load.BasicLibraryService;
15
+
16
+ public class HpricotScanService implements BasicLibraryService {
17
+ public static String NO_WAY_SERIOUSLY="*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!";
18
+
19
+ public void ELE(IRubyObject N) {
20
+ if (tokend > tokstart || text) {
21
+ IRubyObject raw_string = runtime.getNil();
22
+ ele_open = false; text = false;
23
+ if (tokstart != -1 && N != cdata && N != sym_text && N != procins && N != comment) {
24
+ raw_string = runtime.newString(new String(buf,tokstart,tokend-tokstart));
25
+ }
26
+ rb_yield_tokens(N, tag[0], attr, raw_string, taint);
27
+ }
28
+ }
29
+
30
+ public void SET(IRubyObject[] N, int E) {
31
+ int mark = 0;
32
+ if(N == tag) {
33
+ if(mark_tag == -1 || E == mark_tag) {
34
+ tag[0] = runtime.newString("");
35
+ } else if(E > mark_tag) {
36
+ tag[0] = runtime.newString(new String(buf,mark_tag, E-mark_tag));
37
+ }
38
+ } else if(N == akey) {
39
+ if(mark_akey == -1 || E == mark_akey) {
40
+ akey[0] = runtime.newString("");
41
+ } else if(E > mark_akey) {
42
+ akey[0] = runtime.newString(new String(buf,mark_akey, E-mark_akey));
43
+ }
44
+ } else if(N == aval) {
45
+ if(mark_aval == -1 || E == mark_aval) {
46
+ aval[0] = runtime.newString("");
47
+ } else if(E > mark_aval) {
48
+ aval[0] = runtime.newString(new String(buf,mark_aval, E-mark_aval));
49
+ }
50
+ }
51
+ }
52
+
53
+ public void CAT(IRubyObject[] N, int E) {
54
+ if(N[0].isNil()) {
55
+ SET(N,E);
56
+ } else {
57
+ int mark = 0;
58
+ if(N == tag) {
59
+ mark = mark_tag;
60
+ } else if(N == akey) {
61
+ mark = mark_akey;
62
+ } else if(N == aval) {
63
+ mark = mark_aval;
64
+ }
65
+ ((RubyString)(N[0])).append(runtime.newString(new String(buf, mark, E-mark)));
66
+ }
67
+ }
68
+
69
+ public void SLIDE(Object N) {
70
+ int mark = 0;
71
+ if(N == tag) {
72
+ mark = mark_tag;
73
+ } else if(N == akey) {
74
+ mark = mark_akey;
75
+ } else if(N == aval) {
76
+ mark = mark_aval;
77
+ }
78
+ if(mark > tokstart) {
79
+ if(N == tag) {
80
+ mark_tag -= tokstart;
81
+ } else if(N == akey) {
82
+ mark_akey -= tokstart;
83
+ } else if(N == aval) {
84
+ mark_aval -= tokstart;
85
+ }
86
+ }
87
+ }
88
+
89
+ public void ATTR(IRubyObject K, IRubyObject V) {
90
+ if(!K.isNil()) {
91
+ if(attr.isNil()) {
92
+ attr = RubyHash.newHash(runtime);
93
+ }
94
+ ((RubyHash)attr).aset(K,V);
95
+ }
96
+ }
97
+
98
+ public void ATTR(IRubyObject[] K, IRubyObject V) {
99
+ ATTR(K[0],V);
100
+ }
101
+
102
+ public void ATTR(IRubyObject K, IRubyObject[] V) {
103
+ ATTR(K,V[0]);
104
+ }
105
+
106
+ public void ATTR(IRubyObject[] K, IRubyObject[] V) {
107
+ ATTR(K[0],V[0]);
108
+ }
109
+
110
+ public void TEXT_PASS() {
111
+ if(!text) {
112
+ if(ele_open) {
113
+ ele_open = false;
114
+ if(tokstart > -1) {
115
+ mark_tag = tokstart;
116
+ }
117
+ } else {
118
+ mark_tag = p;
119
+ }
120
+ attr = runtime.getNil();
121
+ tag[0] = runtime.getNil();
122
+ text = true;
123
+ }
124
+ }
125
+
126
+ public void EBLK(IRubyObject N, int T) {
127
+ CAT(tag, p - T + 1);
128
+ ELE(N);
129
+ }
130
+
131
+
132
+ public void rb_raise(RubyClass error, String message) {
133
+ throw new RaiseException(runtime, error, message, true);
134
+ }
135
+
136
+ public IRubyObject rb_str_new2(String s) {
137
+ return runtime.newString(s);
138
+ }
139
+
140
%%{
  machine hpricot_scan;

  # Start of a new markup element: flush a pending text run, then reset the per-element state.
  action newEle {
    if (text) {
      CAT(tag, p);
      ELE(sym_text);
      text = false;
    }
    ele_open = true;
    mark_tag = -1;
    tag[0] = runtime.getNil();
    attr = runtime.getNil();
  }

  # Mark actions: remember where a tag name, attribute value or attribute key begins.
  action _tag {
    mark_tag = p;
  }
  action _aval {
    mark_aval = p;
  }
  action _akey {
    mark_akey = p;
  }

  # Capture actions: copy the text between a mark and the current position into its holder.
  action tag {
    SET(tag, p);
  }
  action tagc {
    SET(tag, p-1);
  }
  action aval {
    SET(aval, p);
  }
  # Unquoted attribute value: a trailing quote character is left out of the value.
  action aunq {
    SET(aval, (buf[p-1] == '"' || buf[p-1] == '\'') ? p-1 : p);
  }
  action akey {
    SET(akey, p);
  }

  # XML declaration and DOCTYPE fields are stored as attributes under fixed keys.
  action xmlver {
    SET(aval, p);
    ATTR(rb_str_new2("version"), aval);
  }
  action xmlenc {
    SET(aval, p);
    ATTR(rb_str_new2("encoding"), aval);
  }
  action xmlsd {
    SET(aval, p);
    ATTR(rb_str_new2("standalone"), aval);
  }
  action pubid {
    SET(aval, p);
    ATTR(rb_str_new2("public_id"), aval);
  }
  action sysid {
    SET(aval, p);
    ATTR(rb_str_new2("system_id"), aval);
  }

  # Reset key/value holders and marks before scanning the next attribute.
  action new_attr {
    mark_akey = -1;
    mark_aval = -1;
    akey[0] = runtime.getNil();
    aval[0] = runtime.getNil();
  }

  # Store the attribute that was just scanned.
  action save_attr {
    ATTR(akey, aval);
  }

  include hpricot_common "ext/hpricot_scan/hpricot_common.rl";

}%%
186
+
187
+ %% write data nofinal;
188
+
189
+ public final static int BUFSIZE=16384;
190
+
191
+ private void rb_yield_tokens(IRubyObject sym, IRubyObject tag, IRubyObject attr, IRubyObject raw, boolean taint) {
192
+ IRubyObject ary;
193
+ if (sym == runtime.newSymbol("text")) {
194
+ raw = tag;
195
+ }
196
+ ary = runtime.newArray(new IRubyObject[]{sym, tag, attr, raw});
197
+ if (taint) {
198
+ ary.setTaint(true);
199
+ tag.setTaint(true);
200
+ attr.setTaint(true);
201
+ raw.setTaint(true);
202
+ }
203
+ block.yield(runtime.getCurrentContext(), ary, null, null, false);
204
+ }
205
+
206
+
207
+ int cs, act, have = 0, nread = 0, curline = 1, p=-1;
208
+ boolean text = false;
209
+ int tokstart=-1, tokend;
210
+ char[] buf;
211
+ Ruby runtime;
212
+ IRubyObject attr, bufsize;
213
+ IRubyObject[] tag, akey, aval;
214
+ int mark_tag, mark_akey, mark_aval;
215
+ boolean done = false, ele_open = false;
216
+ int buffer_size = 0;
217
+ boolean taint = false;
218
+ Block block = null;
219
+
220
+
221
+ IRubyObject xmldecl, doctype, procins, stag, etag, emptytag, comment,
222
+ cdata, sym_text;
223
+
224
+ IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
225
+ attr = bufsize = runtime.getNil();
226
+ tag = new IRubyObject[]{runtime.getNil()};
227
+ akey = new IRubyObject[]{runtime.getNil()};
228
+ aval = new IRubyObject[]{runtime.getNil()};
229
+
230
+ RubyClass rb_eHpricotParseError = runtime.getModule("Hpricot").getClass("ParseError");
231
+
232
+ taint = port.isTaint();
233
+ if ( !port.respondsTo("read")) {
234
+ if ( port.respondsTo("to_str")) {
235
+ port = port.callMethod(runtime.getCurrentContext(),"to_str");
236
+ } else {
237
+ throw runtime.newArgumentError("bad Hpricot argument, String or IO only please.");
238
+ }
239
+ }
240
+
241
+ buffer_size = BUFSIZE;
242
+ if (recv.getInstanceVariable("@buffer_size") != null) {
243
+ bufsize = recv.getInstanceVariable("@buffer_size");
244
+ if (!bufsize.isNil()) {
245
+ buffer_size = RubyNumeric.fix2int(bufsize);
246
+ }
247
+ }
248
+ buf = new char[buffer_size];
249
+
250
+ %% write init;
251
+
252
+ while( !done ) {
253
+ IRubyObject str;
254
+ p = have;
255
+ int pe;
256
+ int len, space = buffer_size - have;
257
+
258
+ if ( space == 0 ) {
259
+ /* We've used up the entire buffer storing an already-parsed token
260
+ * prefix that must be preserved. Likely caused by super-long attributes.
261
+ * See ticket #13. */
262
+ rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <" + tag.toString() + ">, starting on line "+curline+".");
263
+ }
264
+
265
+ if (port.respondsTo("read")) {
266
+ str = port.callMethod(runtime.getCurrentContext(),"read",runtime.newFixnum(space));
267
+ } else {
268
+ str = ((RubyString)port).substr(nread,space);
269
+ }
270
+
271
+ str = str.convertToString();
272
+ String sss = str.toString();
273
+ char[] chars = sss.toCharArray();
274
+ System.arraycopy(chars,0,buf,p,chars.length);
275
+
276
+ len = sss.length();
277
+ nread += len;
278
+
279
+ if ( len < space ) {
280
+ len++;
281
+ done = true;
282
+ }
283
+
284
+ pe = p + len;
285
+ char[] data = buf;
286
+
287
+ %% write exec;
288
+
289
+ if ( cs == hpricot_scan_error ) {
290
+ if(!tag[0].isNil()) {
291
+ rb_raise(rb_eHpricotParseError, "parse error on element <"+tag.toString()+">, starting on line "+curline+".\n" + NO_WAY_SERIOUSLY);
292
+ } else {
293
+ rb_raise(rb_eHpricotParseError, "parse error on line "+curline+".\n" + NO_WAY_SERIOUSLY);
294
+ }
295
+ }
296
+
297
+ if ( done && ele_open ) {
298
+ ele_open = false;
299
+ if(tokstart > -1) {
300
+ mark_tag = tokstart;
301
+ tokstart = -1;
302
+ text = true;
303
+ }
304
+ }
305
+
306
+ if(tokstart == -1) {
307
+ have = 0;
308
+ /* text nodes have no tokstart because each byte is parsed alone */
309
+ if(mark_tag != -1 && text) {
310
+ if (done) {
311
+ if(mark_tag < p-1) {
312
+ CAT(tag, p-1);
313
+ ELE(sym_text);
314
+ }
315
+ } else {
316
+ CAT(tag, p);
317
+ }
318
+ }
319
+ mark_tag = 0;
320
+ } else {
321
+ have = pe - tokstart;
322
+ System.arraycopy(buf,tokstart,buf,0,have);
323
+ SLIDE(tag);
324
+ SLIDE(akey);
325
+ SLIDE(aval);
326
+ tokend = (tokend - tokstart);
327
+ tokstart = 0;
328
+ }
329
+ }
330
+ return runtime.getNil();
331
+ }
332
+
333
+ public static IRubyObject __hpricot_scan(IRubyObject recv, IRubyObject port, Block block) {
334
+ Ruby runtime = recv.getRuntime();
335
+ HpricotScanService service = new HpricotScanService();
336
+ service.runtime = runtime;
337
+ service.xmldecl = runtime.newSymbol("xmldecl");
338
+ service.doctype = runtime.newSymbol("doctype");
339
+ service.procins = runtime.newSymbol("procins");
340
+ service.stag = runtime.newSymbol("stag");
341
+ service.etag = runtime.newSymbol("etag");
342
+ service.emptytag = runtime.newSymbol("emptytag");
343
+ service.comment = runtime.newSymbol("comment");
344
+ service.cdata = runtime.newSymbol("cdata");
345
+ service.sym_text = runtime.newSymbol("text");
346
+ service.block = block;
347
+ return service.hpricot_scan(recv, port);
348
+ }
349
+
350
+
351
+ public boolean basicLoad(final Ruby runtime) throws IOException {
352
+ Init_hpricot_scan(runtime);
353
+ return true;
354
+ }
355
+
356
+ public static void Init_hpricot_scan(Ruby runtime) {
357
+ RubyModule mHpricot = runtime.defineModule("Hpricot");
358
+ mHpricot.getMetaClass().attr_accessor(new IRubyObject[]{runtime.newSymbol("buffer_size")});
359
+ CallbackFactory fact = runtime.callbackFactory(HpricotScanService.class);
360
+ mHpricot.getMetaClass().defineMethod("scan",fact.getSingletonMethod("__hpricot_scan",IRubyObject.class));
361
+ mHpricot.defineClassUnder("ParseError",runtime.getClass("Exception"),runtime.getClass("Exception").getAllocator());
362
+ }
363
+ }
@@ -102,75 +102,8 @@ static ID s_read, s_to_str;
102
102
  ATTR(akey, aval);
103
103
  }
104
104
 
105
- #
106
- # HTML tokens
107
- # (a blatant rip from HTree)
108
- #
109
- newline = '\n' @{curline += 1;} ;
110
- # qtext = '"' ( '\"' | [^\n"] )* '"' | "'" ( "\\'" | [^\n'] )* "'" ;
111
- NameChar = [\-A-Za-z0-9._:?] ;
112
- Name = [A-Za-z_:] NameChar* ;
113
- StartComment = "<!--" ;
114
- EndComment = "-->" ;
115
- StartCdata = "<![CDATA[" ;
116
- EndCdata = "]]>" ;
117
-
118
- NameCap = Name >_tag %tag;
119
- NameAttr = NameChar+ >_akey %akey ;
120
- Q1Attr = [^']* >_aval %aval ;
121
- Q2Attr = [^"]* >_aval %aval ;
122
- UnqAttr = ( space >_aval | [^ \t\n<>"'] >_aval [^ \t\n<>]* %aunq ) ;
123
- Nmtoken = NameChar+ >_akey %akey ;
124
-
125
- Attr = NameAttr space* "=" space* ('"' Q2Attr '"' | "'" Q1Attr "'" | UnqAttr space+ ) space* ;
126
- AttrEnd = ( NameAttr space* "=" space* UnqAttr? | Nmtoken >new_attr %save_attr ) ;
127
- AttrSet = ( Attr >new_attr %save_attr | Nmtoken >new_attr space+ %save_attr ) ;
128
- StartTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? ">" | "<" NameCap ">";
129
- EmptyTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? "/>" | "<" NameCap "/>" ;
130
-
131
- EndTag = "</" NameCap space* ">" ;
132
- XmlVersionNum = [a-zA-Z0-9_.:\-]+ >_aval %xmlver ;
133
- XmlVersionInfo = space+ "version" space* "=" space* ("'" XmlVersionNum "'" | '"' XmlVersionNum '"' ) ;
134
- XmlEncName = [A-Za-z] >_aval [A-Za-z0-9._\-]* %xmlenc ;
135
- XmlEncodingDecl = space+ "encoding" space* "=" space* ("'" XmlEncName "'" | '"' XmlEncName '"' ) ;
136
- XmlYesNo = ("yes" | "no") >_aval %xmlsd ;
137
- XmlSDDecl = space+ "standalone" space* "=" space* ("'" XmlYesNo "'" | '"' XmlYesNo '"') ;
138
- XmlDecl = "<?xml" XmlVersionInfo XmlEncodingDecl? XmlSDDecl? space* "?>" ;
139
-
140
- SystemLiteral = '"' [^"]* >_aval %sysid '"' | "'" [^']* >_aval %sysid "'" ;
141
- PubidLiteral = '"' [\t a-zA-Z0-9\-'()+,./:=?;!*\#@$_%]* >_aval %pubid '"' |
142
- "'" [\t a-zA-Z0-9\-'()+,./:=?;!*\#@$_%]* >_aval %pubid "'" ;
143
- ExternalID = ( "SYSTEM" | "PUBLIC" space+ PubidLiteral ) (space+ SystemLiteral)? ;
144
- DocType = "<!DOCTYPE" space+ NameCap (space+ ExternalID)? space* ("[" [^\]]* "]" space*)? ">" ;
145
- StartXmlProcIns = "<?" Name >{ TEXT_PASS(); } space+ ;
146
- EndXmlProcIns = "?>" ;
147
-
148
- html_comment := |*
149
- EndComment @{ EBLK(comment, 3); fgoto main; };
150
- any | newline { TEXT_PASS(); };
151
- *|;
152
-
153
- html_cdata := |*
154
- EndCdata @{ EBLK(cdata, 3); fgoto main; };
155
- any | newline { TEXT_PASS(); };
156
- *|;
157
-
158
- html_procins := |*
159
- EndXmlProcIns @{ EBLK(procins, 2); fgoto main; };
160
- any | newline { TEXT_PASS(); };
161
- *|;
162
-
163
- main := |*
164
- XmlDecl >newEle { ELE(xmldecl); };
165
- DocType >newEle { ELE(doctype); };
166
- StartXmlProcIns >newEle { fgoto html_procins; };
167
- StartTag >newEle { ELE(stag); };
168
- EndTag >newEle { ELE(etag); };
169
- EmptyTag >newEle { ELE(emptytag); };
170
- StartComment >newEle { fgoto html_comment; };
171
- StartCdata >newEle { fgoto html_cdata; };
172
- any | newline { TEXT_PASS(); };
173
- *|;
105
+ include hpricot_common "ext/hpricot_scan/hpricot_common.rl";
106
+
174
107
  }%%
175
108
 
176
109
  %% write data nofinal;
@@ -212,7 +145,7 @@ VALUE hpricot_scan(VALUE self, VALUE port)
212
145
  }
213
146
  else
214
147
  {
215
- rb_raise( rb_eArgError, "bad argument, String or IO only please." );
148
+ rb_raise( rb_eArgError, "bad Hpricot argument, String or IO only please." );
216
149
  }
217
150
  }
218
151
 
data/lib/hpricot.rb CHANGED
@@ -23,3 +23,4 @@ require 'hpricot/modules'
23
23
  require 'hpricot/traverse'
24
24
  require 'hpricot/inspect'
25
25
  require 'hpricot/parse'
26
+ require 'hpricot/builder'