hpricot 0.5-mswin32 → 0.6-mswin32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,363 @@
1
+
2
+ import java.io.IOException;
3
+
4
+ import org.jruby.Ruby;
5
+ import org.jruby.RubyClass;
6
+ import org.jruby.RubyHash;
7
+ import org.jruby.RubyModule;
8
+ import org.jruby.RubyNumeric;
9
+ import org.jruby.RubyString;
10
+ import org.jruby.runtime.Block;
11
+ import org.jruby.runtime.CallbackFactory;
12
+ import org.jruby.runtime.builtin.IRubyObject;
13
+ import org.jruby.exceptions.RaiseException;
14
+ import org.jruby.runtime.load.BasicLibraryService;
15
+
16
+ public class HpricotScanService implements BasicLibraryService {
17
+ public static String NO_WAY_SERIOUSLY="*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!";
18
+
19
+ public void ELE(IRubyObject N) {
20
+ if (tokend > tokstart || text) {
21
+ IRubyObject raw_string = runtime.getNil();
22
+ ele_open = false; text = false;
23
+ if (tokstart != -1 && N != cdata && N != sym_text && N != procins && N != comment) {
24
+ raw_string = runtime.newString(new String(buf,tokstart,tokend-tokstart));
25
+ }
26
+ rb_yield_tokens(N, tag[0], attr, raw_string, taint);
27
+ }
28
+ }
29
+
30
+ public void SET(IRubyObject[] N, int E) {
31
+ int mark = 0;
32
+ if(N == tag) {
33
+ if(mark_tag == -1 || E == mark_tag) {
34
+ tag[0] = runtime.newString("");
35
+ } else if(E > mark_tag) {
36
+ tag[0] = runtime.newString(new String(buf,mark_tag, E-mark_tag));
37
+ }
38
+ } else if(N == akey) {
39
+ if(mark_akey == -1 || E == mark_akey) {
40
+ akey[0] = runtime.newString("");
41
+ } else if(E > mark_akey) {
42
+ akey[0] = runtime.newString(new String(buf,mark_akey, E-mark_akey));
43
+ }
44
+ } else if(N == aval) {
45
+ if(mark_aval == -1 || E == mark_aval) {
46
+ aval[0] = runtime.newString("");
47
+ } else if(E > mark_aval) {
48
+ aval[0] = runtime.newString(new String(buf,mark_aval, E-mark_aval));
49
+ }
50
+ }
51
+ }
52
+
53
+ public void CAT(IRubyObject[] N, int E) {
54
+ if(N[0].isNil()) {
55
+ SET(N,E);
56
+ } else {
57
+ int mark = 0;
58
+ if(N == tag) {
59
+ mark = mark_tag;
60
+ } else if(N == akey) {
61
+ mark = mark_akey;
62
+ } else if(N == aval) {
63
+ mark = mark_aval;
64
+ }
65
+ ((RubyString)(N[0])).append(runtime.newString(new String(buf, mark, E-mark)));
66
+ }
67
+ }
68
+
69
+ public void SLIDE(Object N) {
70
+ int mark = 0;
71
+ if(N == tag) {
72
+ mark = mark_tag;
73
+ } else if(N == akey) {
74
+ mark = mark_akey;
75
+ } else if(N == aval) {
76
+ mark = mark_aval;
77
+ }
78
+ if(mark > tokstart) {
79
+ if(N == tag) {
80
+ mark_tag -= tokstart;
81
+ } else if(N == akey) {
82
+ mark_akey -= tokstart;
83
+ } else if(N == aval) {
84
+ mark_aval -= tokstart;
85
+ }
86
+ }
87
+ }
88
+
89
+ public void ATTR(IRubyObject K, IRubyObject V) {
90
+ if(!K.isNil()) {
91
+ if(attr.isNil()) {
92
+ attr = RubyHash.newHash(runtime);
93
+ }
94
+ ((RubyHash)attr).aset(K,V);
95
+ }
96
+ }
97
+
98
+ public void ATTR(IRubyObject[] K, IRubyObject V) {
99
+ ATTR(K[0],V);
100
+ }
101
+
102
+ public void ATTR(IRubyObject K, IRubyObject[] V) {
103
+ ATTR(K,V[0]);
104
+ }
105
+
106
+ public void ATTR(IRubyObject[] K, IRubyObject[] V) {
107
+ ATTR(K[0],V[0]);
108
+ }
109
+
110
+ public void TEXT_PASS() {
111
+ if(!text) {
112
+ if(ele_open) {
113
+ ele_open = false;
114
+ if(tokstart > -1) {
115
+ mark_tag = tokstart;
116
+ }
117
+ } else {
118
+ mark_tag = p;
119
+ }
120
+ attr = runtime.getNil();
121
+ tag[0] = runtime.getNil();
122
+ text = true;
123
+ }
124
+ }
125
+
126
+ public void EBLK(IRubyObject N, int T) {
127
+ CAT(tag, p - T + 1);
128
+ ELE(N);
129
+ }
130
+
131
+
132
+ public void rb_raise(RubyClass error, String message) {
133
+ throw new RaiseException(runtime, error, message, true);
134
+ }
135
+
136
+ public IRubyObject rb_str_new2(String s) {
137
+ return runtime.newString(s);
138
+ }
139
+
140
%%{
  # Host-language actions for the hpricot_scan machine. The grammar that
  # references these actions is pulled in by the include at the bottom.
  machine hpricot_scan;

  # Start of a new element: flush any pending text token, then reset the
  # tag/attribute state and flag the element as open.
  action newEle {
    if (text) {
      CAT(tag, p);
      ELE(sym_text);
      text = false;
    }
    attr = runtime.getNil();
    tag[0] = runtime.getNil();
    mark_tag = -1;
    ele_open = true;
  }

  # Mark actions: remember where a tag name, attribute value or attribute
  # key begins.
  action _tag {
    mark_tag = p;
  }
  action _aval {
    mark_aval = p;
  }
  action _akey {
    mark_akey = p;
  }

  # Capture actions: slice the buffer from the matching mark.
  action tag {
    SET(tag, p);
  }
  action tagc {
    SET(tag, p-1);
  }
  action aval {
    SET(aval, p);
  }

  # Attribute value capture that leaves off a quote character sitting
  # immediately before the current position.
  action aunq {
    if (buf[p-1] == '"' || buf[p-1] == '\'') {
      SET(aval, p-1);
    }
    else {
      SET(aval, p);
    }
  }

  action akey {
    SET(akey, p);
  }

  # Captures stored directly into the attribute hash under a fixed key.
  action xmlver {
    SET(aval, p);
    ATTR(rb_str_new2("version"), aval);
  }
  action xmlenc {
    SET(aval, p);
    ATTR(rb_str_new2("encoding"), aval);
  }
  action xmlsd {
    SET(aval, p);
    ATTR(rb_str_new2("standalone"), aval);
  }
  action pubid {
    SET(aval, p);
    ATTR(rb_str_new2("public_id"), aval);
  }
  action sysid {
    SET(aval, p);
    ATTR(rb_str_new2("system_id"), aval);
  }

  # Reset key/value state before an attribute is scanned.
  action new_attr {
    akey[0] = runtime.getNil();
    aval[0] = runtime.getNil();
    mark_akey = -1;
    mark_aval = -1;
  }

  # Store the scanned key/value pair.
  action save_attr {
    ATTR(akey, aval);
  }

  include hpricot_common "ext/hpricot_scan/hpricot_common.rl";

}%%
186
+
187
+ %% write data nofinal;
188
+
189
+ public final static int BUFSIZE=16384;
190
+
191
+ private void rb_yield_tokens(IRubyObject sym, IRubyObject tag, IRubyObject attr, IRubyObject raw, boolean taint) {
192
+ IRubyObject ary;
193
+ if (sym == runtime.newSymbol("text")) {
194
+ raw = tag;
195
+ }
196
+ ary = runtime.newArray(new IRubyObject[]{sym, tag, attr, raw});
197
+ if (taint) {
198
+ ary.setTaint(true);
199
+ tag.setTaint(true);
200
+ attr.setTaint(true);
201
+ raw.setTaint(true);
202
+ }
203
+ block.yield(runtime.getCurrentContext(), ary, null, null, false);
204
+ }
205
+
206
+
207
+ int cs, act, have = 0, nread = 0, curline = 1, p=-1;
208
+ boolean text = false;
209
+ int tokstart=-1, tokend;
210
+ char[] buf;
211
+ Ruby runtime;
212
+ IRubyObject attr, bufsize;
213
+ IRubyObject[] tag, akey, aval;
214
+ int mark_tag, mark_akey, mark_aval;
215
+ boolean done = false, ele_open = false;
216
+ int buffer_size = 0;
217
+ boolean taint = false;
218
+ Block block = null;
219
+
220
+
221
+ IRubyObject xmldecl, doctype, procins, stag, etag, emptytag, comment,
222
+ cdata, sym_text;
223
+
224
+ IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
225
+ attr = bufsize = runtime.getNil();
226
+ tag = new IRubyObject[]{runtime.getNil()};
227
+ akey = new IRubyObject[]{runtime.getNil()};
228
+ aval = new IRubyObject[]{runtime.getNil()};
229
+
230
+ RubyClass rb_eHpricotParseError = runtime.getModule("Hpricot").getClass("ParseError");
231
+
232
+ taint = port.isTaint();
233
+ if ( !port.respondsTo("read")) {
234
+ if ( port.respondsTo("to_str")) {
235
+ port = port.callMethod(runtime.getCurrentContext(),"to_str");
236
+ } else {
237
+ throw runtime.newArgumentError("bad Hpricot argument, String or IO only please.");
238
+ }
239
+ }
240
+
241
+ buffer_size = BUFSIZE;
242
+ if (recv.getInstanceVariable("@buffer_size") != null) {
243
+ bufsize = recv.getInstanceVariable("@buffer_size");
244
+ if (!bufsize.isNil()) {
245
+ buffer_size = RubyNumeric.fix2int(bufsize);
246
+ }
247
+ }
248
+ buf = new char[buffer_size];
249
+
250
+ %% write init;
251
+
252
+ while( !done ) {
253
+ IRubyObject str;
254
+ p = have;
255
+ int pe;
256
+ int len, space = buffer_size - have;
257
+
258
+ if ( space == 0 ) {
259
+ /* We've used up the entire buffer storing an already-parsed token
260
+ * prefix that must be preserved. Likely caused by super-long attributes.
261
+ * See ticket #13. */
262
+ rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <" + tag.toString() + ">, starting on line "+curline+".");
263
+ }
264
+
265
+ if (port.respondsTo("read")) {
266
+ str = port.callMethod(runtime.getCurrentContext(),"read",runtime.newFixnum(space));
267
+ } else {
268
+ str = ((RubyString)port).substr(nread,space);
269
+ }
270
+
271
+ str = str.convertToString();
272
+ String sss = str.toString();
273
+ char[] chars = sss.toCharArray();
274
+ System.arraycopy(chars,0,buf,p,chars.length);
275
+
276
+ len = sss.length();
277
+ nread += len;
278
+
279
+ if ( len < space ) {
280
+ len++;
281
+ done = true;
282
+ }
283
+
284
+ pe = p + len;
285
+ char[] data = buf;
286
+
287
+ %% write exec;
288
+
289
+ if ( cs == hpricot_scan_error ) {
290
+ if(!tag[0].isNil()) {
291
+ rb_raise(rb_eHpricotParseError, "parse error on element <"+tag.toString()+">, starting on line "+curline+".\n" + NO_WAY_SERIOUSLY);
292
+ } else {
293
+ rb_raise(rb_eHpricotParseError, "parse error on line "+curline+".\n" + NO_WAY_SERIOUSLY);
294
+ }
295
+ }
296
+
297
+ if ( done && ele_open ) {
298
+ ele_open = false;
299
+ if(tokstart > -1) {
300
+ mark_tag = tokstart;
301
+ tokstart = -1;
302
+ text = true;
303
+ }
304
+ }
305
+
306
+ if(tokstart == -1) {
307
+ have = 0;
308
+ /* text nodes have no tokstart because each byte is parsed alone */
309
+ if(mark_tag != -1 && text) {
310
+ if (done) {
311
+ if(mark_tag < p-1) {
312
+ CAT(tag, p-1);
313
+ ELE(sym_text);
314
+ }
315
+ } else {
316
+ CAT(tag, p);
317
+ }
318
+ }
319
+ mark_tag = 0;
320
+ } else {
321
+ have = pe - tokstart;
322
+ System.arraycopy(buf,tokstart,buf,0,have);
323
+ SLIDE(tag);
324
+ SLIDE(akey);
325
+ SLIDE(aval);
326
+ tokend = (tokend - tokstart);
327
+ tokstart = 0;
328
+ }
329
+ }
330
+ return runtime.getNil();
331
+ }
332
+
333
+ public static IRubyObject __hpricot_scan(IRubyObject recv, IRubyObject port, Block block) {
334
+ Ruby runtime = recv.getRuntime();
335
+ HpricotScanService service = new HpricotScanService();
336
+ service.runtime = runtime;
337
+ service.xmldecl = runtime.newSymbol("xmldecl");
338
+ service.doctype = runtime.newSymbol("doctype");
339
+ service.procins = runtime.newSymbol("procins");
340
+ service.stag = runtime.newSymbol("stag");
341
+ service.etag = runtime.newSymbol("etag");
342
+ service.emptytag = runtime.newSymbol("emptytag");
343
+ service.comment = runtime.newSymbol("comment");
344
+ service.cdata = runtime.newSymbol("cdata");
345
+ service.sym_text = runtime.newSymbol("text");
346
+ service.block = block;
347
+ return service.hpricot_scan(recv, port);
348
+ }
349
+
350
+
351
+ public boolean basicLoad(final Ruby runtime) throws IOException {
352
+ Init_hpricot_scan(runtime);
353
+ return true;
354
+ }
355
+
356
+ public static void Init_hpricot_scan(Ruby runtime) {
357
+ RubyModule mHpricot = runtime.defineModule("Hpricot");
358
+ mHpricot.getMetaClass().attr_accessor(new IRubyObject[]{runtime.newSymbol("buffer_size")});
359
+ CallbackFactory fact = runtime.callbackFactory(HpricotScanService.class);
360
+ mHpricot.getMetaClass().defineMethod("scan",fact.getSingletonMethod("__hpricot_scan",IRubyObject.class));
361
+ mHpricot.defineClassUnder("ParseError",runtime.getClass("Exception"),runtime.getClass("Exception").getAllocator());
362
+ }
363
+ }
@@ -102,75 +102,8 @@ static ID s_read, s_to_str;
102
102
  ATTR(akey, aval);
103
103
  }
104
104
 
105
- #
106
- # HTML tokens
107
- # (a blatant rip from HTree)
108
- #
109
- newline = '\n' @{curline += 1;} ;
110
- # qtext = '"' ( '\"' | [^\n"] )* '"' | "'" ( "\\'" | [^\n'] )* "'" ;
111
- NameChar = [\-A-Za-z0-9._:?] ;
112
- Name = [A-Za-z_:] NameChar* ;
113
- StartComment = "<!--" ;
114
- EndComment = "-->" ;
115
- StartCdata = "<![CDATA[" ;
116
- EndCdata = "]]>" ;
117
-
118
- NameCap = Name >_tag %tag;
119
- NameAttr = NameChar+ >_akey %akey ;
120
- Q1Attr = [^']* >_aval %aval ;
121
- Q2Attr = [^"]* >_aval %aval ;
122
- UnqAttr = ( space >_aval | [^ \t\n<>"'] >_aval [^ \t\n<>]* %aunq ) ;
123
- Nmtoken = NameChar+ >_akey %akey ;
124
-
125
- Attr = NameAttr space* "=" space* ('"' Q2Attr '"' | "'" Q1Attr "'" | UnqAttr space+ ) space* ;
126
- AttrEnd = ( NameAttr space* "=" space* UnqAttr? | Nmtoken >new_attr %save_attr ) ;
127
- AttrSet = ( Attr >new_attr %save_attr | Nmtoken >new_attr space+ %save_attr ) ;
128
- StartTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? ">" | "<" NameCap ">";
129
- EmptyTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? "/>" | "<" NameCap "/>" ;
130
-
131
- EndTag = "</" NameCap space* ">" ;
132
- XmlVersionNum = [a-zA-Z0-9_.:\-]+ >_aval %xmlver ;
133
- XmlVersionInfo = space+ "version" space* "=" space* ("'" XmlVersionNum "'" | '"' XmlVersionNum '"' ) ;
134
- XmlEncName = [A-Za-z] >_aval [A-Za-z0-9._\-]* %xmlenc ;
135
- XmlEncodingDecl = space+ "encoding" space* "=" space* ("'" XmlEncName "'" | '"' XmlEncName '"' ) ;
136
- XmlYesNo = ("yes" | "no") >_aval %xmlsd ;
137
- XmlSDDecl = space+ "standalone" space* "=" space* ("'" XmlYesNo "'" | '"' XmlYesNo '"') ;
138
- XmlDecl = "<?xml" XmlVersionInfo XmlEncodingDecl? XmlSDDecl? space* "?>" ;
139
-
140
- SystemLiteral = '"' [^"]* >_aval %sysid '"' | "'" [^']* >_aval %sysid "'" ;
141
- PubidLiteral = '"' [\t a-zA-Z0-9\-'()+,./:=?;!*\#@$_%]* >_aval %pubid '"' |
142
- "'" [\t a-zA-Z0-9\-'()+,./:=?;!*\#@$_%]* >_aval %pubid "'" ;
143
- ExternalID = ( "SYSTEM" | "PUBLIC" space+ PubidLiteral ) (space+ SystemLiteral)? ;
144
- DocType = "<!DOCTYPE" space+ NameCap (space+ ExternalID)? space* ("[" [^\]]* "]" space*)? ">" ;
145
- StartXmlProcIns = "<?" Name >{ TEXT_PASS(); } space+ ;
146
- EndXmlProcIns = "?>" ;
147
-
148
- html_comment := |*
149
- EndComment @{ EBLK(comment, 3); fgoto main; };
150
- any | newline { TEXT_PASS(); };
151
- *|;
152
-
153
- html_cdata := |*
154
- EndCdata @{ EBLK(cdata, 3); fgoto main; };
155
- any | newline { TEXT_PASS(); };
156
- *|;
157
-
158
- html_procins := |*
159
- EndXmlProcIns @{ EBLK(procins, 2); fgoto main; };
160
- any | newline { TEXT_PASS(); };
161
- *|;
162
-
163
- main := |*
164
- XmlDecl >newEle { ELE(xmldecl); };
165
- DocType >newEle { ELE(doctype); };
166
- StartXmlProcIns >newEle { fgoto html_procins; };
167
- StartTag >newEle { ELE(stag); };
168
- EndTag >newEle { ELE(etag); };
169
- EmptyTag >newEle { ELE(emptytag); };
170
- StartComment >newEle { fgoto html_comment; };
171
- StartCdata >newEle { fgoto html_cdata; };
172
- any | newline { TEXT_PASS(); };
173
- *|;
105
+ include hpricot_common "ext/hpricot_scan/hpricot_common.rl";
106
+
174
107
  }%%
175
108
 
176
109
  %% write data nofinal;
@@ -212,7 +145,7 @@ VALUE hpricot_scan(VALUE self, VALUE port)
212
145
  }
213
146
  else
214
147
  {
215
- rb_raise( rb_eArgError, "bad argument, String or IO only please." );
148
+ rb_raise( rb_eArgError, "bad Hpricot argument, String or IO only please." );
216
149
  }
217
150
  }
218
151
 
@@ -23,3 +23,4 @@ require 'hpricot/modules'
23
23
  require 'hpricot/traverse'
24
24
  require 'hpricot/inspect'
25
25
  require 'hpricot/parse'
26
+ require 'hpricot/builder'