hpricot 0.6-mswin32 → 0.6.164-mswin32

Sign up to get free protection for your applications and to get access to all the features.
@@ -6,7 +6,9 @@ import org.jruby.RubyClass;
6
6
  import org.jruby.RubyHash;
7
7
  import org.jruby.RubyModule;
8
8
  import org.jruby.RubyNumeric;
9
+ import org.jruby.RubyObjectAdapter;
9
10
  import org.jruby.RubyString;
11
+ import org.jruby.javasupport.JavaEmbedUtils;
10
12
  import org.jruby.runtime.Block;
11
13
  import org.jruby.runtime.CallbackFactory;
12
14
  import org.jruby.runtime.builtin.IRubyObject;
@@ -15,13 +17,14 @@ import org.jruby.runtime.load.BasicLibraryService;
15
17
 
16
18
  public class HpricotScanService implements BasicLibraryService {
17
19
  public static String NO_WAY_SERIOUSLY="*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!";
20
+ private static RubyObjectAdapter rubyApi;
18
21
 
19
22
  public void ELE(IRubyObject N) {
20
- if (tokend > tokstart || text) {
23
+ if (te > ts || text) {
21
24
  IRubyObject raw_string = runtime.getNil();
22
25
  ele_open = false; text = false;
23
- if (tokstart != -1 && N != cdata && N != sym_text && N != procins && N != comment) {
24
- raw_string = runtime.newString(new String(buf,tokstart,tokend-tokstart));
26
+ if (ts != -1 && N != cdata && N != sym_text && N != procins && N != comment) {
27
+ raw_string = runtime.newString(new String(buf,ts,te-ts));
25
28
  }
26
29
  rb_yield_tokens(N, tag[0], attr, raw_string, taint);
27
30
  }
@@ -75,13 +78,13 @@ public class HpricotScanService implements BasicLibraryService {
75
78
  } else if(N == aval) {
76
79
  mark = mark_aval;
77
80
  }
78
- if(mark > tokstart) {
81
+ if(mark > ts) {
79
82
  if(N == tag) {
80
- mark_tag -= tokstart;
83
+ mark_tag -= ts;
81
84
  } else if(N == akey) {
82
- mark_akey -= tokstart;
85
+ mark_akey -= ts;
83
86
  } else if(N == aval) {
84
- mark_aval -= tokstart;
87
+ mark_aval -= ts;
85
88
  }
86
89
  }
87
90
  }
@@ -91,7 +94,8 @@ public class HpricotScanService implements BasicLibraryService {
91
94
  if(attr.isNil()) {
92
95
  attr = RubyHash.newHash(runtime);
93
96
  }
94
- ((RubyHash)attr).aset(K,V);
97
+ ((RubyHash)attr).op_aset(runtime.getCurrentContext(),K,V);
98
+ // ((RubyHash)attr).aset(K,V);
95
99
  }
96
100
  }
97
101
 
@@ -111,8 +115,8 @@ public class HpricotScanService implements BasicLibraryService {
111
115
  if(!text) {
112
116
  if(ele_open) {
113
117
  ele_open = false;
114
- if(tokstart > -1) {
115
- mark_tag = tokstart;
118
+ if(ts > -1) {
119
+ mark_tag = ts;
116
120
  }
117
121
  } else {
118
122
  mark_tag = p;
@@ -180,7 +184,7 @@ public class HpricotScanService implements BasicLibraryService {
180
184
  ATTR(akey, aval);
181
185
  }
182
186
 
183
- include hpricot_common "ext/hpricot_scan/hpricot_common.rl";
187
+ include hpricot_common "hpricot_common.rl";
184
188
 
185
189
  }%%
186
190
 
@@ -206,7 +210,8 @@ private void rb_yield_tokens(IRubyObject sym, IRubyObject tag, IRubyObject attr,
206
210
 
207
211
  int cs, act, have = 0, nread = 0, curline = 1, p=-1;
208
212
  boolean text = false;
209
- int tokstart=-1, tokend;
213
+ int ts=-1, te;
214
+ int eof=-1;
210
215
  char[] buf;
211
216
  Ruby runtime;
212
217
  IRubyObject attr, bufsize;
@@ -239,8 +244,8 @@ IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
239
244
  }
240
245
 
241
246
  buffer_size = BUFSIZE;
242
- if (recv.getInstanceVariable("@buffer_size") != null) {
243
- bufsize = recv.getInstanceVariable("@buffer_size");
247
+ if (rubyApi.getInstanceVariable(recv, "@buffer_size") != null) {
248
+ bufsize = rubyApi.getInstanceVariable(recv, "@buffer_size");
244
249
  if (!bufsize.isNil()) {
245
250
  buffer_size = RubyNumeric.fix2int(bufsize);
246
251
  }
@@ -296,16 +301,16 @@ IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
296
301
 
297
302
  if ( done && ele_open ) {
298
303
  ele_open = false;
299
- if(tokstart > -1) {
300
- mark_tag = tokstart;
301
- tokstart = -1;
304
+ if(ts > -1) {
305
+ mark_tag = ts;
306
+ ts = -1;
302
307
  text = true;
303
308
  }
304
309
  }
305
310
 
306
- if(tokstart == -1) {
311
+ if(ts == -1) {
307
312
  have = 0;
308
- /* text nodes have no tokstart because each byte is parsed alone */
313
+ /* text nodes have no ts because each byte is parsed alone */
309
314
  if(mark_tag != -1 && text) {
310
315
  if (done) {
311
316
  if(mark_tag < p-1) {
@@ -318,13 +323,13 @@ IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
318
323
  }
319
324
  mark_tag = 0;
320
325
  } else {
321
- have = pe - tokstart;
322
- System.arraycopy(buf,tokstart,buf,0,have);
326
+ have = pe - ts;
327
+ System.arraycopy(buf,ts,buf,0,have);
323
328
  SLIDE(tag);
324
329
  SLIDE(akey);
325
330
  SLIDE(aval);
326
- tokend = (tokend - tokstart);
327
- tokstart = 0;
331
+ te = (te - ts);
332
+ ts = 0;
328
333
  }
329
334
  }
330
335
  return runtime.getNil();
@@ -355,9 +360,10 @@ public boolean basicLoad(final Ruby runtime) throws IOException {
355
360
 
356
361
  public static void Init_hpricot_scan(Ruby runtime) {
357
362
  RubyModule mHpricot = runtime.defineModule("Hpricot");
358
- mHpricot.getMetaClass().attr_accessor(new IRubyObject[]{runtime.newSymbol("buffer_size")});
363
+ mHpricot.getMetaClass().attr_accessor(runtime.getCurrentContext(),new IRubyObject[]{runtime.newSymbol("buffer_size")});
359
364
  CallbackFactory fact = runtime.callbackFactory(HpricotScanService.class);
360
365
  mHpricot.getMetaClass().defineMethod("scan",fact.getSingletonMethod("__hpricot_scan",IRubyObject.class));
361
- mHpricot.defineClassUnder("ParseError",runtime.getClass("Exception"),runtime.getClass("Exception").getAllocator());
366
+ mHpricot.defineClassUnder("ParseError",runtime.getClass("StandardError"),runtime.getClass("StandardError").getAllocator());
367
+ rubyApi = JavaEmbedUtils.newObjectAdapter();
362
368
  }
363
369
  }
@@ -8,6 +8,12 @@
8
8
  */
9
9
  #include <ruby.h>
10
10
 
11
+ #ifndef RARRAY_LEN
12
+ #define RARRAY_LEN(arr) RARRAY(arr)->len
13
+ #define RSTRING_LEN(str) RSTRING(str)->len
14
+ #define RSTRING_PTR(str) RSTRING(str)->ptr
15
+ #endif
16
+
11
17
  #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
12
18
 
13
19
  static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
@@ -16,11 +22,11 @@ static VALUE rb_eHpricotParseError;
16
22
  static ID s_read, s_to_str;
17
23
 
18
24
  #define ELE(N) \
19
- if (tokend > tokstart || text == 1) { \
25
+ if (te > ts || text == 1) { \
20
26
  VALUE raw_string = Qnil; \
21
27
  ele_open = 0; text = 0; \
22
- if (tokstart != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
23
- raw_string = rb_str_new(tokstart, tokend-tokstart); \
28
+ if (ts != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
29
+ raw_string = rb_str_new(ts, te-ts); \
24
30
  } \
25
31
  rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
26
32
  }
@@ -33,7 +39,7 @@ static ID s_read, s_to_str;
33
39
 
34
40
  #define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
35
41
 
36
- #define SLIDE(N) if ( mark_##N > tokstart ) mark_##N = buf + (mark_##N - tokstart);
42
+ #define SLIDE(N) if ( mark_##N > ts ) mark_##N = buf + (mark_##N - ts);
37
43
 
38
44
  #define ATTR(K, V) \
39
45
  if (!NIL_P(K)) { \
@@ -46,8 +52,8 @@ static ID s_read, s_to_str;
46
52
  { \
47
53
  if (ele_open == 1) { \
48
54
  ele_open = 0; \
49
- if (tokstart > 0) { \
50
- mark_tag = tokstart; \
55
+ if (ts > 0) { \
56
+ mark_tag = ts; \
51
57
  } \
52
58
  } else { \
53
59
  mark_tag = p; \
@@ -102,7 +108,7 @@ static ID s_read, s_to_str;
102
108
  ATTR(akey, aval);
103
109
  }
104
110
 
105
- include hpricot_common "ext/hpricot_scan/hpricot_common.rl";
111
+ include hpricot_common "hpricot_common.rl";
106
112
 
107
113
  }%%
108
114
 
@@ -129,7 +135,7 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
129
135
  VALUE hpricot_scan(VALUE self, VALUE port)
130
136
  {
131
137
  int cs, act, have = 0, nread = 0, curline = 1, text = 0;
132
- char *tokstart = 0, *tokend = 0, *buf = NULL;
138
+ char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
133
139
 
134
140
  VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
135
141
  char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
@@ -169,7 +175,7 @@ VALUE hpricot_scan(VALUE self, VALUE port)
169
175
  /* We've used up the entire buffer storing an already-parsed token
170
176
  * prefix that must be preserved. Likely caused by super-long attributes.
171
177
  * See ticket #13. */
172
- rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING(tag)->ptr, curline);
178
+ rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING_PTR(tag), curline);
173
179
  }
174
180
 
175
181
  if ( rb_respond_to( port, s_read ) )
@@ -182,8 +188,8 @@ VALUE hpricot_scan(VALUE self, VALUE port)
182
188
  }
183
189
 
184
190
  StringValue(str);
185
- memcpy( p, RSTRING(str)->ptr, RSTRING(str)->len );
186
- len = RSTRING(str)->len;
191
+ memcpy( p, RSTRING_PTR(str), RSTRING_LEN(str) );
192
+ len = RSTRING_LEN(str);
187
193
  nread += len;
188
194
 
189
195
  /* If this is the last buffer, tack on an EOF. */
@@ -199,7 +205,7 @@ VALUE hpricot_scan(VALUE self, VALUE port)
199
205
  free(buf);
200
206
  if ( !NIL_P(tag) )
201
207
  {
202
- rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING(tag)->ptr, curline);
208
+ rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
203
209
  }
204
210
  else
205
211
  {
@@ -210,17 +216,17 @@ VALUE hpricot_scan(VALUE self, VALUE port)
210
216
  if ( done && ele_open )
211
217
  {
212
218
  ele_open = 0;
213
- if (tokstart > 0) {
214
- mark_tag = tokstart;
215
- tokstart = 0;
219
+ if (ts > 0) {
220
+ mark_tag = ts;
221
+ ts = 0;
216
222
  text = 1;
217
223
  }
218
224
  }
219
225
 
220
- if ( tokstart == 0 )
226
+ if ( ts == 0 )
221
227
  {
222
228
  have = 0;
223
- /* text nodes have no tokstart because each byte is parsed alone */
229
+ /* text nodes have no ts because each byte is parsed alone */
224
230
  if ( mark_tag != NULL && text == 1 )
225
231
  {
226
232
  if (done)
@@ -240,13 +246,13 @@ VALUE hpricot_scan(VALUE self, VALUE port)
240
246
  }
241
247
  else
242
248
  {
243
- have = pe - tokstart;
244
- memmove( buf, tokstart, have );
249
+ have = pe - ts;
250
+ memmove( buf, ts, have );
245
251
  SLIDE(tag);
246
252
  SLIDE(akey);
247
253
  SLIDE(aval);
248
- tokend = buf + (tokend - tokstart);
249
- tokstart = buf;
254
+ te = buf + (te - ts);
255
+ ts = buf;
250
256
  }
251
257
  }
252
258
  free(buf);
@@ -257,7 +263,7 @@ void Init_hpricot_scan()
257
263
  VALUE mHpricot = rb_define_module("Hpricot");
258
264
  rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
259
265
  rb_define_singleton_method(mHpricot, "scan", hpricot_scan, 1);
260
- rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eException);
266
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
261
267
 
262
268
  s_read = rb_intern("read");
263
269
  s_to_str = rb_intern("to_str");
@@ -0,0 +1,5 @@
1
+ require './hpricot_scan.so'
2
+
3
+ doc = "<doc><person><test>YESSS</test></person><train>SET</train></doc>"
4
+ Hpricot.scan(doc) { |x| p x }
5
+ p Hpricot.lemon(doc)
data/lib/fast_xs.so ADDED
Binary file
@@ -1,8 +1,23 @@
1
1
  require 'hpricot/tags'
2
- require 'hpricot/xchar'
2
+ require 'fast_xs'
3
3
  require 'hpricot/blankslate'
4
4
 
5
5
  module Hpricot
6
+ PREDEFINED = {
7
+ 34 => '&quot;', # quotation mark
8
+ 38 => '&amp;', # ampersand
9
+ 60 => '&lt;', # left angle bracket
10
+ 62 => '&gt;' # right angle bracket
11
+ }
12
+ PREDEFINED_U = PREDEFINED.inject({}) { |hsh, (k, v)| hsh[v] = k; hsh }
13
+
14
+ # XML unescape
15
+ def self.uxs(str)
16
+ str.to_s.
17
+ gsub(/\&\w+;/) { |x| (PREDEFINED_U[x] || ??).chr }.
18
+ gsub(/\&\#(\d+);/) { [$1.to_i].pack("U*") }
19
+ end
20
+
6
21
  def self.build(ele = Doc.new, assigns = {}, &blk)
7
22
  ele.extend Builder
8
23
  assigns.each do |k, v|
@@ -32,7 +47,7 @@ module Hpricot
32
47
 
33
48
  # Write a +string+ to the HTML stream, making sure to escape it.
34
49
  def text!(string)
35
- @children << Text.new(Hpricot.xs(string))
50
+ @children << Text.new(string.fast_xs)
36
51
  end
37
52
 
38
53
  # Write a +string+ to the HTML stream without escaping it.
@@ -75,16 +90,16 @@ module Hpricot
75
90
  # turn arguments into children or attributes
76
91
  childs = []
77
92
  attrs = args.grep(Hash)
78
- childs.concat((args - attrs).map do |x|
93
+ childs.concat((args - attrs).flatten.map do |x|
79
94
  if x.respond_to? :to_html
80
95
  Hpricot.make(x.to_html)
81
96
  elsif x
82
- Text.new(Hpricot.xs(x))
97
+ Text.new(x.fast_xs)
83
98
  end
84
99
  end.flatten)
85
100
  attrs = attrs.inject({}) do |hsh, ath|
86
101
  ath.each do |k, v|
87
- hsh[k] = Hpricot.xs(v.to_s) if v
102
+ hsh[k] = v.to_s.fast_xs if v
88
103
  end
89
104
  hsh
90
105
  end
@@ -130,25 +130,25 @@ module Hpricot
130
130
  # Add to the end of the contents inside each element in this list.
131
131
  # Pass in an HTML +str+, which is turned into Hpricot elements.
132
132
  def append(str = nil, &blk)
133
- each { |x| x.html(x.children + Hpricot.make(str, &blk)) }
133
+ each { |x| x.html(x.children + x.make(str, &blk)) }
134
134
  end
135
135
 
136
136
  # Add to the start of the contents inside each element in this list.
137
137
  # Pass in an HTML +str+, which is turned into Hpricot elements.
138
138
  def prepend(str = nil, &blk)
139
- each { |x| x.html(Hpricot.make(str, &blk) + x.children) }
139
+ each { |x| x.html(x.make(str, &blk) + x.children) }
140
140
  end
141
141
 
142
142
  # Add some HTML just previous to each element in this list.
143
143
  # Pass in an HTML +str+, which is turned into Hpricot elements.
144
144
  def before(str = nil, &blk)
145
- each { |x| x.parent.insert_before Hpricot.make(str, &blk), x }
145
+ each { |x| x.parent.insert_before x.make(str, &blk), x }
146
146
  end
147
147
 
148
148
  # Just after each element in this list, add some HTML.
149
149
  # Pass in an HTML +str+, which is turned into Hpricot elements.
150
150
  def after(str = nil, &blk)
151
- each { |x| x.parent.insert_after Hpricot.make(str, &blk), x }
151
+ each { |x| x.parent.insert_after x.make(str, &blk), x }
152
152
  end
153
153
 
154
154
  # Wraps each element in the list inside the element created by HTML +str+.
@@ -161,10 +161,10 @@ module Hpricot
161
161
  # This code wraps every link on the page inside a +div.link+ and a +div.link_inner+ nest.
162
162
  def wrap(str = nil, &blk)
163
163
  each do |x|
164
- wrap = Hpricot.make(str, &blk)
164
+ wrap = x.make(str, &blk)
165
165
  nest = wrap.detect { |w| w.respond_to? :children }
166
166
  unless nest
167
- raise Exception, "No wrapping element found."
167
+ raise "No wrapping element found."
168
168
  end
169
169
  x.parent.replace_child(x, wrap)
170
170
  nest = nest.children.first until nest.empty?
@@ -261,7 +261,7 @@ module Hpricot
261
261
  self
262
262
  end
263
263
 
264
- ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^\]'"]*)'?"? *\]!i
264
+ ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i
265
265
  BRACK_RE = %r!(\[) *([^\]]*) *\]+!i
266
266
  FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)!
267
267
  CUST_RE = %r!(:)([a-zA-Z0-9\*_-]*)()!
data/lib/hpricot/parse.rb CHANGED
@@ -12,13 +12,14 @@ module Hpricot
12
12
  # Hpricot.parse parses <i>input</i> and return a document tree.
13
13
  # represented by Hpricot::Doc.
14
14
  def Hpricot.parse(input = nil, opts = {}, &blk)
15
- Doc.new(make(input, opts, &blk))
15
+ Doc.new(make(input, opts, &blk), opts)
16
16
  end
17
17
 
18
18
  # Hpricot::XML parses <i>input</i>, disregarding all the HTML rules
19
19
  # and returning a document tree.
20
- def Hpricot.XML(input, opts = {})
21
- Doc.new(make(input, opts.merge(:xml => true)))
20
+ def Hpricot.XML(input = nil, opts = {}, &blk)
21
+ opts.merge! :xml => true
22
+ Doc.new(make(input, opts, &blk), opts)
22
23
  end
23
24
 
24
25
  # :stopdoc:
@@ -137,7 +138,7 @@ module Hpricot
137
138
  matched_elem = stack[i]
138
139
  stack[i][1] += token
139
140
  eles = stack.slice!((i+1)..-1)
140
- stack.last[2] += eles
141
+ stack.last[2] += eles if eles
141
142
  break
142
143
  end
143
144
  end
@@ -208,7 +209,7 @@ module Hpricot
208
209
  when :cdata
209
210
  Text.parse_cdata_section(structure[1])
210
211
  else
211
- raise Exception, "[bug] unknown structure: #{structure.inspect}"
212
+ raise "[bug] unknown structure: #{structure.inspect}"
212
213
  end
213
214
  end
214
215
 
data/lib/hpricot/tag.rb CHANGED
@@ -3,8 +3,9 @@ module Hpricot
3
3
 
4
4
  class Doc
5
5
  attr_accessor :children
6
- def initialize(children = [])
6
+ def initialize(children = [], options = {})
7
7
  @children = children ? children.each { |c| c.parent = self } : []
8
+ @options = options
8
9
  end
9
10
  def output(out, opts = {})
10
11
  @children.each do |n|
@@ -12,6 +13,9 @@ module Hpricot
12
13
  end
13
14
  out
14
15
  end
16
+ def make(input = nil, &blk)
17
+ Hpricot.make(input, @options, &blk)
18
+ end
15
19
  def altered!; end
16
20
  end
17
21
 
@@ -100,7 +104,7 @@ module Hpricot
100
104
  if @raw_attributes
101
105
  @raw_attributes.map do |aname, aval|
102
106
  " #{aname}" +
103
- (aval ? "=\"#{aval}\"" : "")
107
+ (aval ? "=#{html_quote aval}" : "")
104
108
  end.join
105
109
  end
106
110
  end