hpricot 0.6-mswin32 → 0.6.164-mswin32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,9 @@ import org.jruby.RubyClass;
6
6
  import org.jruby.RubyHash;
7
7
  import org.jruby.RubyModule;
8
8
  import org.jruby.RubyNumeric;
9
+ import org.jruby.RubyObjectAdapter;
9
10
  import org.jruby.RubyString;
11
+ import org.jruby.javasupport.JavaEmbedUtils;
10
12
  import org.jruby.runtime.Block;
11
13
  import org.jruby.runtime.CallbackFactory;
12
14
  import org.jruby.runtime.builtin.IRubyObject;
@@ -15,13 +17,14 @@ import org.jruby.runtime.load.BasicLibraryService;
15
17
 
16
18
  public class HpricotScanService implements BasicLibraryService {
17
19
  public static String NO_WAY_SERIOUSLY="*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!";
20
+ private static RubyObjectAdapter rubyApi;
18
21
 
19
22
  public void ELE(IRubyObject N) {
20
- if (tokend > tokstart || text) {
23
+ if (te > ts || text) {
21
24
  IRubyObject raw_string = runtime.getNil();
22
25
  ele_open = false; text = false;
23
- if (tokstart != -1 && N != cdata && N != sym_text && N != procins && N != comment) {
24
- raw_string = runtime.newString(new String(buf,tokstart,tokend-tokstart));
26
+ if (ts != -1 && N != cdata && N != sym_text && N != procins && N != comment) {
27
+ raw_string = runtime.newString(new String(buf,ts,te-ts));
25
28
  }
26
29
  rb_yield_tokens(N, tag[0], attr, raw_string, taint);
27
30
  }
@@ -75,13 +78,13 @@ public class HpricotScanService implements BasicLibraryService {
75
78
  } else if(N == aval) {
76
79
  mark = mark_aval;
77
80
  }
78
- if(mark > tokstart) {
81
+ if(mark > ts) {
79
82
  if(N == tag) {
80
- mark_tag -= tokstart;
83
+ mark_tag -= ts;
81
84
  } else if(N == akey) {
82
- mark_akey -= tokstart;
85
+ mark_akey -= ts;
83
86
  } else if(N == aval) {
84
- mark_aval -= tokstart;
87
+ mark_aval -= ts;
85
88
  }
86
89
  }
87
90
  }
@@ -91,7 +94,8 @@ public class HpricotScanService implements BasicLibraryService {
91
94
  if(attr.isNil()) {
92
95
  attr = RubyHash.newHash(runtime);
93
96
  }
94
- ((RubyHash)attr).aset(K,V);
97
+ ((RubyHash)attr).op_aset(runtime.getCurrentContext(),K,V);
98
+ // ((RubyHash)attr).aset(K,V);
95
99
  }
96
100
  }
97
101
 
@@ -111,8 +115,8 @@ public class HpricotScanService implements BasicLibraryService {
111
115
  if(!text) {
112
116
  if(ele_open) {
113
117
  ele_open = false;
114
- if(tokstart > -1) {
115
- mark_tag = tokstart;
118
+ if(ts > -1) {
119
+ mark_tag = ts;
116
120
  }
117
121
  } else {
118
122
  mark_tag = p;
@@ -180,7 +184,7 @@ public class HpricotScanService implements BasicLibraryService {
180
184
  ATTR(akey, aval);
181
185
  }
182
186
 
183
- include hpricot_common "ext/hpricot_scan/hpricot_common.rl";
187
+ include hpricot_common "hpricot_common.rl";
184
188
 
185
189
  }%%
186
190
 
@@ -206,7 +210,8 @@ private void rb_yield_tokens(IRubyObject sym, IRubyObject tag, IRubyObject attr,
206
210
 
207
211
  int cs, act, have = 0, nread = 0, curline = 1, p=-1;
208
212
  boolean text = false;
209
- int tokstart=-1, tokend;
213
+ int ts=-1, te;
214
+ int eof=-1;
210
215
  char[] buf;
211
216
  Ruby runtime;
212
217
  IRubyObject attr, bufsize;
@@ -239,8 +244,8 @@ IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
239
244
  }
240
245
 
241
246
  buffer_size = BUFSIZE;
242
- if (recv.getInstanceVariable("@buffer_size") != null) {
243
- bufsize = recv.getInstanceVariable("@buffer_size");
247
+ if (rubyApi.getInstanceVariable(recv, "@buffer_size") != null) {
248
+ bufsize = rubyApi.getInstanceVariable(recv, "@buffer_size");
244
249
  if (!bufsize.isNil()) {
245
250
  buffer_size = RubyNumeric.fix2int(bufsize);
246
251
  }
@@ -296,16 +301,16 @@ IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
296
301
 
297
302
  if ( done && ele_open ) {
298
303
  ele_open = false;
299
- if(tokstart > -1) {
300
- mark_tag = tokstart;
301
- tokstart = -1;
304
+ if(ts > -1) {
305
+ mark_tag = ts;
306
+ ts = -1;
302
307
  text = true;
303
308
  }
304
309
  }
305
310
 
306
- if(tokstart == -1) {
311
+ if(ts == -1) {
307
312
  have = 0;
308
- /* text nodes have no tokstart because each byte is parsed alone */
313
+ /* text nodes have no ts because each byte is parsed alone */
309
314
  if(mark_tag != -1 && text) {
310
315
  if (done) {
311
316
  if(mark_tag < p-1) {
@@ -318,13 +323,13 @@ IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
318
323
  }
319
324
  mark_tag = 0;
320
325
  } else {
321
- have = pe - tokstart;
322
- System.arraycopy(buf,tokstart,buf,0,have);
326
+ have = pe - ts;
327
+ System.arraycopy(buf,ts,buf,0,have);
323
328
  SLIDE(tag);
324
329
  SLIDE(akey);
325
330
  SLIDE(aval);
326
- tokend = (tokend - tokstart);
327
- tokstart = 0;
331
+ te = (te - ts);
332
+ ts = 0;
328
333
  }
329
334
  }
330
335
  return runtime.getNil();
@@ -355,9 +360,10 @@ public boolean basicLoad(final Ruby runtime) throws IOException {
355
360
 
356
361
  public static void Init_hpricot_scan(Ruby runtime) {
357
362
  RubyModule mHpricot = runtime.defineModule("Hpricot");
358
- mHpricot.getMetaClass().attr_accessor(new IRubyObject[]{runtime.newSymbol("buffer_size")});
363
+ mHpricot.getMetaClass().attr_accessor(runtime.getCurrentContext(),new IRubyObject[]{runtime.newSymbol("buffer_size")});
359
364
  CallbackFactory fact = runtime.callbackFactory(HpricotScanService.class);
360
365
  mHpricot.getMetaClass().defineMethod("scan",fact.getSingletonMethod("__hpricot_scan",IRubyObject.class));
361
- mHpricot.defineClassUnder("ParseError",runtime.getClass("Exception"),runtime.getClass("Exception").getAllocator());
366
+ mHpricot.defineClassUnder("ParseError",runtime.getClass("StandardError"),runtime.getClass("StandardError").getAllocator());
367
+ rubyApi = JavaEmbedUtils.newObjectAdapter();
362
368
  }
363
369
  }
@@ -8,6 +8,12 @@
8
8
  */
9
9
  #include <ruby.h>
10
10
 
11
+ #ifndef RARRAY_LEN
12
+ #define RARRAY_LEN(arr) RARRAY(arr)->len
13
+ #define RSTRING_LEN(str) RSTRING(str)->len
14
+ #define RSTRING_PTR(str) RSTRING(str)->ptr
15
+ #endif
16
+
11
17
  #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
12
18
 
13
19
  static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
@@ -16,11 +22,11 @@ static VALUE rb_eHpricotParseError;
16
22
  static ID s_read, s_to_str;
17
23
 
18
24
  #define ELE(N) \
19
- if (tokend > tokstart || text == 1) { \
25
+ if (te > ts || text == 1) { \
20
26
  VALUE raw_string = Qnil; \
21
27
  ele_open = 0; text = 0; \
22
- if (tokstart != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
23
- raw_string = rb_str_new(tokstart, tokend-tokstart); \
28
+ if (ts != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
29
+ raw_string = rb_str_new(ts, te-ts); \
24
30
  } \
25
31
  rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
26
32
  }
@@ -33,7 +39,7 @@ static ID s_read, s_to_str;
33
39
 
34
40
  #define CAT(N, E) if (NIL_P(N)) { SET(N, E); } else { rb_str_cat(N, mark_##N, E - mark_##N); }
35
41
 
36
- #define SLIDE(N) if ( mark_##N > tokstart ) mark_##N = buf + (mark_##N - tokstart);
42
+ #define SLIDE(N) if ( mark_##N > ts ) mark_##N = buf + (mark_##N - ts);
37
43
 
38
44
  #define ATTR(K, V) \
39
45
  if (!NIL_P(K)) { \
@@ -46,8 +52,8 @@ static ID s_read, s_to_str;
46
52
  { \
47
53
  if (ele_open == 1) { \
48
54
  ele_open = 0; \
49
- if (tokstart > 0) { \
50
- mark_tag = tokstart; \
55
+ if (ts > 0) { \
56
+ mark_tag = ts; \
51
57
  } \
52
58
  } else { \
53
59
  mark_tag = p; \
@@ -102,7 +108,7 @@ static ID s_read, s_to_str;
102
108
  ATTR(akey, aval);
103
109
  }
104
110
 
105
- include hpricot_common "ext/hpricot_scan/hpricot_common.rl";
111
+ include hpricot_common "hpricot_common.rl";
106
112
 
107
113
  }%%
108
114
 
@@ -129,7 +135,7 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
129
135
  VALUE hpricot_scan(VALUE self, VALUE port)
130
136
  {
131
137
  int cs, act, have = 0, nread = 0, curline = 1, text = 0;
132
- char *tokstart = 0, *tokend = 0, *buf = NULL;
138
+ char *ts = 0, *te = 0, *buf = NULL, *eof = NULL;
133
139
 
134
140
  VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
135
141
  char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
@@ -169,7 +175,7 @@ VALUE hpricot_scan(VALUE self, VALUE port)
169
175
  /* We've used up the entire buffer storing an already-parsed token
170
176
  * prefix that must be preserved. Likely caused by super-long attributes.
171
177
  * See ticket #13. */
172
- rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING(tag)->ptr, curline);
178
+ rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING_PTR(tag), curline);
173
179
  }
174
180
 
175
181
  if ( rb_respond_to( port, s_read ) )
@@ -182,8 +188,8 @@ VALUE hpricot_scan(VALUE self, VALUE port)
182
188
  }
183
189
 
184
190
  StringValue(str);
185
- memcpy( p, RSTRING(str)->ptr, RSTRING(str)->len );
186
- len = RSTRING(str)->len;
191
+ memcpy( p, RSTRING_PTR(str), RSTRING_LEN(str) );
192
+ len = RSTRING_LEN(str);
187
193
  nread += len;
188
194
 
189
195
  /* If this is the last buffer, tack on an EOF. */
@@ -199,7 +205,7 @@ VALUE hpricot_scan(VALUE self, VALUE port)
199
205
  free(buf);
200
206
  if ( !NIL_P(tag) )
201
207
  {
202
- rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING(tag)->ptr, curline);
208
+ rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
203
209
  }
204
210
  else
205
211
  {
@@ -210,17 +216,17 @@ VALUE hpricot_scan(VALUE self, VALUE port)
210
216
  if ( done && ele_open )
211
217
  {
212
218
  ele_open = 0;
213
- if (tokstart > 0) {
214
- mark_tag = tokstart;
215
- tokstart = 0;
219
+ if (ts > 0) {
220
+ mark_tag = ts;
221
+ ts = 0;
216
222
  text = 1;
217
223
  }
218
224
  }
219
225
 
220
- if ( tokstart == 0 )
226
+ if ( ts == 0 )
221
227
  {
222
228
  have = 0;
223
- /* text nodes have no tokstart because each byte is parsed alone */
229
+ /* text nodes have no ts because each byte is parsed alone */
224
230
  if ( mark_tag != NULL && text == 1 )
225
231
  {
226
232
  if (done)
@@ -240,13 +246,13 @@ VALUE hpricot_scan(VALUE self, VALUE port)
240
246
  }
241
247
  else
242
248
  {
243
- have = pe - tokstart;
244
- memmove( buf, tokstart, have );
249
+ have = pe - ts;
250
+ memmove( buf, ts, have );
245
251
  SLIDE(tag);
246
252
  SLIDE(akey);
247
253
  SLIDE(aval);
248
- tokend = buf + (tokend - tokstart);
249
- tokstart = buf;
254
+ te = buf + (te - ts);
255
+ ts = buf;
250
256
  }
251
257
  }
252
258
  free(buf);
@@ -257,7 +263,7 @@ void Init_hpricot_scan()
257
263
  VALUE mHpricot = rb_define_module("Hpricot");
258
264
  rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
259
265
  rb_define_singleton_method(mHpricot, "scan", hpricot_scan, 1);
260
- rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eException);
266
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eStandardError);
261
267
 
262
268
  s_read = rb_intern("read");
263
269
  s_to_str = rb_intern("to_str");
@@ -0,0 +1,5 @@
1
+ require './hpricot_scan.so'
2
+
3
+ doc = "<doc><person><test>YESSS</test></person><train>SET</train></doc>"
4
+ Hpricot.scan(doc) { |x| p x }
5
+ p Hpricot.lemon(doc)
data/lib/fast_xs.so ADDED
Binary file
@@ -1,8 +1,23 @@
1
1
  require 'hpricot/tags'
2
- require 'hpricot/xchar'
2
+ require 'fast_xs'
3
3
  require 'hpricot/blankslate'
4
4
 
5
5
  module Hpricot
6
+ PREDEFINED = {
7
+ 34 => '&quot;', # quotation mark
8
+ 38 => '&amp;', # ampersand
9
+ 60 => '&lt;', # left angle bracket
10
+ 62 => '&gt;' # right angle bracket
11
+ }
12
+ PREDEFINED_U = PREDEFINED.inject({}) { |hsh, (k, v)| hsh[v] = k; hsh }
13
+
14
+ # XML unescape
15
+ def self.uxs(str)
16
+ str.to_s.
17
+ gsub(/\&\w+;/) { |x| (PREDEFINED_U[x] || ??).chr }.
18
+ gsub(/\&\#(\d+);/) { [$1.to_i].pack("U*") }
19
+ end
20
+
6
21
  def self.build(ele = Doc.new, assigns = {}, &blk)
7
22
  ele.extend Builder
8
23
  assigns.each do |k, v|
@@ -32,7 +47,7 @@ module Hpricot
32
47
 
33
48
  # Write a +string+ to the HTML stream, making sure to escape it.
34
49
  def text!(string)
35
- @children << Text.new(Hpricot.xs(string))
50
+ @children << Text.new(string.fast_xs)
36
51
  end
37
52
 
38
53
  # Write a +string+ to the HTML stream without escaping it.
@@ -75,16 +90,16 @@ module Hpricot
75
90
  # turn arguments into children or attributes
76
91
  childs = []
77
92
  attrs = args.grep(Hash)
78
- childs.concat((args - attrs).map do |x|
93
+ childs.concat((args - attrs).flatten.map do |x|
79
94
  if x.respond_to? :to_html
80
95
  Hpricot.make(x.to_html)
81
96
  elsif x
82
- Text.new(Hpricot.xs(x))
97
+ Text.new(x.fast_xs)
83
98
  end
84
99
  end.flatten)
85
100
  attrs = attrs.inject({}) do |hsh, ath|
86
101
  ath.each do |k, v|
87
- hsh[k] = Hpricot.xs(v.to_s) if v
102
+ hsh[k] = v.to_s.fast_xs if v
88
103
  end
89
104
  hsh
90
105
  end
@@ -130,25 +130,25 @@ module Hpricot
130
130
  # Add to the end of the contents inside each element in this list.
131
131
  # Pass in an HTML +str+, which is turned into Hpricot elements.
132
132
  def append(str = nil, &blk)
133
- each { |x| x.html(x.children + Hpricot.make(str, &blk)) }
133
+ each { |x| x.html(x.children + x.make(str, &blk)) }
134
134
  end
135
135
 
136
136
  # Add to the start of the contents inside each element in this list.
137
137
  # Pass in an HTML +str+, which is turned into Hpricot elements.
138
138
  def prepend(str = nil, &blk)
139
- each { |x| x.html(Hpricot.make(str, &blk) + x.children) }
139
+ each { |x| x.html(x.make(str, &blk) + x.children) }
140
140
  end
141
141
 
142
142
  # Add some HTML just previous to each element in this list.
143
143
  # Pass in an HTML +str+, which is turned into Hpricot elements.
144
144
  def before(str = nil, &blk)
145
- each { |x| x.parent.insert_before Hpricot.make(str, &blk), x }
145
+ each { |x| x.parent.insert_before x.make(str, &blk), x }
146
146
  end
147
147
 
148
148
  # Just after each element in this list, add some HTML.
149
149
  # Pass in an HTML +str+, which is turned into Hpricot elements.
150
150
  def after(str = nil, &blk)
151
- each { |x| x.parent.insert_after Hpricot.make(str, &blk), x }
151
+ each { |x| x.parent.insert_after x.make(str, &blk), x }
152
152
  end
153
153
 
154
154
  # Wraps each element in the list inside the element created by HTML +str+.
@@ -161,10 +161,10 @@ module Hpricot
161
161
  # This code wraps every link on the page inside a +div.link+ and a +div.link_inner+ nest.
162
162
  def wrap(str = nil, &blk)
163
163
  each do |x|
164
- wrap = Hpricot.make(str, &blk)
164
+ wrap = x.make(str, &blk)
165
165
  nest = wrap.detect { |w| w.respond_to? :children }
166
166
  unless nest
167
- raise Exception, "No wrapping element found."
167
+ raise "No wrapping element found."
168
168
  end
169
169
  x.parent.replace_child(x, wrap)
170
170
  nest = nest.children.first until nest.empty?
@@ -261,7 +261,7 @@ module Hpricot
261
261
  self
262
262
  end
263
263
 
264
- ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^\]'"]*)'?"? *\]!i
264
+ ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i
265
265
  BRACK_RE = %r!(\[) *([^\]]*) *\]+!i
266
266
  FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)!
267
267
  CUST_RE = %r!(:)([a-zA-Z0-9\*_-]*)()!
data/lib/hpricot/parse.rb CHANGED
@@ -12,13 +12,14 @@ module Hpricot
12
12
  # Hpricot.parse parses <i>input</i> and return a document tree.
13
13
  # represented by Hpricot::Doc.
14
14
  def Hpricot.parse(input = nil, opts = {}, &blk)
15
- Doc.new(make(input, opts, &blk))
15
+ Doc.new(make(input, opts, &blk), opts)
16
16
  end
17
17
 
18
18
  # Hpricot::XML parses <i>input</i>, disregarding all the HTML rules
19
19
  # and returning a document tree.
20
- def Hpricot.XML(input, opts = {})
21
- Doc.new(make(input, opts.merge(:xml => true)))
20
+ def Hpricot.XML(input = nil, opts = {}, &blk)
21
+ opts.merge! :xml => true
22
+ Doc.new(make(input, opts, &blk), opts)
22
23
  end
23
24
 
24
25
  # :stopdoc:
@@ -137,7 +138,7 @@ module Hpricot
137
138
  matched_elem = stack[i]
138
139
  stack[i][1] += token
139
140
  eles = stack.slice!((i+1)..-1)
140
- stack.last[2] += eles
141
+ stack.last[2] += eles if eles
141
142
  break
142
143
  end
143
144
  end
@@ -208,7 +209,7 @@ module Hpricot
208
209
  when :cdata
209
210
  Text.parse_cdata_section(structure[1])
210
211
  else
211
- raise Exception, "[bug] unknown structure: #{structure.inspect}"
212
+ raise "[bug] unknown structure: #{structure.inspect}"
212
213
  end
213
214
  end
214
215
 
data/lib/hpricot/tag.rb CHANGED
@@ -3,8 +3,9 @@ module Hpricot
3
3
 
4
4
  class Doc
5
5
  attr_accessor :children
6
- def initialize(children = [])
6
+ def initialize(children = [], options = {})
7
7
  @children = children ? children.each { |c| c.parent = self } : []
8
+ @options = options
8
9
  end
9
10
  def output(out, opts = {})
10
11
  @children.each do |n|
@@ -12,6 +13,9 @@ module Hpricot
12
13
  end
13
14
  out
14
15
  end
16
+ def make(input = nil, &blk)
17
+ Hpricot.make(input, @options, &blk)
18
+ end
15
19
  def altered!; end
16
20
  end
17
21
 
@@ -100,7 +104,7 @@ module Hpricot
100
104
  if @raw_attributes
101
105
  @raw_attributes.map do |aname, aval|
102
106
  " #{aname}" +
103
- (aval ? "=\"#{aval}\"" : "")
107
+ (aval ? "=#{html_quote aval}" : "")
104
108
  end.join
105
109
  end
106
110
  end