hpricot 0.6 → 0.6.161

Sign up to get free protection for your applications and to get access to all the features.
@@ -6,7 +6,9 @@ import org.jruby.RubyClass;
6
6
  import org.jruby.RubyHash;
7
7
  import org.jruby.RubyModule;
8
8
  import org.jruby.RubyNumeric;
9
+ import org.jruby.RubyObjectAdapter;
9
10
  import org.jruby.RubyString;
11
+ import org.jruby.javasupport.JavaEmbedUtils;
10
12
  import org.jruby.runtime.Block;
11
13
  import org.jruby.runtime.CallbackFactory;
12
14
  import org.jruby.runtime.builtin.IRubyObject;
@@ -15,6 +17,7 @@ import org.jruby.runtime.load.BasicLibraryService;
15
17
 
16
18
  public class HpricotScanService implements BasicLibraryService {
17
19
  public static String NO_WAY_SERIOUSLY="*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!";
20
+ private static RubyObjectAdapter rubyApi;
18
21
 
19
22
  public void ELE(IRubyObject N) {
20
23
  if (tokend > tokstart || text) {
@@ -239,8 +242,8 @@ IRubyObject hpricot_scan(IRubyObject recv, IRubyObject port) {
239
242
  }
240
243
 
241
244
  buffer_size = BUFSIZE;
242
- if (recv.getInstanceVariable("@buffer_size") != null) {
243
- bufsize = recv.getInstanceVariable("@buffer_size");
245
+ if (rubyApi.getInstanceVariable(recv, "@buffer_size") != null) {
246
+ bufsize = rubyApi.getInstanceVariable(recv, "@buffer_size");
244
247
  if (!bufsize.isNil()) {
245
248
  buffer_size = RubyNumeric.fix2int(bufsize);
246
249
  }
@@ -359,5 +362,6 @@ public static void Init_hpricot_scan(Ruby runtime) {
359
362
  CallbackFactory fact = runtime.callbackFactory(HpricotScanService.class);
360
363
  mHpricot.getMetaClass().defineMethod("scan",fact.getSingletonMethod("__hpricot_scan",IRubyObject.class));
361
364
  mHpricot.defineClassUnder("ParseError",runtime.getClass("Exception"),runtime.getClass("Exception").getAllocator());
365
+ rubyApi = JavaEmbedUtils.newObjectAdapter();
362
366
  }
363
367
  }
@@ -8,6 +8,12 @@
8
8
  */
9
9
  #include <ruby.h>
10
10
 
11
+ #ifndef RARRAY_LEN
12
+ #define RARRAY_LEN(arr) RARRAY(arr)->len
13
+ #define RSTRING_LEN(str) RSTRING(str)->len
14
+ #define RSTRING_PTR(str) RSTRING(str)->ptr
15
+ #endif
16
+
11
17
  #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
12
18
 
13
19
  static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
@@ -169,7 +175,7 @@ VALUE hpricot_scan(VALUE self, VALUE port)
169
175
  /* We've used up the entire buffer storing an already-parsed token
170
176
  * prefix that must be preserved. Likely caused by super-long attributes.
171
177
  * See ticket #13. */
172
- rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING(tag)->ptr, curline);
178
+ rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING_PTR(tag), curline);
173
179
  }
174
180
 
175
181
  if ( rb_respond_to( port, s_read ) )
@@ -182,8 +188,8 @@ VALUE hpricot_scan(VALUE self, VALUE port)
182
188
  }
183
189
 
184
190
  StringValue(str);
185
- memcpy( p, RSTRING(str)->ptr, RSTRING(str)->len );
186
- len = RSTRING(str)->len;
191
+ memcpy( p, RSTRING_PTR(str), RSTRING_LEN(str) );
192
+ len = RSTRING_LEN(str);
187
193
  nread += len;
188
194
 
189
195
  /* If this is the last buffer, tack on an EOF. */
@@ -199,7 +205,7 @@ VALUE hpricot_scan(VALUE self, VALUE port)
199
205
  free(buf);
200
206
  if ( !NIL_P(tag) )
201
207
  {
202
- rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING(tag)->ptr, curline);
208
+ rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING_PTR(tag), curline);
203
209
  }
204
210
  else
205
211
  {
@@ -1,8 +1,23 @@
1
1
  require 'hpricot/tags'
2
- require 'hpricot/xchar'
2
+ require 'fast_xs'
3
3
  require 'hpricot/blankslate'
4
4
 
5
5
  module Hpricot
6
+ PREDEFINED = {
7
+ 34 => '&quot;', # quotation mark
8
+ 38 => '&amp;', # ampersand
9
+ 60 => '&lt;', # left angle bracket
10
+ 62 => '&gt;' # right angle bracket
11
+ }
12
+ PREDEFINED_U = PREDEFINED.inject({}) { |hsh, (k, v)| hsh[v] = k; hsh }
13
+
14
+ # XML unescape
15
+ def self.uxs(str)
16
+ str.to_s.
17
+ gsub(/\&\w+;/) { |x| (PREDEFINED_U[x] || ??).chr }.
18
+ gsub(/\&\#(\d+);/) { [$1.to_i].pack("U*") }
19
+ end
20
+
6
21
  def self.build(ele = Doc.new, assigns = {}, &blk)
7
22
  ele.extend Builder
8
23
  assigns.each do |k, v|
@@ -32,7 +47,7 @@ module Hpricot
32
47
 
33
48
  # Write a +string+ to the HTML stream, making sure to escape it.
34
49
  def text!(string)
35
- @children << Text.new(Hpricot.xs(string))
50
+ @children << Text.new(string.fast_xs)
36
51
  end
37
52
 
38
53
  # Write a +string+ to the HTML stream without escaping it.
@@ -75,16 +90,16 @@ module Hpricot
75
90
  # turn arguments into children or attributes
76
91
  childs = []
77
92
  attrs = args.grep(Hash)
78
- childs.concat((args - attrs).map do |x|
93
+ childs.concat((args - attrs).flatten.map do |x|
79
94
  if x.respond_to? :to_html
80
95
  Hpricot.make(x.to_html)
81
96
  elsif x
82
- Text.new(Hpricot.xs(x))
97
+ Text.new(x.fast_xs)
83
98
  end
84
99
  end.flatten)
85
100
  attrs = attrs.inject({}) do |hsh, ath|
86
101
  ath.each do |k, v|
87
- hsh[k] = Hpricot.xs(v.to_s) if v
102
+ hsh[k] = v.to_s.fast_xs if v
88
103
  end
89
104
  hsh
90
105
  end
@@ -130,25 +130,25 @@ module Hpricot
130
130
  # Add to the end of the contents inside each element in this list.
131
131
  # Pass in an HTML +str+, which is turned into Hpricot elements.
132
132
  def append(str = nil, &blk)
133
- each { |x| x.html(x.children + Hpricot.make(str, &blk)) }
133
+ each { |x| x.html(x.children + x.make(str, &blk)) }
134
134
  end
135
135
 
136
136
  # Add to the start of the contents inside each element in this list.
137
137
  # Pass in an HTML +str+, which is turned into Hpricot elements.
138
138
  def prepend(str = nil, &blk)
139
- each { |x| x.html(Hpricot.make(str, &blk) + x.children) }
139
+ each { |x| x.html(x.make(str, &blk) + x.children) }
140
140
  end
141
141
 
142
142
  # Add some HTML just previous to each element in this list.
143
143
  # Pass in an HTML +str+, which is turned into Hpricot elements.
144
144
  def before(str = nil, &blk)
145
- each { |x| x.parent.insert_before Hpricot.make(str, &blk), x }
145
+ each { |x| x.parent.insert_before x.make(str, &blk), x }
146
146
  end
147
147
 
148
148
  # Just after each element in this list, add some HTML.
149
149
  # Pass in an HTML +str+, which is turned into Hpricot elements.
150
150
  def after(str = nil, &blk)
151
- each { |x| x.parent.insert_after Hpricot.make(str, &blk), x }
151
+ each { |x| x.parent.insert_after x.make(str, &blk), x }
152
152
  end
153
153
 
154
154
  # Wraps each element in the list inside the element created by HTML +str+.
@@ -161,7 +161,7 @@ module Hpricot
161
161
  # This code wraps every link on the page inside a +div.link+ and a +div.link_inner+ nest.
162
162
  def wrap(str = nil, &blk)
163
163
  each do |x|
164
- wrap = Hpricot.make(str, &blk)
164
+ wrap = x.make(str, &blk)
165
165
  nest = wrap.detect { |w| w.respond_to? :children }
166
166
  unless nest
167
167
  raise Exception, "No wrapping element found."
@@ -261,7 +261,7 @@ module Hpricot
261
261
  self
262
262
  end
263
263
 
264
- ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^\]'"]*)'?"? *\]!i
264
+ ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i
265
265
  BRACK_RE = %r!(\[) *([^\]]*) *\]+!i
266
266
  FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)!
267
267
  CUST_RE = %r!(:)([a-zA-Z0-9\*_-]*)()!
@@ -12,13 +12,14 @@ module Hpricot
12
12
  # Hpricot.parse parses <i>input</i> and return a document tree.
13
13
  # represented by Hpricot::Doc.
14
14
  def Hpricot.parse(input = nil, opts = {}, &blk)
15
- Doc.new(make(input, opts, &blk))
15
+ Doc.new(make(input, opts, &blk), opts)
16
16
  end
17
17
 
18
18
  # Hpricot::XML parses <i>input</i>, disregarding all the HTML rules
19
19
  # and returning a document tree.
20
- def Hpricot.XML(input, opts = {})
21
- Doc.new(make(input, opts.merge(:xml => true)))
20
+ def Hpricot.XML(input = nil, opts = {}, &blk)
21
+ opts.merge! :xml => true
22
+ Doc.new(make(input, opts, &blk), opts)
22
23
  end
23
24
 
24
25
  # :stopdoc:
@@ -3,8 +3,9 @@ module Hpricot
3
3
 
4
4
  class Doc
5
5
  attr_accessor :children
6
- def initialize(children = [])
6
+ def initialize(children = [], options = {})
7
7
  @children = children ? children.each { |c| c.parent = self } : []
8
+ @options = options
8
9
  end
9
10
  def output(out, opts = {})
10
11
  @children.each do |n|
@@ -12,6 +13,9 @@ module Hpricot
12
13
  end
13
14
  out
14
15
  end
16
+ def make(input = nil, &blk)
17
+ Hpricot.make(input, @options, &blk)
18
+ end
15
19
  def altered!; end
16
20
  end
17
21
 
@@ -100,7 +104,7 @@ module Hpricot
100
104
  if @raw_attributes
101
105
  @raw_attributes.map do |aname, aval|
102
106
  " #{aname}" +
103
- (aval ? "=\"#{aval}\"" : "")
107
+ (aval ? "=#{html_quote aval}" : "")
104
108
  end.join
105
109
  end
106
110
  end
@@ -20,6 +20,16 @@ module Hpricot
20
20
  # Is this object a stranded end tag?
21
21
  def bogusetag?() BogusETag::Trav === self end
22
22
 
23
+ # Parses an HTML string, making an HTML fragment based on
24
+ # the options used to create the container document.
25
+ def make(input = nil, &blk)
26
+ if parent and parent.respond_to? :make
27
+ parent.make(input, &blk)
28
+ else
29
+ Hpricot.make(input, &blk)
30
+ end
31
+ end
32
+
23
33
  # Builds an HTML string from this node and its contents.
24
34
  # If you need to write to a stream, try calling <tt>output(io)</tt>
25
35
  # as a method on this object.
@@ -109,12 +119,12 @@ module Hpricot
109
119
 
110
120
  # Adds elements immediately after this element, contained in the +html+ string.
111
121
  def after(html = nil, &blk)
112
- parent.insert_after(Hpricot.make(html, &blk), self)
122
+ parent.insert_after(make(html, &blk), self)
113
123
  end
114
124
 
115
125
  # Adds elements immediately before this element, contained in the +html+ string.
116
126
  def before(html = nil, &blk)
117
- parent.insert_before(Hpricot.make(html, &blk), self)
127
+ parent.insert_before(make(html, &blk), self)
118
128
  end
119
129
 
120
130
 
@@ -122,7 +132,7 @@ module Hpricot
122
132
  # in the +html+ string.
123
133
  def swap(html = nil, &blk)
124
134
  parent.altered!
125
- parent.replace_child(self, Hpricot.make(html, &blk))
135
+ parent.replace_child(self, make(html, &blk))
126
136
  end
127
137
 
128
138
  def get_subnode(*indexes)
@@ -158,7 +168,7 @@ module Hpricot
158
168
  when Array
159
169
  self.children = inner
160
170
  else
161
- self.children = Hpricot.make(inner, &blk)
171
+ self.children = make(inner, &blk)
162
172
  end
163
173
  reparent self.children
164
174
  else
@@ -513,8 +523,9 @@ module Hpricot
513
523
 
514
524
  def get_elements_by_tag_name(*a)
515
525
  list = Elements[]
526
+ a.delete("*")
516
527
  traverse_element(*a.map { |tag| [tag, "{http://www.w3.org/1999/xhtml}#{tag}"] }.flatten) do |e|
517
- list << e
528
+ list << e if e.elem?
518
529
  end
519
530
  list
520
531
  end
@@ -806,7 +817,7 @@ module Hpricot
806
817
  def set_attribute(name, val)
807
818
  altered!
808
819
  self.raw_attributes ||= {}
809
- self.raw_attributes[name.to_s] = Hpricot.xs(val)
820
+ self.raw_attributes[name.to_s] = val.fast_xs
810
821
  end
811
822
  alias_method :[]=, :set_attribute
812
823
  def remove_attribute(name)
@@ -58,6 +58,18 @@ class TestAlter < Test::Unit::TestCase
58
58
  assert_changed(@basic, "p[@class]", all_c2) { |p| p['class'].nil? }
59
59
  end
60
60
 
61
+ def test_xml_casing
62
+ doc = Hpricot.XML("<root><wildCat>text</wildCat></root>")
63
+ (doc/:root/:wildCat).after("<beanPole>gravity</beanPole>")
64
+ assert_equal doc.to_s, "<root><wildCat>text</wildCat><beanPole>gravity</beanPole></root>"
65
+
66
+ frag = Hpricot.XML do
67
+ b { i "A bit of HTML" }
68
+ end
69
+ (frag/:b).after("<beanPole>gravity</beanPole>")
70
+ assert_equal frag.to_s, "<b><i>A bit of HTML</i></b><beanPole>gravity</beanPole>"
71
+ end
72
+
61
73
  def assert_changed original, selector, set, &block
62
74
  assert set.all?(&block)
63
75
  assert Hpricot(original.to_html).search(selector).all?(&block)
@@ -21,4 +21,17 @@ class TestBuilder < Test::Unit::TestCase
21
21
  assert_equal "<b>&#8364;&#8226;</b>", doc.to_html
22
22
  assert_equal "\342\202\254\342\200\242", doc.at("text()").to_s
23
23
  end
24
+
25
+ def test_escaping_attrs
26
+ text = "<span style='font-family:\"MS Mincho\"'>Some text</span>"
27
+ assert_equal "<span style=\"font-family:\\\"MS Mincho\\\"\">Some text</span>",
28
+ Hpricot(text).to_html
29
+ end
30
+
31
+ def test_korean_utf8_entities
32
+ # a = '한글'
33
+ a = "\xed\x95\x9c\xea\xb8\x80"
34
+ doc = Hpricot() { b a }
35
+ assert_equal "<b>&#54620;&#44544;</b>", doc.to_html
36
+ end
24
37
  end
@@ -47,6 +47,13 @@ class TestParser < Test::Unit::TestCase
47
47
  assert_equal 'link1', @basic.get_elements_by_tag_name('a')[0].get_attribute('id')
48
48
  assert_equal 'link1', @basic.get_elements_by_tag_name('body')[0].get_element_by_id('link1').get_attribute('id')
49
49
  end
50
+
51
+ def test_get_elements_by_tag_name_star
52
+ simple = Hpricot.parse("<div><p id='first'>First</p><p id='second'>Second</p></div>")
53
+ assert_equal 3, simple.get_elements_by_tag_name("*").size
54
+ assert_equal 1, simple.get_elements_by_tag_name("div").size
55
+ assert_equal 2, simple.get_elements_by_tag_name("p").size
56
+ end
50
57
 
51
58
  def test_output_basic
52
59
  @basic = Hpricot.parse(TestFiles::BASIC)
@@ -13,4 +13,13 @@ class TestParser < Test::Unit::TestCase
13
13
  assert_equal ele, @basic.at(ele.xpath)
14
14
  end
15
15
  end
16
+ def test_attr_brackets
17
+ doc = Hpricot('<input name="vendor[porkpies]"/>')
18
+ assert_equal 1, (doc/'input[@name^="vendor[porkpies]"]').length
19
+ assert_equal 1, (doc/'input[@name="vendor[porkpies]"]').length
20
+ assert_equal 0, (doc/'input[@name$="]]]]]"]').length
21
+
22
+ doc = Hpricot('<input name="vendor[porkpies][meaty]"/>')
23
+ assert_equal 1, (doc/'input[@name^="vendor[porkpies][meaty]"]').length
24
+ end
16
25
  end
metadata CHANGED
@@ -1,81 +1,80 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.0
3
- specification_version: 1
4
2
  name: hpricot
5
3
  version: !ruby/object:Gem::Version
6
- version: "0.6"
7
- date: 2007-06-15 00:00:00 -07:00
8
- summary: a swift, liberal HTML parser with a fantastic library
9
- require_paths:
10
- - lib/i686-linux
11
- - lib
12
- email: why@ruby-lang.org
13
- homepage: http://code.whytheluckystiff.net/hpricot/
14
- rubyforge_project:
15
- description: a swift, liberal HTML parser with a fantastic library
16
- autorequire:
17
- default_executable:
18
- bindir: bin
19
- has_rdoc: true
20
- required_ruby_version: !ruby/object:Gem::Version::Requirement
21
- requirements:
22
- - - ">"
23
- - !ruby/object:Gem::Version
24
- version: 0.0.0
25
- version:
4
+ version: 0.6.161
26
5
  platform: ruby
27
- signing_key:
28
- cert_chain:
29
- post_install_message:
30
6
  authors:
31
7
  - why the lucky stiff
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-03-19 00:00:00 -05:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: a swift, liberal HTML parser with a fantastic library
17
+ email: why@ruby-lang.org
18
+ executables: []
19
+
20
+ extensions:
21
+ - ext/fast_xs/extconf.rb
22
+ - ext/hpricot_scan/extconf.rb
23
+ extra_rdoc_files:
24
+ - README
25
+ - CHANGELOG
26
+ - COPYING
32
27
  files:
33
28
  - CHANGELOG
34
29
  - COPYING
35
30
  - README
36
31
  - Rakefile
37
32
  - test/files
38
- - test/test_preserved.rb
39
- - test/test_paths.rb
40
- - test/load_files.rb
41
- - test/test_xml.rb
42
- - test/test_parser.rb
43
- - test/test_alter.rb
44
- - test/test_builder.rb
45
- - test/files/why.xml
46
- - test/files/boingboing.html
47
- - test/files/uswebgen.html
48
- - test/files/immob.html
49
- - test/files/week9.html
50
- - test/files/utf8.html
51
33
  - test/files/basic.xhtml
34
+ - test/files/boingboing.html
52
35
  - test/files/cy0.html
53
- - test/files/tenderlove.html
36
+ - test/files/immob.html
54
37
  - test/files/pace_application.html
38
+ - test/files/tenderlove.html
39
+ - test/files/uswebgen.html
40
+ - test/files/utf8.html
41
+ - test/files/week9.html
42
+ - test/files/why.xml
43
+ - test/load_files.rb
44
+ - test/test_alter.rb
45
+ - test/test_builder.rb
46
+ - test/test_parser.rb
47
+ - test/test_paths.rb
48
+ - test/test_preserved.rb
49
+ - test/test_xml.rb
55
50
  - lib/hpricot
56
- - lib/hpricot.rb
57
- - lib/i686-linux
51
+ - lib/hpricot/blankslate.rb
58
52
  - lib/hpricot/builder.rb
53
+ - lib/hpricot/elements.rb
59
54
  - lib/hpricot/htmlinfo.rb
60
- - lib/hpricot/xchar.rb
61
55
  - lib/hpricot/inspect.rb
62
56
  - lib/hpricot/modules.rb
63
57
  - lib/hpricot/parse.rb
64
58
  - lib/hpricot/tag.rb
65
- - lib/hpricot/traverse.rb
66
- - lib/hpricot/elements.rb
67
59
  - lib/hpricot/tags.rb
68
- - lib/hpricot/blankslate.rb
60
+ - lib/hpricot/traverse.rb
61
+ - lib/hpricot/xchar.rb
62
+ - lib/hpricot.rb
69
63
  - extras/mingw-rbconfig.rb
70
64
  - ext/hpricot_scan/hpricot_scan.h
65
+ - ext/fast_xs/FastXsService.java
66
+ - ext/hpricot_scan/hpricot_scan.java.java
71
67
  - ext/hpricot_scan/HpricotScanService.java
68
+ - ext/fast_xs/fast_xs.c
72
69
  - ext/hpricot_scan/hpricot_scan.c
70
+ - ext/fast_xs/extconf.rb
73
71
  - ext/hpricot_scan/extconf.rb
74
72
  - ext/hpricot_scan/hpricot_common.rl
75
- - ext/hpricot_scan/hpricot_scan.rl
76
73
  - ext/hpricot_scan/hpricot_scan.java.rl
77
- test_files: []
78
-
74
+ - ext/hpricot_scan/hpricot_scan.rl
75
+ has_rdoc: true
76
+ homepage: http://code.whytheluckystiff.net/hpricot/
77
+ post_install_message:
79
78
  rdoc_options:
80
79
  - --quiet
81
80
  - --title
@@ -83,15 +82,27 @@ rdoc_options:
83
82
  - --main
84
83
  - README
85
84
  - --inline-source
86
- extra_rdoc_files:
87
- - README
88
- - CHANGELOG
89
- - COPYING
90
- executables: []
91
-
92
- extensions:
93
- - ext/hpricot_scan/extconf.rb
85
+ require_paths:
86
+ - lib/universal-darwin9.0
87
+ - lib
88
+ required_ruby_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: "0"
93
+ version:
94
+ required_rubygems_version: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: "0"
99
+ version:
94
100
  requirements: []
95
101
 
96
- dependencies: []
102
+ rubyforge_project:
103
+ rubygems_version: 1.0.1
104
+ signing_key:
105
+ specification_version: 2
106
+ summary: a swift, liberal HTML parser with a fantastic library
107
+ test_files: []
97
108