hpricot 0.6.164 → 0.7

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the corresponding public registry.
@@ -2,66 +2,55 @@ module Hpricot
2
2
  # :stopdoc:
3
3
 
4
4
  class Doc
5
- attr_accessor :children
6
- def initialize(children = [], options = {})
7
- @children = children ? children.each { |c| c.parent = self } : []
8
- @options = options
9
- end
10
5
  def output(out, opts = {})
11
- @children.each do |n|
6
+ children.each do |n|
12
7
  n.output(out, opts)
13
- end
8
+ end if children
14
9
  out
15
10
  end
16
11
  def make(input = nil, &blk)
17
- Hpricot.make(input, @options, &blk)
12
+ Hpricot.make(input, @options, &blk).children
18
13
  end
19
14
  def altered!; end
15
+ def inspect_tree
16
+ children.map { |x| x.inspect_tree }.join if children
17
+ end
20
18
  end
21
19
 
22
20
  class BaseEle
23
- attr_accessor :raw_string, :parent
24
21
  def html_quote(str)
25
22
  "\"" + str.gsub('"', '\\"') + "\""
26
23
  end
27
24
  def if_output(opts)
28
- if opts[:preserve] and not @raw_string.nil?
29
- @raw_string
25
+ if opts[:preserve] and not raw_string.nil?
26
+ raw_string
30
27
  else
31
28
  yield opts
32
29
  end
33
30
  end
34
31
  def pathname; self.name end
35
32
  def altered!
36
- @raw_string = nil
37
- end
38
- def self.alterable(*fields)
39
- attr_accessor(*fields)
40
- fields.each do |f|
41
- define_method("#{f}=") do |v|
42
- altered!
43
- instance_variable_set("@#{f}", v)
44
- end
45
- end
33
+ clear_raw
34
+ end
35
+ def inspect_tree(depth = 0)
36
+ %{#{" " * depth}} + self.class.name.split(/::/).last.downcase + "\n"
46
37
  end
47
38
  end
48
39
 
49
40
  class Elem
50
- attr_accessor :stag, :etag, :children
51
- def initialize(stag, children=nil, etag=nil)
52
- @stag, @etag = stag, etag
53
- @children = children ? children.each { |c| c.parent = self } : []
54
- end
55
- def empty?; @children.empty? end
56
- [:name, :raw_attributes, :parent, :altered!].each do |m|
57
- [m, "#{m}="].each { |m2| define_method(m2) { |*a| [@etag, @stag].inject { |_,t| t.send(m2, *a) if t and t.respond_to?(m2) } } }
41
+ def initialize tag, attrs = nil, children = nil, etag = nil
42
+ self.name, self.raw_attributes, self.children, self.etag =
43
+ tag, attrs, children, etag
58
44
  end
45
+ def empty?; children.nil? or children.empty? end
59
46
  def attributes
60
47
  if raw_attributes
61
48
  raw_attributes.inject({}) do |hsh, (k, v)|
62
49
  hsh[k] = Hpricot.uxs(v)
63
50
  hsh
64
51
  end
52
+ else
53
+ {}
65
54
  end
66
55
  end
67
56
  def to_plain_text
@@ -79,151 +68,128 @@ module Hpricot
79
68
  end
80
69
  def pathname; self.name end
81
70
  def output(out, opts = {})
82
- if empty? and ElementContent[@stag.name] == :EMPTY
83
- @stag.output(out, opts.merge(:style => :empty))
84
- else
85
- @stag.output(out, opts)
86
- @children.each { |n| n.output(out, opts) }
87
- if @etag
88
- @etag.output(out, opts)
89
- elsif !opts[:preserve]
90
- ETag.new(@stag.name).output(out, opts)
71
+ out <<
72
+ if_output(opts) do
73
+ "<#{name}#{attributes_as_html}" +
74
+ ((empty? and not etag) ? " /" : "") +
75
+ ">"
91
76
  end
77
+ if children
78
+ children.each { |n| n.output(out, opts) }
79
+ end
80
+ if etag
81
+ etag.output(out, opts)
82
+ elsif !opts[:preserve] && !empty?
83
+ out <<
84
+ if_output(opts) do
85
+ "</#{name}>"
86
+ end
92
87
  end
93
88
  out
94
89
  end
95
- end
96
-
97
- class STag < BaseEle
98
- def initialize(name, attributes=nil)
99
- @name = name.to_s
100
- @raw_attributes = attributes || {}
101
- end
102
- alterable :name, :raw_attributes
103
90
  def attributes_as_html
104
- if @raw_attributes
105
- @raw_attributes.map do |aname, aval|
91
+ if raw_attributes
92
+ raw_attributes.map do |aname, aval|
106
93
  " #{aname}" +
107
94
  (aval ? "=#{html_quote aval}" : "")
108
95
  end.join
109
96
  end
110
97
  end
111
- def output(out, opts = {})
112
- out <<
113
- if_output(opts) do
114
- "<#{@name}#{attributes_as_html}" +
115
- (opts[:style] == :empty ? " /" : "") +
116
- ">"
117
- end
98
+ def inspect_tree(depth = 0)
99
+ %{#{" " * depth}} + name + "\n" +
100
+ (children ? children.map { |x| x.inspect_tree(depth + 1) }.join : "")
118
101
  end
119
102
  end
120
103
 
121
- class ETag < BaseEle
122
- def initialize(qualified_name)
123
- @name = qualified_name.to_s
124
- end
125
- alterable :name
104
+ class ETag
105
+ def initialize name; self.name = name end
126
106
  def output(out, opts = {})
127
107
  out <<
128
108
  if_output(opts) do
129
- "</#{@name}>"
109
+ "</#{name}>"
130
110
  end
131
111
  end
132
112
  end
133
113
 
134
- class BogusETag < ETag
114
+ class BogusETag
135
115
  def output(out, opts = {}); out << if_output(opts) { '' }; end
136
116
  end
137
117
 
138
- class Text < BaseEle
139
- def initialize(text)
140
- @content = text
141
- end
142
- alterable :content
118
+ class Text
119
+ def initialize content; self.content = content end
143
120
  def pathname; "text()" end
144
121
  def to_s
145
- Hpricot.uxs(@content)
122
+ Hpricot.uxs(content)
146
123
  end
147
124
  alias_method :inner_text, :to_s
148
125
  alias_method :to_plain_text, :to_s
126
+ def << str; self.content << str end
149
127
  def output(out, opts = {})
150
128
  out <<
151
129
  if_output(opts) do
152
- @content
130
+ content.to_s
153
131
  end
154
132
  end
155
133
  end
156
134
 
157
- class CData < Text
135
+ class CData
136
+ def initialize content; self.content = content end
158
137
  alias_method :to_s, :content
159
138
  alias_method :to_plain_text, :content
160
139
  def output(out, opts = {})
161
140
  out <<
162
141
  if_output(opts) do
163
- "<![CDATA[#@content]]>"
142
+ "<![CDATA[#{content}]]>"
164
143
  end
165
144
  end
166
145
  end
167
146
 
168
- class XMLDecl < BaseEle
169
- def initialize(version, encoding, standalone)
170
- @version, @encoding, @standalone = version, encoding, standalone
171
- end
172
- alterable :version, :encoding, :standalone
147
+ class XMLDecl
173
148
  def pathname; "xmldecl()" end
174
149
  def output(out, opts = {})
175
150
  out <<
176
151
  if_output(opts) do
177
- "<?xml version=\"#{@version}\"" +
178
- (@encoding ? " encoding=\"#{encoding}\"" : "") +
179
- (@standalone != nil ? " standalone=\"#{standalone ? 'yes' : 'no'}\"" : "") +
152
+ "<?xml version=\"#{version}\"" +
153
+ (encoding ? " encoding=\"#{encoding}\"" : "") +
154
+ (standalone != nil ? " standalone=\"#{standalone ? 'yes' : 'no'}\"" : "") +
180
155
  "?>"
181
156
  end
182
157
  end
183
158
  end
184
159
 
185
- class DocType < BaseEle
186
- def initialize(target, pubid, sysid)
187
- @target, @public_id, @system_id = target, pubid, sysid
160
+ class DocType
161
+ def initialize target, pub, sys
162
+ self.target, self.public_id, self.system_id = target, pub, sys
188
163
  end
189
- alterable :target, :public_id, :system_id
190
164
  def pathname; "doctype()" end
191
165
  def output(out, opts = {})
192
166
  out <<
193
167
  if_output(opts) do
194
- "<!DOCTYPE #{@target} " +
195
- (@public_id ? "PUBLIC \"#{@public_id}\"" : "SYSTEM") +
196
- (@system_id ? " #{html_quote(@system_id)}" : "") + ">"
168
+ "<!DOCTYPE #{target} " +
169
+ (public_id ? "PUBLIC \"#{public_id}\"" : "SYSTEM") +
170
+ (system_id ? " #{html_quote(system_id)}" : "") + ">"
197
171
  end
198
172
  end
199
173
  end
200
174
 
201
- class ProcIns < BaseEle
202
- def initialize(target, content)
203
- @target, @content = target, content
204
- end
175
+ class ProcIns
205
176
  def pathname; "procins()" end
206
- alterable :target, :content
207
177
  def output(out, opts = {})
208
178
  out <<
209
179
  if_output(opts) do
210
- "<?#{@target}" +
211
- (@content ? " #{@content}" : "") +
180
+ "<?#{target}" +
181
+ (content ? " #{content}" : "") +
212
182
  "?>"
213
183
  end
214
184
  end
215
185
  end
216
186
 
217
- class Comment < BaseEle
218
- def initialize(content)
219
- @content = content
220
- end
187
+ class Comment
221
188
  def pathname; "comment()" end
222
- alterable :content
223
189
  def output(out, opts = {})
224
190
  out <<
225
191
  if_output(opts) do
226
- "<!--#{@content}-->"
192
+ "<!--#{content}-->"
227
193
  end
228
194
  end
229
195
  end
@@ -26,7 +26,7 @@ module Hpricot
26
26
  if parent and parent.respond_to? :make
27
27
  parent.make(input, &blk)
28
28
  else
29
- Hpricot.make(input, &blk)
29
+ Hpricot.make(input, &blk).children
30
30
  end
31
31
  end
32
32
 
@@ -51,7 +51,7 @@ module Hpricot
51
51
  return i if (x.respond_to?(:name) and name == x.name) or
52
52
  (x.text? and name == "text()")
53
53
  i += 1
54
- end
54
+ end if children
55
55
  -1
56
56
  end
57
57
 
@@ -146,16 +146,20 @@ module Hpricot
146
146
  # Builds a string from the text contained in this node. All
147
147
  # HTML elements are removed.
148
148
  def to_plain_text
149
- if respond_to? :children
149
+ if respond_to?(:children) and children
150
150
  children.map { |x| x.to_plain_text }.join.strip.gsub(/\n{2,}/, "\n\n")
151
+ else
152
+ ""
151
153
  end
152
154
  end
153
155
 
154
156
  # Builds a string from the text contained in this node. All
155
157
  # HTML elements are removed.
156
158
  def inner_text
157
- if respond_to? :children
159
+ if respond_to?(:children) and children
158
160
  children.map { |x| x.inner_text }.join
161
+ else
162
+ ""
159
163
  end
160
164
  end
161
165
  alias_method :innerText, :inner_text
@@ -172,8 +176,10 @@ module Hpricot
172
176
  end
173
177
  reparent self.children
174
178
  else
175
- if respond_to? :children
179
+ if respond_to?(:children) and children
176
180
  children.map { |x| x.output("") }.join
181
+ else
182
+ ""
177
183
  end
178
184
  end
179
185
  end
@@ -207,7 +213,7 @@ module Hpricot
207
213
  parent.children.each do |e|
208
214
  id = sim if e == self
209
215
  sim += 1 if e.pathname == self.pathname
210
- end
216
+ end if parent.children
211
217
  p = File.join(parent.xpath, self.pathname)
212
218
  p += "[#{id+1}]" if sim >= 2
213
219
  p
@@ -224,7 +230,7 @@ module Hpricot
224
230
  parent.children.each do |e|
225
231
  id = sim if e == self
226
232
  sim += 1 if e.pathname == self.pathname
227
- end
233
+ end if parent.children
228
234
  p = parent.css_path
229
235
  p = p ? "#{p} > #{self.pathname}" : self.pathname
230
236
  p += ":nth(#{id})" if sim >= 2
@@ -489,13 +495,13 @@ module Hpricot
489
495
 
490
496
  # +each_child+ iterates over each child.
491
497
  def each_child(&block) # :yields: child_node
492
- children.each(&block)
498
+ children.each(&block) if children
493
499
  nil
494
500
  end
495
501
 
496
502
  # +each_child_with_index+ iterates over each child.
497
503
  def each_child_with_index(&block) # :yields: child_node, index
498
- children.each_with_index(&block)
504
+ children.each_with_index(&block) if children
499
505
  nil
500
506
  end
501
507
 
@@ -626,7 +632,7 @@ module Hpricot
626
632
  # :stopdoc:
627
633
  module Doc::Trav
628
634
  def traverse_all_element(&block)
629
- children.each {|c| c.traverse_all_element(&block) }
635
+ children.each {|c| c.traverse_all_element(&block) } if children
630
636
  end
631
637
  def xpath
632
638
  "/"
@@ -639,7 +645,7 @@ module Hpricot
639
645
  module Elem::Trav
640
646
  def traverse_all_element(&block)
641
647
  yield self
642
- children.each {|c| c.traverse_all_element(&block) }
648
+ children.each {|c| c.traverse_all_element(&block) } if children
643
649
  end
644
650
  end
645
651
 
@@ -651,14 +657,14 @@ module Hpricot
651
657
 
652
658
  module Doc::Trav
653
659
  def traverse_some_element(name_set, &block)
654
- children.each {|c| c.traverse_some_element(name_set, &block) }
660
+ children.each {|c| c.traverse_some_element(name_set, &block) } if children
655
661
  end
656
662
  end
657
663
 
658
664
  module Elem::Trav
659
665
  def traverse_some_element(name_set, &block)
660
666
  yield self if name_set.include? self.name
661
- children.each {|c| c.traverse_some_element(name_set, &block) }
667
+ children.each {|c| c.traverse_some_element(name_set, &block) } if children
662
668
  end
663
669
  end
664
670
 
@@ -797,7 +803,7 @@ module Hpricot
797
803
  module Doc::Trav
798
804
  def root
799
805
  es = []
800
- children.each {|c| es << c if c.elem? }
806
+ children.each {|c| es << c if c.elem? } if children
801
807
  raise Hpricot::Error, "no element" if es.empty?
802
808
  raise Hpricot::Error, "multiple top elements" if 1 < es.length
803
809
  es[0]
@@ -2263,4 +2263,4 @@ Why are so many drawings from earlier centuries so deliciously weird? Here are a
2263
2263
  -->
2264
2264
  </div>
2265
2265
  </body>
2266
- </html>
2266
+ </html>
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'open-uri'
4
+ require 'hpricot'
5
+ require 'nokogiri'
6
+ require 'benchmark'
7
+
8
+ content = File.read("test/files/boingboing.html")
9
+
10
+ N = 100
11
+
12
+ unless Gem.loaded_specs['hpricot'].version > Gem::Version.new('0.6.161')
13
+ abort "** Use higher than Hpricot 0.6.161!"
14
+ end
15
+
16
+ puts "Hpricot #{Gem.loaded_specs['hpricot'].version} vs. Nokogiri #{Gem.loaded_specs['nokogiri'].version}"
17
+ hdoc = Hpricot(content)
18
+ ndoc = Nokogiri.Hpricot(content)
19
+
20
+ Benchmark.bm do |x|
21
+ x.report('hpricot:doc') do
22
+ N.times do
23
+ Hpricot(content)
24
+ end
25
+ end
26
+
27
+ x.report('nokogiri:doc') do
28
+ N.times do
29
+ Nokogiri.Hpricot(content)
30
+ end
31
+ end
32
+ end
33
+
34
+ Benchmark.bm do |x|
35
+ x.report('hpricot:xpath') do
36
+ N.times do
37
+ info = hdoc.search("//a[@name='027906']").first.inner_text
38
+ url = hdoc.search("h3[text()='College kids reportedly taking more smart drugs']").first.inner_text
39
+ end
40
+ end
41
+
42
+ x.report('nokogiri:xpath') do
43
+ N.times do
44
+ info = ndoc.search("//a[@name='027906']").first.inner_text
45
+ url = ndoc.search("h3[text()='College kids reportedly taking more smart drugs']").first.inner_text
46
+ end
47
+ end
48
+ end
49
+
50
+ Benchmark.bm do |x|
51
+ x.report('hpricot:css') do
52
+ N.times do
53
+ info = hdoc.search('form input[@checked]').first
54
+ url = hdoc.search('td spacer').first.inner_text
55
+ end
56
+ end
57
+
58
+ x.report('nokogiri:css') do
59
+ N.times do
60
+ info = ndoc.search('form input[@checked]').first
61
+ url = ndoc.search('td spacer').first.inner_text
62
+ end
63
+ end
64
+ end