thbar-hpricot 0.8.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. data/CHANGELOG +104 -0
  2. data/COPYING +18 -0
  3. data/README.md +276 -0
  4. data/Rakefile +234 -0
  5. data/ext/fast_xs/FastXsService.java +1123 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +210 -0
  8. data/ext/hpricot_scan/HpricotCss.java +850 -0
  9. data/ext/hpricot_scan/HpricotScanService.java +2099 -0
  10. data/ext/hpricot_scan/extconf.rb +9 -0
  11. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  12. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  13. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  14. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  15. data/ext/hpricot_scan/hpricot_scan.c +7045 -0
  16. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  17. data/ext/hpricot_scan/hpricot_scan.java.rl +1161 -0
  18. data/ext/hpricot_scan/hpricot_scan.rl +902 -0
  19. data/extras/hpricot.png +0 -0
  20. data/lib/hpricot.rb +26 -0
  21. data/lib/hpricot/blankslate.rb +63 -0
  22. data/lib/hpricot/builder.rb +216 -0
  23. data/lib/hpricot/elements.rb +514 -0
  24. data/lib/hpricot/htmlinfo.rb +691 -0
  25. data/lib/hpricot/inspect.rb +103 -0
  26. data/lib/hpricot/modules.rb +40 -0
  27. data/lib/hpricot/parse.rb +38 -0
  28. data/lib/hpricot/tag.rb +219 -0
  29. data/lib/hpricot/tags.rb +164 -0
  30. data/lib/hpricot/traverse.rb +839 -0
  31. data/lib/hpricot/xchar.rb +94 -0
  32. data/test/files/basic.xhtml +17 -0
  33. data/test/files/boingboing.html +2266 -0
  34. data/test/files/cy0.html +3653 -0
  35. data/test/files/immob.html +400 -0
  36. data/test/files/pace_application.html +1320 -0
  37. data/test/files/tenderlove.html +16 -0
  38. data/test/files/uswebgen.html +220 -0
  39. data/test/files/utf8.html +1054 -0
  40. data/test/files/week9.html +1723 -0
  41. data/test/files/why.xml +19 -0
  42. data/test/load_files.rb +7 -0
  43. data/test/nokogiri-bench.rb +64 -0
  44. data/test/test_alter.rb +96 -0
  45. data/test/test_builder.rb +37 -0
  46. data/test/test_parser.rb +457 -0
  47. data/test/test_paths.rb +25 -0
  48. data/test/test_preserved.rb +88 -0
  49. data/test/test_xml.rb +28 -0
  50. metadata +124 -0
Binary file
@@ -0,0 +1,26 @@
1
+ # == About hpricot.rb
2
+ #
3
+ # All of Hpricot's various parts are loaded when you use <tt>require 'hpricot'</tt>.
4
+ #
5
+ # * hpricot_scan: the scanner (a C extension for Ruby) which turns an HTML stream into tokens.
6
+ # * hpricot/parse.rb: uses the scanner to sort through tokens and give you back a complete document object.
7
+ # * hpricot/tag.rb: sets up objects for the various types of elements in an HTML document.
8
+ # * hpricot/modules.rb: categorizes the various elements using mixins.
9
+ # * hpricot/traverse.rb: methods for searching documents.
10
+ # * hpricot/elements.rb: methods for dealing with a group of elements as an Hpricot::Elements list.
11
+ # * hpricot/inspect.rb: methods for displaying documents in a readable form.
12
+
13
+ # If available, Nikolai's UTF-8 library will ease use of utf-8 documents.
14
+ # See http://git.bitwi.se/ruby-character-encodings.git/.
15
+ begin
16
+ require 'encoding/character/utf-8'
17
+ rescue LoadError
18
+ end
19
+
20
+ require 'hpricot_scan'
21
+ require 'hpricot/tag'
22
+ require 'hpricot/modules'
23
+ require 'hpricot/traverse'
24
+ require 'hpricot/inspect'
25
+ require 'hpricot/parse'
26
+ require 'hpricot/builder'
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env ruby
2
+ #--
3
+ # Copyright 2004 by Jim Weirich (jim@weirichhouse.org).
4
+ # All rights reserved.
5
+
6
+ # Permission is granted for use, copying, modification, distribution,
7
+ # and distribution of modified versions of this work as long as the
8
+ # above copyright notice is included.
9
+ #++
10
+
11
module Hpricot

  # BlankSlate provides an abstract base class with no predefined
  # methods (except for <tt>\_\_send__</tt> and <tt>\_\_id__</tt>).
  # BlankSlate is useful as a base class when writing classes that
  # depend upon <tt>method_missing</tt> (e.g. dynamic proxies).
  class BlankSlate
    class << self

      # Hide the method named +name+ in the BlankSlate class.  Don't
      # hide +instance_eval+ or any method beginning with "__".
      #
      # +name+ may be a String or a Symbol.  Names are compared as
      # strings because +instance_methods+ returns Strings on Ruby 1.8
      # but Symbols on Ruby 1.9 and later; comparing against a String
      # only would never match on newer Rubies and nothing would be hidden.
      #
      # Names that are not currently instance methods are ignored.
      def hide(name)
        name = name.to_s
        return if name =~ /^(__|instance_eval)/
        undef_method name if instance_methods.any? { |m| m.to_s == name }
      end
    end

    # Strip every method inherited at definition time.
    instance_methods.each { |m| hide(m) }
  end
end
32
+
33
+ # Since Ruby is very dynamic, methods added to the ancestors of
34
+ # BlankSlate <em>after BlankSlate is defined</em> will show up in the
35
+ # list of available BlankSlate methods. We handle this by defining a
36
+ # hook in the Object and Kernel classes that will hide any newly defined methods.
37
+ module Kernel
38
+ class << self
39
+ alias_method :hpricot_slate_method_added, :method_added
40
+
41
+ # Detect method additions to Kernel and remove them in the
42
+ # BlankSlate class.
43
+ def method_added(name)
44
+ hpricot_slate_method_added(name)
45
+ return if self != Kernel
46
+ Hpricot::BlankSlate.hide(name)
47
+ end
48
+ end
49
+ end
50
+
51
+ class Object
52
+ class << self
53
+ alias_method :hpricot_slate_method_added, :method_added
54
+
55
+ # Detect method additions to Object and remove them in the
56
+ # BlankSlate class.
57
+ def method_added(name)
58
+ hpricot_slate_method_added(name)
59
+ return if self != Object
60
+ Hpricot::BlankSlate.hide(name)
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,216 @@
1
+ require 'hpricot/tags'
2
+ require 'fast_xs'
3
+ require 'hpricot/blankslate'
4
+ require 'hpricot/htmlinfo'
5
+
6
+ module Hpricot
7
# XML unescape.
#
# Converts +str+ (via +to_s+) by replacing character references with the
# characters they denote:
#
# * named references (<tt>&amp;name;</tt>) are looked up in NamedCharacters;
#   unknown names become "?" (code point 63),
# * decimal references (<tt>&amp;#65;</tt>),
# * hexadecimal references (<tt>&amp;#x41;</tt>).
#
# All references are resolved in a single pass so that the output of one
# substitution is never unescaped a second time (e.g. <tt>&amp;amp;#65;</tt>
# yields the text <tt>&amp;#65;</tt>, not "A").
def self.uxs(str)
  str.to_s.gsub(/&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(\w+));/) do
    hex, dec, name = $1, $2, $3
    code =
      if hex
        hex.to_i(16)
      elsif dec
        dec.to_i
      else
        NamedCharacters[name] || 63 # 63 is "?"
      end
    [code].pack("U*")
  end
end
13
+
14
+ def self.build(ele = Doc.new, assigns = {}, &blk)
15
+ ele.extend Builder
16
+ assigns.each do |k, v|
17
+ ele.instance_variable_set("@#{k}", v)
18
+ end
19
+ ele.instance_eval(&blk)
20
+ ele
21
+ end
22
+
23
+ module Builder
24
+
25
+ @@default = {
26
+ :indent => 0,
27
+ :output_helpers => true,
28
+ :output_xml_instruction => true,
29
+ :output_meta_tag => true,
30
+ :auto_validation => true,
31
+ :tagset => Hpricot::XHTMLTransitional,
32
+ :root_attributes => {
33
+ :xmlns => 'http://www.w3.org/1999/xhtml', :'xml:lang' => 'en', :lang => 'en'
34
+ }
35
+ }
36
+
37
+ def self.set(option, value)
38
+ @@default[option] = value
39
+ end
40
+
41
+ def add_child ele
42
+ ele.parent = self
43
+ self.children ||= []
44
+ self.children << ele
45
+ ele
46
+ end
47
+
48
+ # Write a +string+ to the HTML stream, making sure to escape it.
49
+ def text!(string)
50
+ add_child Text.new(string.fast_xs)
51
+ end
52
+
53
+ # Write a +string+ to the HTML stream without escaping it.
54
+ def text(string)
55
+ add_child Text.new(string)
56
+ nil
57
+ end
58
+ alias_method :<<, :text
59
+ alias_method :concat, :text
60
+
61
+ # Create a tag named +tag+. Other than the first argument which is the tag name,
62
+ # the arguments are the same as the tags implemented via method_missing.
63
+ def tag!(tag, *args, &block)
64
+ ele_id = nil
65
+ if @auto_validation and @tagset
66
+ if !@tagset.tagset.has_key?(tag)
67
+ raise InvalidXhtmlError, "no element `#{tag}' for #{tagset.doctype}"
68
+ elsif args.last.respond_to?(:to_hash)
69
+ attrs = args.last.to_hash
70
+
71
+ if @tagset.forms.include?(tag) and attrs[:id]
72
+ attrs[:name] ||= attrs[:id]
73
+ end
74
+
75
+ attrs.each do |k, v|
76
+ atname = k.to_s.downcase.intern
77
+ unless k =~ /:/ or @tagset.tagset[tag].include? atname
78
+ raise InvalidXhtmlError, "no attribute `#{k}' on #{tag} elements"
79
+ end
80
+ if atname == :id
81
+ ele_id = v.to_s
82
+ if @elements.has_key? ele_id
83
+ raise InvalidXhtmlError, "id `#{ele_id}' already used (id's must be unique)."
84
+ end
85
+ end
86
+ end
87
+ end
88
+ end
89
+
90
+ # turn arguments into children or attributes
91
+ childs = []
92
+ attrs = args.grep(Hash)
93
+ childs.concat((args - attrs).flatten.map do |x|
94
+ if x.respond_to? :to_html
95
+ Hpricot.make(x.to_html)
96
+ elsif x
97
+ Text.new(x.fast_xs)
98
+ end
99
+ end.flatten)
100
+ attrs = attrs.inject({}) do |hsh, ath|
101
+ ath.each do |k, v|
102
+ hsh[k] = v.to_s.fast_xs if v
103
+ end
104
+ hsh
105
+ end
106
+
107
+ # create the element itself
108
+ tag = tag.to_s
109
+ f = Elem.new(tag, attrs, childs, ETag.new(tag))
110
+
111
+ # build children from the block
112
+ if block
113
+ build(f, &block)
114
+ end
115
+
116
+ add_child f
117
+ f
118
+ end
119
+
120
+ def build(*a, &b)
121
+ Hpricot.build(*a, &b)
122
+ end
123
+
124
+ # Every HTML tag method goes through an html_tag call. So, calling <tt>div</tt> is equivalent
125
+ # to calling <tt>html_tag(:div)</tt>. All HTML tags in Hpricot's list are given generated wrappers
126
+ # for this method.
127
+ #
128
+ # If the @auto_validation setting is on, this method will check for many common mistakes which
129
+ # could lead to invalid XHTML.
130
+ def html_tag(sym, *args, &block)
131
+ if @auto_validation and @tagset.self_closing.include?(sym) and block
132
+ raise InvalidXhtmlError, "the `#{sym}' element is self-closing, please remove the block"
133
+ elsif args.empty? and block.nil?
134
+ CssProxy.new(self, sym)
135
+ else
136
+ tag!(sym, *args, &block)
137
+ end
138
+ end
139
+
140
+ XHTMLTransitional.tags.each do |k|
141
+ class_eval %{
142
+ def #{k}(*args, &block)
143
+ html_tag(#{k.inspect}, *args, &block)
144
+ end
145
+ }
146
+ end
147
+
148
+ def doctype(target, pub, sys)
149
+ add_child DocType.new(target, pub, sys)
150
+ end
151
+
152
+ remove_method :head
153
+
154
+ # Builds a head tag. Adds a <tt>meta</tt> tag inside with Content-Type
155
+ # set to <tt>text/html; charset=utf-8</tt>.
156
+ def head(*args, &block)
157
+ tag!(:head, *args) do
158
+ tag!(:meta, "http-equiv" => "Content-Type", "content" => "text/html; charset=utf-8") if @output_meta_tag
159
+ instance_eval(&block)
160
+ end
161
+ end
162
+
163
+ # Builds an html tag. An XML 1.0 instruction and an XHTML 1.0 Transitional doctype
164
+ # are prepended. Also assumes <tt>:xmlns => "http://www.w3.org/1999/xhtml",
165
+ # :lang => "en"</tt>.
166
+ def xhtml_transitional(attrs = {}, &block)
167
+ # self.tagset = Hpricot::XHTMLTransitional
168
+ xhtml_html(attrs, &block)
169
+ end
170
+
171
+ # Builds an html tag with XHTML 1.0 Strict doctype instead.
172
+ def xhtml_strict(attrs = {}, &block)
173
+ # self.tagset = Hpricot::XHTMLStrict
174
+ xhtml_html(attrs, &block)
175
+ end
176
+
177
+ private
178
+
179
+ def xhtml_html(attrs = {}, &block)
180
+ instruct! if @output_xml_instruction
181
+ doctype(:html, *@@default[:tagset].doctype)
182
+ tag!(:html, @@default[:root_attributes].merge(attrs), &block)
183
+ end
184
+
185
+ end
186
+
187
+ # Class used by Markaby::Builder to store element options. Methods called
188
+ # against the CssProxy object are added as element classes or IDs.
189
+ #
190
+ # See the README for examples.
191
+ class CssProxy < BlankSlate
192
+
193
+ # Creates a CssProxy object.
194
+ def initialize(builder, sym)
195
+ @builder, @sym, @attrs = builder, sym, {}
196
+ end
197
+
198
+ # Adds attributes to an element. Bang methods set the :id attribute.
199
+ # Other methods add to the :class attribute.
200
+ def method_missing(id_or_class, *args, &block)
201
+ if (idc = id_or_class.to_s) =~ /!$/
202
+ @attrs[:id] = $`
203
+ else
204
+ @attrs[:class] = @attrs[:class].nil? ? idc : "#{@attrs[:class]} #{idc}".strip
205
+ end
206
+
207
+ if block or args.any?
208
+ args.push(@attrs)
209
+ return @builder.tag!(@sym, *args, &block)
210
+ end
211
+
212
+ return self
213
+ end
214
+
215
+ end
216
+ end
@@ -0,0 +1,514 @@
1
+ module Hpricot
2
+ # Once you've matched a list of elements, you will often need to handle them as
3
+ # a group. Or you may want to perform the same action on each of them.
4
+ # Hpricot::Elements is an extension of Ruby's array class, with some methods
5
+ # added for altering elements contained in the array.
6
+ #
7
+ # If you need to create an element array from regular elements:
8
+ #
9
+ # Hpricot::Elements[ele1, ele2, ele3]
10
+ #
11
+ # Assuming that ele1, ele2 and ele3 contain element objects (Hpricot::Elem,
12
+ # Hpricot::Doc, etc.)
13
+ #
14
+ # == Continuing Searches
15
+ #
16
+ # Usually the Hpricot::Elements you're working on comes from a search you've
17
+ # done. Well, you can continue searching the list by using the same <tt>at</tt>
18
+ # and <tt>search</tt> methods you can use on plain elements.
19
+ #
20
+ # elements = doc.search("/div/p")
21
+ # elements = elements.search("/a[@href='http://hoodwink.d/']")
22
+ # elements = elements.at("img")
23
+ #
24
+ # == Altering Elements
25
+ #
26
+ # When you're altering elements in the list, your changes will be reflected in
27
+ # the document you started searching from.
28
+ #
29
+ # doc = Hpricot("That's my <b>spoon</b>, Tyler.")
30
+ # doc.at("b").swap("<i>fork</i>")
31
+ # doc.to_html
32
+ # #=> "That's my <i>fork</i>, Tyler."
33
+ #
34
+ # == Getting More Detailed
35
+ #
36
+ # If you can't find a method here that does what you need, you may need to
37
+ # loop through the elements and find a method in Hpricot::Container::Trav
38
+ # which can do what you need.
39
+ #
40
+ # For example, you may want to search for all the H3 header tags in a document
41
+ # and grab all the tags underneath the header, but not inside the header.
42
+ # A good method for this is <tt>next_sibling</tt>:
43
+ #
44
+ # doc.search("h3").each do |h3|
45
+ # while ele = h3.next_sibling
46
+ # ary << ele # stuff away all the elements under the h3
47
+ # end
48
+ # end
49
+ #
50
+ # Most of the useful element methods are in the mixins Hpricot::Traverse
51
+ # and Hpricot::Container::Trav.
52
+ class Elements < Array
53
+
54
+ # Searches this list for any elements (or children of these elements) matching
55
+ # the CSS or XPath expression +expr+. Root is assumed to be the element scanned.
56
+ #
57
+ # See Hpricot::Container::Trav.search for more.
58
+ def search(*expr,&blk)
59
+ Elements[*map { |x| x.search(*expr,&blk) }.flatten.uniq]
60
+ end
61
+ alias_method :/, :search
62
+
63
+ # Searches this list for the first element (or child of these elements) matching
64
+ # the CSS or XPath expression +expr+. Root is assumed to be the element scanned.
65
+ #
66
+ # See Hpricot::Container::Trav.at for more.
67
+ def at(expr, &blk)
68
+ if expr.kind_of? Fixnum
69
+ super
70
+ else
71
+ search(expr, &blk)[0]
72
+ end
73
+ end
74
+ alias_method :%, :at
75
+
76
+ # Convert this group of elements into a complete HTML fragment, returned as a
77
+ # string.
78
+ def to_html
79
+ map { |x| x.output("") }.join
80
+ end
81
+ alias_method :to_s, :to_html
82
+
83
# Returns an HTML fragment built of the contents of each element in this list.
#
# If an HTML +string+ is supplied, this method acts like inner_html=: the last
# argument given is assigned to every element through +inner_html=+ and is
# also the return value.
def inner_html(*string)
  if string.empty?
    map { |x| x.inner_html }.join
  else
    self.inner_html = string.pop
  end
end
93
+ alias_method :html, :inner_html
94
+ alias_method :innerHTML, :inner_html
95
+
96
+ # Replaces the contents of each element in this list. Supply an HTML +string+,
97
+ # which is loaded into Hpricot objects and inserted into every element in this
98
+ # list.
99
+ def inner_html=(string)
100
+ each { |x| x.inner_html = string }
101
+ end
102
+ alias_method :html=, :inner_html=
103
+ alias_method :innerHTML=, :inner_html=
104
+
105
+ # Returns a string containing the text contents of each element in this list.
106
+ # All HTML tags are removed.
107
+ def inner_text
108
+ map { |x| x.inner_text }.join
109
+ end
110
+ alias_method :text, :inner_text
111
+
112
+ # Remove all elements in this list from the document which contains them.
113
+ #
114
+ # doc = Hpricot("<html>Remove this: <b>here</b></html>")
115
+ # doc.search("b").remove
116
+ # doc.to_html
117
+ # => "<html>Remove this: </html>"
118
+ #
119
+ def remove
120
+ each { |x| x.parent.children.delete(x) }
121
+ end
122
+
123
+ # Empty the elements in this list, by removing their insides.
124
+ #
125
+ # doc = Hpricot("<p> We have <i>so much</i> to say.</p>")
126
+ # doc.search("i").empty
127
+ # doc.to_html
128
+ # => "<p> We have <i></i> to say.</p>"
129
+ #
130
+ def empty
131
+ each { |x| x.inner_html = nil }
132
+ end
133
+
134
+ # Add to the end of the contents inside each element in this list.
135
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
136
+ def append(str = nil, &blk)
137
+ each { |x| x.html(x.children + x.make(str, &blk)) }
138
+ end
139
+
140
+ # Add to the start of the contents inside each element in this list.
141
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
142
+ def prepend(str = nil, &blk)
143
+ each { |x| x.html(x.make(str, &blk) + x.children) }
144
+ end
145
+
146
+ # Add some HTML just previous to each element in this list.
147
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
148
+ def before(str = nil, &blk)
149
+ each { |x| x.parent.insert_before x.make(str, &blk), x }
150
+ end
151
+
152
+ # Just after each element in this list, add some HTML.
153
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
154
+ def after(str = nil, &blk)
155
+ each { |x| x.parent.insert_after x.make(str, &blk), x }
156
+ end
157
+
158
+ # Wraps each element in the list inside the element created by HTML +str+.
159
+ # If more than one element is found in the string, Hpricot locates the
160
+ # deepest spot inside the first element.
161
+ #
162
+ # doc.search("a[@href]").
163
+ # wrap(%{<div class="link"><div class="link_inner"></div></div>})
164
+ #
165
+ # This code wraps every link on the page inside a +div.link+ and a +div.link_inner+ nest.
166
+ def wrap(str = nil, &blk)
167
+ each do |x|
168
+ wrap = x.make(str, &blk)
169
+ nest = wrap.detect { |w| w.respond_to? :children }
170
+ unless nest
171
+ raise "No wrapping element found."
172
+ end
173
+ x.parent.replace_child(x, wrap)
174
+ nest = nest.children.first until nest.empty?
175
+ nest.html([x])
176
+ end
177
+ end
178
+
179
+ # Gets and sets attributes on all matched elements.
180
+ #
181
+ # Pass in a +key+ on its own and this method will return the string value
182
+ # assigned to that attribute for the first elements. Or +nil+ if the
183
+ # attribute isn't found.
184
+ #
185
+ # doc.search("a").attr("href")
186
+ # #=> "http://hacketyhack.net/"
187
+ #
188
+ # Or, pass in a +key+ and +value+. This will set an attribute for all
189
+ # matched elements.
190
+ #
191
+ # doc.search("p").attr("class", "basic")
192
+ #
193
+ # You may also use a Hash to set a series of attributes:
194
+ #
195
+ # (doc/"a").attr(:class => "basic", :href => "http://hackety.org/")
196
+ #
197
+ # Lastly, a block can be used to rewrite an attribute based on the element
198
+ # it belongs to. The block will pass in an element. Return from the block
199
+ # the new value of the attribute.
200
+ #
201
+ # records.attr("href") { |e| e['href'] + "#top" }
202
+ #
203
+ # This example adds a <tt>#top</tt> anchor to each link.
204
+ #
205
+ def attr key, value = nil, &blk
206
+ if value or blk
207
+ each do |el|
208
+ el.set_attribute(key, value || blk[el])
209
+ end
210
+ return self
211
+ end
212
+ if key.is_a? Hash
213
+ key.each { |k,v| self.attr(k,v) }
214
+ return self
215
+ else
216
+ return self[0].get_attribute(key)
217
+ end
218
+ end
219
+ alias_method :set, :attr
220
+
221
+ # Adds the class to all matched elements.
222
+ #
223
+ # (doc/"p").add_class("bacon")
224
+ #
225
+ # Now all paragraphs will have class="bacon".
226
+ def add_class class_name
227
+ each do |el|
228
+ next unless el.respond_to? :get_attribute
229
+ classes = el.get_attribute('class').to_s.split(" ")
230
+ el.set_attribute('class', classes.push(class_name).uniq.join(" "))
231
+ end
232
+ self
233
+ end
234
+
235
+ # Remove an attribute from each of the matched elements.
236
+ #
237
+ # (doc/"input").remove_attr("disabled")
238
+ #
239
+ def remove_attr name
240
+ each do |el|
241
+ next unless el.respond_to? :remove_attribute
242
+ el.remove_attribute(name)
243
+ end
244
+ self
245
+ end
246
+
247
+ # Removes a class from all matched elements.
248
+ #
249
+ # (doc/"span").remove_class("lightgrey")
250
+ #
251
+ # Or, to remove all classes:
252
+ #
253
+ # (doc/"span").remove_class
254
+ #
255
+ def remove_class name = nil
256
+ each do |el|
257
+ next unless el.respond_to? :get_attribute
258
+ if name
259
+ classes = el.get_attribute('class').to_s.split(" ")
260
+ el.set_attribute('class', (classes - [name]).uniq.join(" "))
261
+ else
262
+ el.remove_attribute("class")
263
+ end
264
+ end
265
+ self
266
+ end
267
+
268
+ ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i # " (for emacs)
269
+ BRACK_RE = %r!(\[) *([^\]]*) *\]+!i
270
+ FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)!
271
+ CUST_RE = %r!(:)([a-zA-Z0-9\*_-]*)()!
272
+ CATCH_RE = %r!([:\.#]*)([a-zA-Z0-9\*_-]+)!
273
+
274
+ def self.filter(nodes, expr, truth = true)
275
+ until expr.empty?
276
+ _, *m = *expr.match(/^(?:#{ATTR_RE}|#{BRACK_RE}|#{FUNC_RE}|#{CUST_RE}|#{CATCH_RE})/)
277
+ break unless _
278
+
279
+ expr = $'
280
+ m.compact!
281
+ if m[0] == '@'
282
+ m[0] = "@#{m.slice!(2,1).join}"
283
+ end
284
+
285
+ if m[0] == '[' && m[1] =~ /^\d+$/
286
+ m = [":", "nth", m[1].to_i-1]
287
+ end
288
+
289
+ if m[0] == ":" && m[1] == "not"
290
+ nodes, = Elements.filter(nodes, m[2], false)
291
+ elsif "#{m[0]}#{m[1]}" =~ /^(:even|:odd)$/
292
+ new_nodes = []
293
+ nodes.each_with_index {|n,i| new_nodes.push(n) if (i % 2 == (m[1] == "even" ? 0 : 1)) }
294
+ nodes = new_nodes
295
+ elsif "#{m[0]}#{m[1]}" =~ /^(:first|:last)$/
296
+ nodes = [nodes.send(m[1])]
297
+ else
298
+ meth = "filter[#{m[0]}#{m[1]}]" unless m[0].empty?
299
+ if meth and Traverse.method_defined? meth
300
+ args = m[2..-1]
301
+ else
302
+ meth = "filter[#{m[0]}]"
303
+ if Traverse.method_defined? meth
304
+ args = m[1..-1]
305
+ end
306
+ end
307
+ args << -1
308
+ nodes = Elements[*nodes.find_all do |x|
309
+ args[-1] += 1
310
+ x.send(meth, *args) ? truth : !truth
311
+ end]
312
+ end
313
+ end
314
+ [nodes, expr]
315
+ end
316
+
317
+ # Given two elements, attempt to gather an Elements array of everything between
318
+ # (and including) those two elements.
319
+ def self.expand(ele1, ele2, excl=false)
320
+ ary = []
321
+ offset = excl ? -1 : 0
322
+
323
+ if ele1 and ele2
324
+ # let's quickly take care of siblings
325
+ if ele1.parent == ele2.parent
326
+ ary = ele1.parent.children[ele1.node_position..(ele2.node_position+offset)]
327
+ else
328
+ # find common parent
329
+ p, ele1_p = ele1, [ele1]
330
+ ele1_p.unshift p while p.respond_to?(:parent) and p = p.parent
331
+ p, ele2_p = ele2, [ele2]
332
+ ele2_p.unshift p while p.respond_to?(:parent) and p = p.parent
333
+ common_parent = ele1_p.zip(ele2_p).select { |p1, p2| p1 == p2 }.flatten.last
334
+
335
+ child = nil
336
+ if ele1 == common_parent
337
+ child = ele2
338
+ elsif ele2 == common_parent
339
+ child = ele1
340
+ end
341
+
342
+ if child
343
+ ary = common_parent.children[0..(child.node_position+offset)]
344
+ end
345
+ end
346
+ end
347
+
348
+ return Elements[*ary]
349
+ end
350
+
351
+ def filter(expr)
352
+ nodes, = Elements.filter(self, expr)
353
+ nodes
354
+ end
355
+
356
+ def not(expr)
357
+ if expr.is_a? Traverse
358
+ nodes = self - [expr]
359
+ else
360
+ nodes, = Elements.filter(self, expr, false)
361
+ end
362
+ nodes
363
+ end
364
+
365
+ private
366
+ def copy_node(node, l)
367
+ l.instance_variables.each do |iv|
368
+ node.instance_variable_set(iv, l.instance_variable_get(iv))
369
+ end
370
+ end
371
+
372
+ end
373
+
374
+ module Traverse
375
+ def self.filter(tok, &blk)
376
+ define_method("filter[#{tok.is_a?(String) ? tok : tok.inspect}]", &blk)
377
+ end
378
+
379
+ filter '' do |name,i|
380
+ name == '*' || (self.respond_to?(:name) && self.name.downcase == name.downcase)
381
+ end
382
+
383
+ filter '#' do |id,i|
384
+ self.elem? and get_attribute('id').to_s == id
385
+ end
386
+
387
+ filter '.' do |name,i|
388
+ self.elem? and classes.include? name
389
+ end
390
+
391
+ filter :lt do |num,i|
392
+ self.position < num.to_i
393
+ end
394
+
395
+ filter :gt do |num,i|
396
+ self.position > num.to_i
397
+ end
398
+
399
+ nth = proc { |num,i| self.position == num.to_i }
400
+ nth_first = proc { |*a| self.position == 0 }
401
+ nth_last = proc { |*a| self == parent.children_of_type(self.name).last }
402
+
403
+ filter :nth, &nth
404
+ filter :eq, &nth
405
+ filter ":nth-of-type", &nth
406
+
407
+ filter :first, &nth_first
408
+ filter ":first-of-type", &nth_first
409
+
410
+ filter :last, &nth_last
411
+ filter ":last-of-type", &nth_last
412
+
413
+ filter :even do |num,i|
414
+ self.position % 2 == 0
415
+ end
416
+
417
+ filter :odd do |num,i|
418
+ self.position % 2 == 1
419
+ end
420
+
421
+ filter ':first-child' do |i|
422
+ self == parent.containers.first
423
+ end
424
+
425
+ filter ':nth-child' do |arg,i|
426
+ case arg
427
+ when 'even'; (parent.containers.index(self) + 1) % 2 == 0
428
+ when 'odd'; (parent.containers.index(self) + 1) % 2 == 1
429
+ else self == (parent.containers[arg.to_i - 1])
430
+ end
431
+ end
432
+
433
+ filter ":last-child" do |i|
434
+ self == parent.containers.last
435
+ end
436
+
437
+ filter ":nth-last-child" do |arg,i|
438
+ self == parent.containers[-1-arg.to_i]
439
+ end
440
+
441
+ filter ":nth-last-of-type" do |arg,i|
442
+ self == parent.children_of_type(self.name)[-1-arg.to_i]
443
+ end
444
+
445
+ filter ":only-of-type" do |arg,i|
446
+ parent.children_of_type(self.name).length == 1
447
+ end
448
+
449
+ filter ":only-child" do |arg,i|
450
+ parent.containers.length == 1
451
+ end
452
+
453
+ filter :parent do |*a|
454
+ containers.length > 0
455
+ end
456
+
457
+ filter :empty do |*a|
458
+ elem? && inner_html.length == 0
459
+ end
460
+
461
+ filter :root do |*a|
462
+ self.is_a? Hpricot::Doc
463
+ end
464
+
465
+ filter 'text' do |*a|
466
+ self.text?
467
+ end
468
+
469
+ filter 'comment' do |*a|
470
+ self.comment?
471
+ end
472
+
473
+ filter :contains do |arg, ignore|
474
+ html.include? arg
475
+ end
476
+
477
+
478
+
479
+ pred_procs =
480
+ {'text()' => proc { |ele, *_| ele.inner_text.strip },
481
+ '@' => proc { |ele, attr, *_| ele.get_attribute(attr).to_s if ele.elem? }}
482
+
483
+ oper_procs =
484
+ {'=' => proc { |a,b| a == b },
485
+ '!=' => proc { |a,b| a != b },
486
+ '~=' => proc { |a,b| a.split(/\s+/).include?(b) },
487
+ '|=' => proc { |a,b| a =~ /^#{Regexp::quote b}(-|$)/ },
488
+ '^=' => proc { |a,b| a.index(b) == 0 },
489
+ '$=' => proc { |a,b| a =~ /#{Regexp::quote b}$/ },
490
+ '*=' => proc { |a,b| idx = a.index(b) }}
491
+
492
+ pred_procs.each do |pred_n, pred_f|
493
+ oper_procs.each do |oper_n, oper_f|
494
+ filter "#{pred_n}#{oper_n}" do |*a|
495
+ qual = pred_f[self, *a]
496
+ oper_f[qual, a[-2]] if qual
497
+ end
498
+ end
499
+ end
500
+
501
+ filter 'text()' do |val,i|
502
+ self.children.grep(Hpricot::Text).detect { |x| x.content =~ /\S/ } if self.children
503
+ end
504
+
505
+ filter '@' do |attr,val,i|
506
+ self.elem? and has_attribute? attr
507
+ end
508
+
509
+ filter '[' do |val,i|
510
+ self.elem? and search(val).length > 0
511
+ end
512
+
513
+ end
514
+ end