hpricot 0.8.3-i386-mswin32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. data/CHANGELOG +104 -0
  2. data/COPYING +18 -0
  3. data/README.md +276 -0
  4. data/Rakefile +234 -0
  5. data/ext/fast_xs/FastXsService.java +1123 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +210 -0
  8. data/ext/hpricot_scan/HpricotCss.java +850 -0
  9. data/ext/hpricot_scan/HpricotScanService.java +2099 -0
  10. data/ext/hpricot_scan/extconf.rb +9 -0
  11. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  12. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  13. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  14. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  15. data/ext/hpricot_scan/hpricot_scan.c +7039 -0
  16. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  17. data/ext/hpricot_scan/hpricot_scan.java.rl +1161 -0
  18. data/ext/hpricot_scan/hpricot_scan.rl +896 -0
  19. data/extras/hpricot.png +0 -0
  20. data/lib/fast_xs.rb +1 -0
  21. data/lib/fast_xs/1.8/fast_xs.so +0 -0
  22. data/lib/fast_xs/1.9/fast_xs.so +0 -0
  23. data/lib/hpricot.rb +26 -0
  24. data/lib/hpricot/blankslate.rb +63 -0
  25. data/lib/hpricot/builder.rb +216 -0
  26. data/lib/hpricot/elements.rb +514 -0
  27. data/lib/hpricot/htmlinfo.rb +691 -0
  28. data/lib/hpricot/inspect.rb +103 -0
  29. data/lib/hpricot/modules.rb +40 -0
  30. data/lib/hpricot/parse.rb +38 -0
  31. data/lib/hpricot/tag.rb +219 -0
  32. data/lib/hpricot/tags.rb +164 -0
  33. data/lib/hpricot/traverse.rb +839 -0
  34. data/lib/hpricot/xchar.rb +94 -0
  35. data/lib/hpricot_scan.rb +1 -0
  36. data/lib/hpricot_scan/1.8/hpricot_scan.so +0 -0
  37. data/lib/hpricot_scan/1.9/hpricot_scan.so +0 -0
  38. data/test/files/basic.xhtml +17 -0
  39. data/test/files/boingboing.html +2266 -0
  40. data/test/files/cy0.html +3653 -0
  41. data/test/files/immob.html +400 -0
  42. data/test/files/pace_application.html +1320 -0
  43. data/test/files/tenderlove.html +16 -0
  44. data/test/files/uswebgen.html +220 -0
  45. data/test/files/utf8.html +1054 -0
  46. data/test/files/week9.html +1723 -0
  47. data/test/files/why.xml +19 -0
  48. data/test/load_files.rb +7 -0
  49. data/test/nokogiri-bench.rb +64 -0
  50. data/test/test_alter.rb +96 -0
  51. data/test/test_builder.rb +37 -0
  52. data/test/test_parser.rb +457 -0
  53. data/test/test_paths.rb +25 -0
  54. data/test/test_preserved.rb +88 -0
  55. data/test/test_xml.rb +28 -0
  56. metadata +128 -0
Binary file
@@ -0,0 +1 @@
1
+ require "fast_xs/#{RUBY_VERSION.sub(/\.\d+$/, '')}/fast_xs"
@@ -0,0 +1,26 @@
1
+ # == About hpricot.rb
2
+ #
3
+ # All of Hpricot's various parts are loaded when you use <tt>require 'hpricot'</tt>.
4
+ #
5
+ # * hpricot_scan: the scanner (a C extension for Ruby) which turns an HTML stream into tokens.
6
+ # * hpricot/parse.rb: uses the scanner to sort through tokens and give you back a complete document object.
7
+ # * hpricot/tag.rb: sets up objects for the various types of elements in an HTML document.
8
+ # * hpricot/modules.rb: categorizes the various elements using mixins.
9
+ # * hpricot/traverse.rb: methods for searching documents.
10
+ # * hpricot/elements.rb: methods for dealing with a group of elements as an Hpricot::Elements list.
11
+ # * hpricot/inspect.rb: methods for displaying documents in a readable form.
12
+
13
+ # If available, Nikolai's UTF-8 library will ease use of utf-8 documents.
14
+ # See http://git.bitwi.se/ruby-character-encodings.git/.
15
+ begin
16
+ require 'encoding/character/utf-8'
17
+ rescue LoadError
18
+ end
19
+
20
+ require 'hpricot_scan'
21
+ require 'hpricot/tag'
22
+ require 'hpricot/modules'
23
+ require 'hpricot/traverse'
24
+ require 'hpricot/inspect'
25
+ require 'hpricot/parse'
26
+ require 'hpricot/builder'
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env ruby
2
+ #--
3
+ # Copyright 2004 by Jim Weirich (jim@weirichhouse.org).
4
+ # All rights reserved.
5
+
6
+ # Permission is granted for use, copying, modification, distribution,
7
+ # and distribution of modified versions of this work as long as the
8
+ # above copyright notice is included.
9
+ #++
10
+
11
+ module Hpricot
12
+
13
+ # BlankSlate provides an abstract base class with no predefined
14
+ # methods (except for <tt>\_\_send__</tt> and <tt>\_\_id__</tt>).
15
+ # BlankSlate is useful as a base class when writing classes that
16
+ # depend upon <tt>method_missing</tt> (e.g. dynamic proxies).
17
+ class BlankSlate
18
+ class << self
19
+
20
+ # Hide the method named +name+ in the BlankSlate class. Don't
21
+ # hide +instance_eval+ or any method beginning with "__".
22
+ def hide(name)
23
+ undef_method name if
24
+ instance_methods.include?(name.to_s) and
25
+ name !~ /^(__|instance_eval)/
26
+ end
27
+ end
28
+
29
+ instance_methods.each { |m| hide(m) }
30
+ end
31
+ end
32
+
33
+ # Since Ruby is very dynamic, methods added to the ancestors of
34
+ # BlankSlate <em>after BlankSlate is defined</em> will show up in the
35
+ # list of available BlankSlate methods. We handle this by defining a
36
+ # hook in the Object and Kernel classes that will hide any methods defined later.
37
+ module Kernel
38
+ class << self
39
+ alias_method :hpricot_slate_method_added, :method_added
40
+
41
+ # Detect method additions to Kernel and remove them in the
42
+ # BlankSlate class.
43
+ def method_added(name)
44
+ hpricot_slate_method_added(name)
45
+ return if self != Kernel
46
+ Hpricot::BlankSlate.hide(name)
47
+ end
48
+ end
49
+ end
50
+
51
+ class Object
52
+ class << self
53
+ alias_method :hpricot_slate_method_added, :method_added
54
+
55
+ # Detect method additions to Object and remove them in the
56
+ # BlankSlate class.
57
+ def method_added(name)
58
+ hpricot_slate_method_added(name)
59
+ return if self != Object
60
+ Hpricot::BlankSlate.hide(name)
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,216 @@
1
+ require 'hpricot/tags'
2
+ require 'fast_xs'
3
+ require 'hpricot/blankslate'
4
+ require 'hpricot/htmlinfo'
5
+
6
+ module Hpricot
7
+ # XML unescape
8
+ def self.uxs(str)
9
+ str.to_s.
10
+ gsub(/\&(\w+);/) { [NamedCharacters[$1] || 63].pack("U*") }. # 63 = ?? (query char)
11
+ gsub(/\&\#(\d+);/) { [$1.to_i].pack("U*") }
12
+ end
13
+
14
+ def self.build(ele = Doc.new, assigns = {}, &blk)
15
+ ele.extend Builder
16
+ assigns.each do |k, v|
17
+ ele.instance_variable_set("@#{k}", v)
18
+ end
19
+ ele.instance_eval(&blk)
20
+ ele
21
+ end
22
+
23
+ module Builder
24
+
25
+ @@default = {
26
+ :indent => 0,
27
+ :output_helpers => true,
28
+ :output_xml_instruction => true,
29
+ :output_meta_tag => true,
30
+ :auto_validation => true,
31
+ :tagset => Hpricot::XHTMLTransitional,
32
+ :root_attributes => {
33
+ :xmlns => 'http://www.w3.org/1999/xhtml', :'xml:lang' => 'en', :lang => 'en'
34
+ }
35
+ }
36
+
37
+ def self.set(option, value)
38
+ @@default[option] = value
39
+ end
40
+
41
+ def add_child ele
42
+ ele.parent = self
43
+ self.children ||= []
44
+ self.children << ele
45
+ ele
46
+ end
47
+
48
+ # Write a +string+ to the HTML stream, making sure to escape it.
49
+ def text!(string)
50
+ add_child Text.new(string.fast_xs)
51
+ end
52
+
53
+ # Write a +string+ to the HTML stream without escaping it.
54
+ def text(string)
55
+ add_child Text.new(string)
56
+ nil
57
+ end
58
+ alias_method :<<, :text
59
+ alias_method :concat, :text
60
+
61
+ # Create a tag named +tag+. Other than the first argument which is the tag name,
62
+ # the arguments are the same as the tags implemented via method_missing.
63
+ def tag!(tag, *args, &block)
64
+ ele_id = nil
65
+ if @auto_validation and @tagset
66
+ if !@tagset.tagset.has_key?(tag)
67
+ raise InvalidXhtmlError, "no element `#{tag}' for #{tagset.doctype}"
68
+ elsif args.last.respond_to?(:to_hash)
69
+ attrs = args.last.to_hash
70
+
71
+ if @tagset.forms.include?(tag) and attrs[:id]
72
+ attrs[:name] ||= attrs[:id]
73
+ end
74
+
75
+ attrs.each do |k, v|
76
+ atname = k.to_s.downcase.intern
77
+ unless k =~ /:/ or @tagset.tagset[tag].include? atname
78
+ raise InvalidXhtmlError, "no attribute `#{k}' on #{tag} elements"
79
+ end
80
+ if atname == :id
81
+ ele_id = v.to_s
82
+ if @elements.has_key? ele_id
83
+ raise InvalidXhtmlError, "id `#{ele_id}' already used (id's must be unique)."
84
+ end
85
+ end
86
+ end
87
+ end
88
+ end
89
+
90
+ # turn arguments into children or attributes
91
+ childs = []
92
+ attrs = args.grep(Hash)
93
+ childs.concat((args - attrs).flatten.map do |x|
94
+ if x.respond_to? :to_html
95
+ Hpricot.make(x.to_html)
96
+ elsif x
97
+ Text.new(x.fast_xs)
98
+ end
99
+ end.flatten)
100
+ attrs = attrs.inject({}) do |hsh, ath|
101
+ ath.each do |k, v|
102
+ hsh[k] = v.to_s.fast_xs if v
103
+ end
104
+ hsh
105
+ end
106
+
107
+ # create the element itself
108
+ tag = tag.to_s
109
+ f = Elem.new(tag, attrs, childs, ETag.new(tag))
110
+
111
+ # build children from the block
112
+ if block
113
+ build(f, &block)
114
+ end
115
+
116
+ add_child f
117
+ f
118
+ end
119
+
120
+ def build(*a, &b)
121
+ Hpricot.build(*a, &b)
122
+ end
123
+
124
+ # Every HTML tag method goes through an html_tag call. So, calling <tt>div</tt> is equivalent
125
+ # to calling <tt>html_tag(:div)</tt>. All HTML tags in Hpricot's list are given generated wrappers
126
+ # for this method.
127
+ #
128
+ # If the @auto_validation setting is on, this method will check for many common mistakes which
129
+ # could lead to invalid XHTML.
130
+ def html_tag(sym, *args, &block)
131
+ if @auto_validation and @tagset.self_closing.include?(sym) and block
132
+ raise InvalidXhtmlError, "the `#{sym}' element is self-closing, please remove the block"
133
+ elsif args.empty? and block.nil?
134
+ CssProxy.new(self, sym)
135
+ else
136
+ tag!(sym, *args, &block)
137
+ end
138
+ end
139
+
140
+ XHTMLTransitional.tags.each do |k|
141
+ class_eval %{
142
+ def #{k}(*args, &block)
143
+ html_tag(#{k.inspect}, *args, &block)
144
+ end
145
+ }
146
+ end
147
+
148
+ def doctype(target, pub, sys)
149
+ add_child DocType.new(target, pub, sys)
150
+ end
151
+
152
+ remove_method :head
153
+
154
+ # Builds a head tag. Adds a <tt>meta</tt> tag inside with Content-Type
155
+ # set to <tt>text/html; charset=utf-8</tt>.
156
+ def head(*args, &block)
157
+ tag!(:head, *args) do
158
+ tag!(:meta, "http-equiv" => "Content-Type", "content" => "text/html; charset=utf-8") if @output_meta_tag
159
+ instance_eval(&block)
160
+ end
161
+ end
162
+
163
+ # Builds an html tag. An XML 1.0 instruction and an XHTML 1.0 Transitional doctype
164
+ # are prepended. Also assumes <tt>:xmlns => "http://www.w3.org/1999/xhtml",
165
+ # :lang => "en"</tt>.
166
+ def xhtml_transitional(attrs = {}, &block)
167
+ # self.tagset = Hpricot::XHTMLTransitional
168
+ xhtml_html(attrs, &block)
169
+ end
170
+
171
+ # Builds an html tag with XHTML 1.0 Strict doctype instead.
172
+ def xhtml_strict(attrs = {}, &block)
173
+ # self.tagset = Hpricot::XHTMLStrict
174
+ xhtml_html(attrs, &block)
175
+ end
176
+
177
+ private
178
+
179
+ def xhtml_html(attrs = {}, &block)
180
+ instruct! if @output_xml_instruction
181
+ doctype(:html, *@@default[:tagset].doctype)
182
+ tag!(:html, @@default[:root_attributes].merge(attrs), &block)
183
+ end
184
+
185
+ end
186
+
187
+ # Class used by Hpricot::Builder to store element options. Methods called
188
+ # against the CssProxy object are added as element classes or IDs.
189
+ #
190
+ # See the README for examples.
191
+ class CssProxy < BlankSlate
192
+
193
+ # Creates a CssProxy object.
194
+ def initialize(builder, sym)
195
+ @builder, @sym, @attrs = builder, sym, {}
196
+ end
197
+
198
+ # Adds attributes to an element. Bang methods set the :id attribute.
199
+ # Other methods add to the :class attribute.
200
+ def method_missing(id_or_class, *args, &block)
201
+ if (idc = id_or_class.to_s) =~ /!$/
202
+ @attrs[:id] = $`
203
+ else
204
+ @attrs[:class] = @attrs[:class].nil? ? idc : "#{@attrs[:class]} #{idc}".strip
205
+ end
206
+
207
+ if block or args.any?
208
+ args.push(@attrs)
209
+ return @builder.tag!(@sym, *args, &block)
210
+ end
211
+
212
+ return self
213
+ end
214
+
215
+ end
216
+ end
@@ -0,0 +1,514 @@
1
+ module Hpricot
2
+ # Once you've matched a list of elements, you will often need to handle them as
3
+ # a group. Or you may want to perform the same action on each of them.
4
+ # Hpricot::Elements is an extension of Ruby's array class, with some methods
5
+ # added for altering elements contained in the array.
6
+ #
7
+ # If you need to create an element array from regular elements:
8
+ #
9
+ # Hpricot::Elements[ele1, ele2, ele3]
10
+ #
11
+ # Assuming that ele1, ele2 and ele3 contain element objects (Hpricot::Elem,
12
+ # Hpricot::Doc, etc.)
13
+ #
14
+ # == Continuing Searches
15
+ #
16
+ # Usually the Hpricot::Elements you're working on comes from a search you've
17
+ # done. Well, you can continue searching the list by using the same <tt>at</tt>
18
+ # and <tt>search</tt> methods you can use on plain elements.
19
+ #
20
+ # elements = doc.search("/div/p")
21
+ # elements = elements.search("/a[@href='http://hoodwink.d/']")
22
+ # elements = elements.at("img")
23
+ #
24
+ # == Altering Elements
25
+ #
26
+ # When you're altering elements in the list, your changes will be reflected in
27
+ # the document you started searching from.
28
+ #
29
+ # doc = Hpricot("That's my <b>spoon</b>, Tyler.")
30
+ # doc.at("b").swap("<i>fork</i>")
31
+ # doc.to_html
32
+ # #=> "That's my <i>fork</i>, Tyler."
33
+ #
34
+ # == Getting More Detailed
35
+ #
36
+ # If you can't find a method here that does what you need, you may need to
37
+ # loop through the elements and find a method in Hpricot::Container::Trav
38
+ # which can do what you need.
39
+ #
40
+ # For example, you may want to search for all the H3 header tags in a document
41
+ # and grab all the tags underneath the header, but not inside the header.
42
+ # A good method for this is <tt>next_sibling</tt>:
43
+ #
44
+ # doc.search("h3").each do |h3|
45
+ # while ele = h3.next_sibling
46
+ # ary << ele # stuff away all the elements under the h3
47
+ # end
48
+ # end
49
+ #
50
+ # Most of the useful element methods are in the mixins Hpricot::Traverse
51
+ # and Hpricot::Container::Trav.
52
+ class Elements < Array
53
+
54
+ # Searches this list for any elements (or children of these elements) matching
55
+ # the CSS or XPath expression +expr+. Root is assumed to be the element scanned.
56
+ #
57
+ # See Hpricot::Container::Trav.search for more.
58
+ def search(*expr,&blk)
59
+ Elements[*map { |x| x.search(*expr,&blk) }.flatten.uniq]
60
+ end
61
+ alias_method :/, :search
62
+
63
+ # Searches this list for the first element (or child of these elements) matching
64
+ # the CSS or XPath expression +expr+. Root is assumed to be the element scanned.
65
+ #
66
+ # See Hpricot::Container::Trav.at for more.
67
+ def at(expr, &blk)
68
+ if expr.kind_of? Fixnum
69
+ super
70
+ else
71
+ search(expr, &blk)[0]
72
+ end
73
+ end
74
+ alias_method :%, :at
75
+
76
+ # Convert this group of elements into a complete HTML fragment, returned as a
77
+ # string.
78
+ def to_html
79
+ map { |x| x.output("") }.join
80
+ end
81
+ alias_method :to_s, :to_html
82
+
83
+ # Returns an HTML fragment built of the contents of each element in this list.
84
+ #
85
+ # If a HTML +string+ is supplied, this method acts like inner_html=.
86
+ def inner_html(*string)
87
+ if string.empty?
88
+ map { |x| x.inner_html }.join
89
+ else
90
+ x = self.inner_html = string.pop || x
91
+ end
92
+ end
93
+ alias_method :html, :inner_html
94
+ alias_method :innerHTML, :inner_html
95
+
96
+ # Replaces the contents of each element in this list. Supply an HTML +string+,
97
+ # which is loaded into Hpricot objects and inserted into every element in this
98
+ # list.
99
+ def inner_html=(string)
100
+ each { |x| x.inner_html = string }
101
+ end
102
+ alias_method :html=, :inner_html=
103
+ alias_method :innerHTML=, :inner_html=
104
+
105
+ # Returns a string containing the text contents of each element in this list.
106
+ # All HTML tags are removed.
107
+ def inner_text
108
+ map { |x| x.inner_text }.join
109
+ end
110
+ alias_method :text, :inner_text
111
+
112
+ # Remove all elements in this list from the document which contains them.
113
+ #
114
+ # doc = Hpricot("<html>Remove this: <b>here</b></html>")
115
+ # doc.search("b").remove
116
+ # doc.to_html
117
+ # => "<html>Remove this: </html>"
118
+ #
119
+ def remove
120
+ each { |x| x.parent.children.delete(x) }
121
+ end
122
+
123
+ # Empty the elements in this list, by removing their insides.
124
+ #
125
+ # doc = Hpricot("<p> We have <i>so much</i> to say.</p>")
126
+ # doc.search("i").empty
127
+ # doc.to_html
128
+ # => "<p> We have <i></i> to say.</p>"
129
+ #
130
+ def empty
131
+ each { |x| x.inner_html = nil }
132
+ end
133
+
134
+ # Add to the end of the contents inside each element in this list.
135
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
136
+ def append(str = nil, &blk)
137
+ each { |x| x.html(x.children + x.make(str, &blk)) }
138
+ end
139
+
140
+ # Add to the start of the contents inside each element in this list.
141
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
142
+ def prepend(str = nil, &blk)
143
+ each { |x| x.html(x.make(str, &blk) + x.children) }
144
+ end
145
+
146
+ # Add some HTML just previous to each element in this list.
147
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
148
+ def before(str = nil, &blk)
149
+ each { |x| x.parent.insert_before x.make(str, &blk), x }
150
+ end
151
+
152
+ # Just after each element in this list, add some HTML.
153
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
154
+ def after(str = nil, &blk)
155
+ each { |x| x.parent.insert_after x.make(str, &blk), x }
156
+ end
157
+
158
+ # Wraps each element in the list inside the element created by HTML +str+.
159
+ # If more than one element is found in the string, Hpricot locates the
160
+ # deepest spot inside the first element.
161
+ #
162
+ # doc.search("a[@href]").
163
+ # wrap(%{<div class="link"><div class="link_inner"></div></div>})
164
+ #
165
+ # This code wraps every link on the page inside a +div.link+ and a +div.link_inner+ nest.
166
+ def wrap(str = nil, &blk)
167
+ each do |x|
168
+ wrap = x.make(str, &blk)
169
+ nest = wrap.detect { |w| w.respond_to? :children }
170
+ unless nest
171
+ raise "No wrapping element found."
172
+ end
173
+ x.parent.replace_child(x, wrap)
174
+ nest = nest.children.first until nest.empty?
175
+ nest.html([x])
176
+ end
177
+ end
178
+
179
+ # Gets and sets attributes on all matched elements.
180
+ #
181
+ # Pass in a +key+ on its own and this method will return the string value
182
+ # assigned to that attribute for the first element. Or +nil+ if the
183
+ # attribute isn't found.
184
+ #
185
+ # doc.search("a").attr("href")
186
+ # #=> "http://hacketyhack.net/"
187
+ #
188
+ # Or, pass in a +key+ and +value+. This will set an attribute for all
189
+ # matched elements.
190
+ #
191
+ # doc.search("p").attr("class", "basic")
192
+ #
193
+ # You may also use a Hash to set a series of attributes:
194
+ #
195
+ # (doc/"a").attr(:class => "basic", :href => "http://hackety.org/")
196
+ #
197
+ # Lastly, a block can be used to rewrite an attribute based on the element
198
+ # it belongs to. The block will pass in an element. Return from the block
199
+ # the new value of the attribute.
200
+ #
201
+ # records.attr("href") { |e| e['href'] + "#top" }
202
+ #
203
+ # This example adds a <tt>#top</tt> anchor to each link.
204
+ #
205
+ def attr key, value = nil, &blk
206
+ if value or blk
207
+ each do |el|
208
+ el.set_attribute(key, value || blk[el])
209
+ end
210
+ return self
211
+ end
212
+ if key.is_a? Hash
213
+ key.each { |k,v| self.attr(k,v) }
214
+ return self
215
+ else
216
+ return self[0].get_attribute(key)
217
+ end
218
+ end
219
+ alias_method :set, :attr
220
+
221
+ # Adds the class to all matched elements.
222
+ #
223
+ # (doc/"p").add_class("bacon")
224
+ #
225
+ # Now all paragraphs will have class="bacon".
226
+ def add_class class_name
227
+ each do |el|
228
+ next unless el.respond_to? :get_attribute
229
+ classes = el.get_attribute('class').to_s.split(" ")
230
+ el.set_attribute('class', classes.push(class_name).uniq.join(" "))
231
+ end
232
+ self
233
+ end
234
+
235
+ # Remove an attribute from each of the matched elements.
236
+ #
237
+ # (doc/"input").remove_attr("disabled")
238
+ #
239
+ def remove_attr name
240
+ each do |el|
241
+ next unless el.respond_to? :remove_attribute
242
+ el.remove_attribute(name)
243
+ end
244
+ self
245
+ end
246
+
247
+ # Removes a class from all matched elements.
248
+ #
249
+ # (doc/"span").remove_class("lightgrey")
250
+ #
251
+ # Or, to remove all classes:
252
+ #
253
+ # (doc/"span").remove_class
254
+ #
255
+ def remove_class name = nil
256
+ each do |el|
257
+ next unless el.respond_to? :get_attribute
258
+ if name
259
+ classes = el.get_attribute('class').to_s.split(" ")
260
+ el.set_attribute('class', (classes - [name]).uniq.join(" "))
261
+ else
262
+ el.remove_attribute("class")
263
+ end
264
+ end
265
+ self
266
+ end
267
+
268
+ ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i # " (for emacs)
269
+ BRACK_RE = %r!(\[) *([^\]]*) *\]+!i
270
+ FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)!
271
+ CUST_RE = %r!(:)([a-zA-Z0-9\*_-]*)()!
272
+ CATCH_RE = %r!([:\.#]*)([a-zA-Z0-9\*_-]+)!
273
+
274
+ def self.filter(nodes, expr, truth = true)
275
+ until expr.empty?
276
+ _, *m = *expr.match(/^(?:#{ATTR_RE}|#{BRACK_RE}|#{FUNC_RE}|#{CUST_RE}|#{CATCH_RE})/)
277
+ break unless _
278
+
279
+ expr = $'
280
+ m.compact!
281
+ if m[0] == '@'
282
+ m[0] = "@#{m.slice!(2,1).join}"
283
+ end
284
+
285
+ if m[0] == '[' && m[1] =~ /^\d+$/
286
+ m = [":", "nth", m[1].to_i-1]
287
+ end
288
+
289
+ if m[0] == ":" && m[1] == "not"
290
+ nodes, = Elements.filter(nodes, m[2], false)
291
+ elsif "#{m[0]}#{m[1]}" =~ /^(:even|:odd)$/
292
+ new_nodes = []
293
+ nodes.each_with_index {|n,i| new_nodes.push(n) if (i % 2 == (m[1] == "even" ? 0 : 1)) }
294
+ nodes = new_nodes
295
+ elsif "#{m[0]}#{m[1]}" =~ /^(:first|:last)$/
296
+ nodes = [nodes.send(m[1])]
297
+ else
298
+ meth = "filter[#{m[0]}#{m[1]}]" unless m[0].empty?
299
+ if meth and Traverse.method_defined? meth
300
+ args = m[2..-1]
301
+ else
302
+ meth = "filter[#{m[0]}]"
303
+ if Traverse.method_defined? meth
304
+ args = m[1..-1]
305
+ end
306
+ end
307
+ args << -1
308
+ nodes = Elements[*nodes.find_all do |x|
309
+ args[-1] += 1
310
+ x.send(meth, *args) ? truth : !truth
311
+ end]
312
+ end
313
+ end
314
+ [nodes, expr]
315
+ end
316
+
317
+ # Given two elements, attempt to gather an Elements array of everything between
318
+ # (and including) those two elements.
319
+ def self.expand(ele1, ele2, excl=false)
320
+ ary = []
321
+ offset = excl ? -1 : 0
322
+
323
+ if ele1 and ele2
324
+ # let's quickly take care of siblings
325
+ if ele1.parent == ele2.parent
326
+ ary = ele1.parent.children[ele1.node_position..(ele2.node_position+offset)]
327
+ else
328
+ # find common parent
329
+ p, ele1_p = ele1, [ele1]
330
+ ele1_p.unshift p while p.respond_to?(:parent) and p = p.parent
331
+ p, ele2_p = ele2, [ele2]
332
+ ele2_p.unshift p while p.respond_to?(:parent) and p = p.parent
333
+ common_parent = ele1_p.zip(ele2_p).select { |p1, p2| p1 == p2 }.flatten.last
334
+
335
+ child = nil
336
+ if ele1 == common_parent
337
+ child = ele2
338
+ elsif ele2 == common_parent
339
+ child = ele1
340
+ end
341
+
342
+ if child
343
+ ary = common_parent.children[0..(child.node_position+offset)]
344
+ end
345
+ end
346
+ end
347
+
348
+ return Elements[*ary]
349
+ end
350
+
351
+ def filter(expr)
352
+ nodes, = Elements.filter(self, expr)
353
+ nodes
354
+ end
355
+
356
+ def not(expr)
357
+ if expr.is_a? Traverse
358
+ nodes = self - [expr]
359
+ else
360
+ nodes, = Elements.filter(self, expr, false)
361
+ end
362
+ nodes
363
+ end
364
+
365
+ private
366
+ def copy_node(node, l)
367
+ l.instance_variables.each do |iv|
368
+ node.instance_variable_set(iv, l.instance_variable_get(iv))
369
+ end
370
+ end
371
+
372
+ end
373
+
374
+ module Traverse
375
+ def self.filter(tok, &blk)
376
+ define_method("filter[#{tok.is_a?(String) ? tok : tok.inspect}]", &blk)
377
+ end
378
+
379
+ filter '' do |name,i|
380
+ name == '*' || (self.respond_to?(:name) && self.name.downcase == name.downcase)
381
+ end
382
+
383
+ filter '#' do |id,i|
384
+ self.elem? and get_attribute('id').to_s == id
385
+ end
386
+
387
+ filter '.' do |name,i|
388
+ self.elem? and classes.include? name
389
+ end
390
+
391
+ filter :lt do |num,i|
392
+ self.position < num.to_i
393
+ end
394
+
395
+ filter :gt do |num,i|
396
+ self.position > num.to_i
397
+ end
398
+
399
+ nth = proc { |num,i| self.position == num.to_i }
400
+ nth_first = proc { |*a| self.position == 0 }
401
+ nth_last = proc { |*a| self == parent.children_of_type(self.name).last }
402
+
403
+ filter :nth, &nth
404
+ filter :eq, &nth
405
+ filter ":nth-of-type", &nth
406
+
407
+ filter :first, &nth_first
408
+ filter ":first-of-type", &nth_first
409
+
410
+ filter :last, &nth_last
411
+ filter ":last-of-type", &nth_last
412
+
413
+ filter :even do |num,i|
414
+ self.position % 2 == 0
415
+ end
416
+
417
+ filter :odd do |num,i|
418
+ self.position % 2 == 1
419
+ end
420
+
421
+ filter ':first-child' do |i|
422
+ self == parent.containers.first
423
+ end
424
+
425
+ filter ':nth-child' do |arg,i|
426
+ case arg
427
+ when 'even'; (parent.containers.index(self) + 1) % 2 == 0
428
+ when 'odd'; (parent.containers.index(self) + 1) % 2 == 1
429
+ else self == (parent.containers[arg.to_i - 1])
430
+ end
431
+ end
432
+
433
+ filter ":last-child" do |i|
434
+ self == parent.containers.last
435
+ end
436
+
437
+ filter ":nth-last-child" do |arg,i|
438
+ self == parent.containers[-1-arg.to_i]
439
+ end
440
+
441
+ filter ":nth-last-of-type" do |arg,i|
442
+ self == parent.children_of_type(self.name)[-1-arg.to_i]
443
+ end
444
+
445
+ filter ":only-of-type" do |arg,i|
446
+ parent.children_of_type(self.name).length == 1
447
+ end
448
+
449
+ filter ":only-child" do |arg,i|
450
+ parent.containers.length == 1
451
+ end
452
+
453
+ filter :parent do |*a|
454
+ containers.length > 0
455
+ end
456
+
457
+ filter :empty do |*a|
458
+ elem? && inner_html.length == 0
459
+ end
460
+
461
+ filter :root do |*a|
462
+ self.is_a? Hpricot::Doc
463
+ end
464
+
465
+ filter 'text' do |*a|
466
+ self.text?
467
+ end
468
+
469
+ filter 'comment' do |*a|
470
+ self.comment?
471
+ end
472
+
473
+ filter :contains do |arg, ignore|
474
+ html.include? arg
475
+ end
476
+
477
+
478
+
479
+ pred_procs =
480
+ {'text()' => proc { |ele, *_| ele.inner_text.strip },
481
+ '@' => proc { |ele, attr, *_| ele.get_attribute(attr).to_s if ele.elem? }}
482
+
483
+ oper_procs =
484
+ {'=' => proc { |a,b| a == b },
485
+ '!=' => proc { |a,b| a != b },
486
+ '~=' => proc { |a,b| a.split(/\s+/).include?(b) },
487
+ '|=' => proc { |a,b| a =~ /^#{Regexp::quote b}(-|$)/ },
488
+ '^=' => proc { |a,b| a.index(b) == 0 },
489
+ '$=' => proc { |a,b| a =~ /#{Regexp::quote b}$/ },
490
+ '*=' => proc { |a,b| idx = a.index(b) }}
491
+
492
+ pred_procs.each do |pred_n, pred_f|
493
+ oper_procs.each do |oper_n, oper_f|
494
+ filter "#{pred_n}#{oper_n}" do |*a|
495
+ qual = pred_f[self, *a]
496
+ oper_f[qual, a[-2]] if qual
497
+ end
498
+ end
499
+ end
500
+
501
+ filter 'text()' do |val,i|
502
+ self.children.grep(Hpricot::Text).detect { |x| x.content =~ /\S/ } if self.children
503
+ end
504
+
505
+ filter '@' do |attr,val,i|
506
+ self.elem? and has_attribute? attr
507
+ end
508
+
509
+ filter '[' do |val,i|
510
+ self.elem? and search(val).length > 0
511
+ end
512
+
513
+ end
514
+ end