hpricot 0.8.3-i386-mswin32
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +104 -0
- data/COPYING +18 -0
- data/README.md +276 -0
- data/Rakefile +234 -0
- data/ext/fast_xs/FastXsService.java +1123 -0
- data/ext/fast_xs/extconf.rb +4 -0
- data/ext/fast_xs/fast_xs.c +210 -0
- data/ext/hpricot_scan/HpricotCss.java +850 -0
- data/ext/hpricot_scan/HpricotScanService.java +2099 -0
- data/ext/hpricot_scan/extconf.rb +9 -0
- data/ext/hpricot_scan/hpricot_common.rl +76 -0
- data/ext/hpricot_scan/hpricot_css.c +3511 -0
- data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
- data/ext/hpricot_scan/hpricot_css.rl +120 -0
- data/ext/hpricot_scan/hpricot_scan.c +7039 -0
- data/ext/hpricot_scan/hpricot_scan.h +79 -0
- data/ext/hpricot_scan/hpricot_scan.java.rl +1161 -0
- data/ext/hpricot_scan/hpricot_scan.rl +896 -0
- data/extras/hpricot.png +0 -0
- data/lib/fast_xs.rb +1 -0
- data/lib/fast_xs/1.8/fast_xs.so +0 -0
- data/lib/fast_xs/1.9/fast_xs.so +0 -0
- data/lib/hpricot.rb +26 -0
- data/lib/hpricot/blankslate.rb +63 -0
- data/lib/hpricot/builder.rb +216 -0
- data/lib/hpricot/elements.rb +514 -0
- data/lib/hpricot/htmlinfo.rb +691 -0
- data/lib/hpricot/inspect.rb +103 -0
- data/lib/hpricot/modules.rb +40 -0
- data/lib/hpricot/parse.rb +38 -0
- data/lib/hpricot/tag.rb +219 -0
- data/lib/hpricot/tags.rb +164 -0
- data/lib/hpricot/traverse.rb +839 -0
- data/lib/hpricot/xchar.rb +94 -0
- data/lib/hpricot_scan.rb +1 -0
- data/lib/hpricot_scan/1.8/hpricot_scan.so +0 -0
- data/lib/hpricot_scan/1.9/hpricot_scan.so +0 -0
- data/test/files/basic.xhtml +17 -0
- data/test/files/boingboing.html +2266 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/immob.html +400 -0
- data/test/files/pace_application.html +1320 -0
- data/test/files/tenderlove.html +16 -0
- data/test/files/uswebgen.html +220 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/files/why.xml +19 -0
- data/test/load_files.rb +7 -0
- data/test/nokogiri-bench.rb +64 -0
- data/test/test_alter.rb +96 -0
- data/test/test_builder.rb +37 -0
- data/test/test_parser.rb +457 -0
- data/test/test_paths.rb +25 -0
- data/test/test_preserved.rb +88 -0
- data/test/test_xml.rb +28 -0
- metadata +128 -0
data/extras/hpricot.png
ADDED
Binary file
|
data/lib/fast_xs.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "fast_xs/#{RUBY_VERSION.sub(/\.\d+$/, '')}/fast_xs"
|
Binary file
|
Binary file
|
data/lib/hpricot.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# == About hpricot.rb
|
2
|
+
#
|
3
|
+
# All of Hpricot's various parts are loaded when you use <tt>require 'hpricot'</tt>.
|
4
|
+
#
|
5
|
+
# * hpricot_scan: the scanner (a C extension for Ruby) which turns an HTML stream into tokens.
|
6
|
+
# * hpricot/parse.rb: uses the scanner to sort through tokens and give you back a complete document object.
|
7
|
+
# * hpricot/tag.rb: sets up objects for the various types of elements in an HTML document.
|
8
|
+
# * hpricot/modules.rb: categorizes the various elements using mixins.
|
9
|
+
# * hpricot/traverse.rb: methods for searching documents.
|
10
|
+
# * hpricot/elements.rb: methods for dealing with a group of elements as an Hpricot::Elements list.
|
11
|
+
# * hpricot/inspect.rb: methods for displaying documents in a readable form.
|
12
|
+
|
13
|
+
# If available, Nikolai's UTF-8 library will ease use of utf-8 documents.
|
14
|
+
# See http://git.bitwi.se/ruby-character-encodings.git/.
|
15
|
+
begin
|
16
|
+
require 'encoding/character/utf-8'
|
17
|
+
rescue LoadError
|
18
|
+
end
|
19
|
+
|
20
|
+
require 'hpricot_scan'
|
21
|
+
require 'hpricot/tag'
|
22
|
+
require 'hpricot/modules'
|
23
|
+
require 'hpricot/traverse'
|
24
|
+
require 'hpricot/inspect'
|
25
|
+
require 'hpricot/parse'
|
26
|
+
require 'hpricot/builder'
|
@@ -0,0 +1,63 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#--
|
3
|
+
# Copyright 2004 by Jim Weirich (jim@weirichhouse.org).
|
4
|
+
# All rights reserved.
|
5
|
+
|
6
|
+
# Permission is granted for use, copying, modification, distribution,
|
7
|
+
# and distribution of modified versions of this work as long as the
|
8
|
+
# above copyright notice is included.
|
9
|
+
#++
|
10
|
+
|
11
|
+
module Hpricot

  # BlankSlate provides an abstract base class with no predefined
  # methods (except for <tt>\_\_send__</tt> and <tt>\_\_id__</tt>).
  # BlankSlate is useful as a base class when writing classes that
  # depend upon <tt>method_missing</tt> (e.g. dynamic proxies).
  class BlankSlate
    class << self

      # Hide the method named +name+ in the BlankSlate class.  Don't
      # hide +instance_eval+ or any method beginning with "__".
      #
      # +name+ may be a String or a Symbol.  Names that are not currently
      # defined as instance methods are ignored.
      #
      # Membership is tested with +method_defined?+ instead of
      # <tt>instance_methods.include?(name.to_s)</tt>: +instance_methods+
      # returns Symbols on Ruby 1.9 and later, so comparing against a
      # String never matched there and no method was ever hidden.
      def hide(name)
        return if name.to_s =~ /^(__|instance_eval)/
        undef_method(name) if method_defined?(name)
      end
    end

    # Strip every method inherited at the time this class is defined.
    instance_methods.each { |m| hide(m) }
  end
end
|
32
|
+
|
33
|
+
# Since Ruby is very dynamic, methods added to the ancestors of
|
34
|
+
# BlankSlate <em>after BlankSlate is defined</em> will show up in the
|
35
|
+
# list of available BlankSlate methods. We handle this by defining a
|
36
|
+
# hook in the Object and Kernel classes that will hide any newly defined method.
|
37
|
+
module Kernel
  class << self
    # Keep whatever hook was installed before reachable under a new name,
    # so it still runs first.
    alias_method :hpricot_slate_method_added, :method_added
  end

  # Called by Ruby whenever an instance method is added to a module.
  # Chains to the previous hook, then, only when the receiver is Kernel
  # itself, asks Hpricot::BlankSlate to hide the new method.
  def self.method_added(name)
    hpricot_slate_method_added(name)
    Hpricot::BlankSlate.hide(name) if self == Kernel
  end
end
|
50
|
+
|
51
|
+
class Object
  class << self
    # Keep whatever hook was installed before reachable under a new name,
    # so it still runs first.
    alias_method :hpricot_slate_method_added, :method_added
  end

  # Called by Ruby whenever an instance method is added to a class.
  # Subclasses inherit this singleton method, so the receiver is checked:
  # only methods added to Object itself are hidden in Hpricot::BlankSlate.
  def self.method_added(name)
    hpricot_slate_method_added(name)
    Hpricot::BlankSlate.hide(name) if self == Object
  end
end
|
@@ -0,0 +1,216 @@
|
|
1
|
+
require 'hpricot/tags'
|
2
|
+
require 'fast_xs'
|
3
|
+
require 'hpricot/blankslate'
|
4
|
+
require 'hpricot/htmlinfo'
|
5
|
+
|
6
|
+
module Hpricot
|
7
|
+
# XML unescape.
#
# Converts +str+ to a String with +to_s+ and replaces character references
# with the characters they stand for:
#
# * <tt>&name;</tt> is looked up in NamedCharacters; an unknown name
#   becomes "?" (code point 63).
# * <tt>&#NNN;</tt> is read as a decimal code point.
# * <tt>&#xHH;</tt> is read as a hexadecimal code point.
#
# All references are replaced in a single pass.  Running one +gsub+ for
# named references and then another for numeric ones decoded text twice:
# <tt>&amp;#65;</tt> first became <tt>&#65;</tt> and then "A".
def self.uxs(str)
  str.to_s.gsub(/&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(\w+));/) do
    code =
      if $1
        $1.to_i
      elsif $2
        $2.hex
      else
        NamedCharacters[$3] || 63 # 63 = "?" (query char)
      end
    [code].pack("U*")
  end
end
|
13
|
+
|
14
|
+
# Extends +ele+ with the Builder DSL, copies each key/value pair of
# +assigns+ into an instance variable named after the key, evaluates the
# block (if any) with +ele+ as +self+, and returns +ele+.
#
# When +ele+ is omitted a new Doc is created.
def self.build(ele = Doc.new, assigns = {}, &blk)
  ele.extend Builder
  assigns.each do |k, v|
    ele.instance_variable_set("@#{k}", v)
  end
  # instance_eval raises ArgumentError when handed neither a block nor a
  # string, so skip it when the caller supplied no block.
  ele.instance_eval(&blk) if blk
  ele
end
|
22
|
+
|
23
|
+
module Builder
|
24
|
+
|
25
|
+
@@default = {
|
26
|
+
:indent => 0,
|
27
|
+
:output_helpers => true,
|
28
|
+
:output_xml_instruction => true,
|
29
|
+
:output_meta_tag => true,
|
30
|
+
:auto_validation => true,
|
31
|
+
:tagset => Hpricot::XHTMLTransitional,
|
32
|
+
:root_attributes => {
|
33
|
+
:xmlns => 'http://www.w3.org/1999/xhtml', :'xml:lang' => 'en', :lang => 'en'
|
34
|
+
}
|
35
|
+
}
|
36
|
+
|
37
|
+
def self.set(option, value)
|
38
|
+
@@default[option] = value
|
39
|
+
end
|
40
|
+
|
41
|
+
def add_child ele
|
42
|
+
ele.parent = self
|
43
|
+
self.children ||= []
|
44
|
+
self.children << ele
|
45
|
+
ele
|
46
|
+
end
|
47
|
+
|
48
|
+
# Write a +string+ to the HTML stream, making sure to escape it.
|
49
|
+
def text!(string)
|
50
|
+
add_child Text.new(string.fast_xs)
|
51
|
+
end
|
52
|
+
|
53
|
+
# Write a +string+ to the HTML stream without escaping it.
|
54
|
+
def text(string)
|
55
|
+
add_child Text.new(string)
|
56
|
+
nil
|
57
|
+
end
|
58
|
+
alias_method :<<, :text
|
59
|
+
alias_method :concat, :text
|
60
|
+
|
61
|
+
# Create a tag named +tag+. Other than the first argument which is the tag name,
# the arguments are the same as the tags implemented via method_missing.
#
# Hash arguments are merged into the element's attributes (values are
# converted with +to_s+ and escaped with +fast_xs+; pairs with a nil or
# false value are dropped).  Every other argument becomes a child: objects
# responding to +to_html+ are parsed with Hpricot.make, anything else truthy
# is escaped and wrapped in a Text node.  A block, if given, is evaluated by
# +build+ against the new element.  The element is appended to the receiver
# with +add_child+ and returned.
#
# When both <tt>@auto_validation</tt> and <tt>@tagset</tt> are set, raises
# InvalidXhtmlError for a tag name missing from the tagset, for an attribute
# the tagset does not list for that tag (names containing ":" are exempt),
# and for an id already present in <tt>@elements</tt>.
def tag!(tag, *args, &block)
  ele_id = nil
  if @auto_validation and @tagset
    if !@tagset.tagset.has_key?(tag)
      # Use the @tagset instance variable: no +tagset+ method is visible in
      # this module, so the bare call raised NameError instead of the
      # intended InvalidXhtmlError.
      raise InvalidXhtmlError, "no element `#{tag}' for #{@tagset.doctype}"
    elsif args.last.respond_to?(:to_hash)
      attrs = args.last.to_hash

      # Form elements given an id but no name reuse the id as their name.
      if @tagset.forms.include?(tag) and attrs[:id]
        attrs[:name] ||= attrs[:id]
      end

      attrs.each do |k, v|
        atname = k.to_s.downcase.intern
        unless k.to_s =~ /:/ or @tagset.tagset[tag].include? atname
          raise InvalidXhtmlError, "no attribute `#{k}' on #{tag} elements"
        end
        if atname == :id
          ele_id = v.to_s
          # @elements is only consulted when it has been provided; nothing
          # visible here initializes it, and calling has_key? on nil raised
          # NoMethodError.
          if @elements and @elements.has_key? ele_id
            raise InvalidXhtmlError, "id `#{ele_id}' already used (id's must be unique)."
          end
        end
      end
    end
  end

  # turn arguments into children or attributes
  childs = []
  attrs = args.grep(Hash)
  childs.concat((args - attrs).flatten.map do |x|
    if x.respond_to? :to_html
      Hpricot.make(x.to_html)
    elsif x
      Text.new(x.fast_xs)
    end
  end.flatten)
  attrs = attrs.inject({}) do |hsh, ath|
    ath.each do |k, v|
      hsh[k] = v.to_s.fast_xs if v
    end
    hsh
  end

  # create the element itself
  tag = tag.to_s
  f = Elem.new(tag, attrs, childs, ETag.new(tag))

  # build children from the block
  if block
    build(f, &block)
  end

  add_child f
  f
end
|
119
|
+
|
120
|
+
def build(*a, &b)
|
121
|
+
Hpricot.build(*a, &b)
|
122
|
+
end
|
123
|
+
|
124
|
+
# Common entry point for the generated HTML tag methods: calling <tt>div</tt>
# is the same as calling <tt>html_tag(:div)</tt>.
#
# * With <tt>@auto_validation</tt> on, a block passed to a tag that the
#   tagset lists as self-closing raises InvalidXhtmlError.
# * With no arguments and no block, a CssProxy for the tag is returned so
#   that classes and ids can be chained onto it.
# * Otherwise the call is handed to <tt>tag!</tt>.
def html_tag(sym, *args, &block)
  if @auto_validation && @tagset.self_closing.include?(sym) && block
    raise InvalidXhtmlError, "the `#{sym}' element is self-closing, please remove the block"
  end
  return CssProxy.new(self, sym) if args.empty? && block.nil?

  tag!(sym, *args, &block)
end
|
139
|
+
|
140
|
+
XHTMLTransitional.tags.each do |k|
|
141
|
+
class_eval %{
|
142
|
+
def #{k}(*args, &block)
|
143
|
+
html_tag(#{k.inspect}, *args, &block)
|
144
|
+
end
|
145
|
+
}
|
146
|
+
end
|
147
|
+
|
148
|
+
def doctype(target, pub, sys)
|
149
|
+
add_child DocType.new(target, pub, sys)
|
150
|
+
end
|
151
|
+
|
152
|
+
remove_method :head
|
153
|
+
|
154
|
+
# Builds a head tag.  When <tt>@output_meta_tag</tt> is set, adds a
# <tt>meta</tt> tag inside with Content-Type set to
# <tt>text/html; charset=utf-8</tt>.  The caller's block, if any, is then
# evaluated inside the head tag's block.
def head(*args, &block)
  tag!(:head, *args) do
    tag!(:meta, "http-equiv" => "Content-Type", "content" => "text/html; charset=utf-8") if @output_meta_tag
    # instance_eval raises ArgumentError when given a nil block, so a
    # blockless +head+ call used to fail here.
    instance_eval(&block) if block
  end
end
|
162
|
+
|
163
|
+
# Builds an html tag. An XML 1.0 instruction and an XHTML 1.0 Transitional doctype
|
164
|
+
# are prepended. Also assumes <tt>:xmlns => "http://www.w3.org/1999/xhtml",
|
165
|
+
# :lang => "en"</tt>.
|
166
|
+
def xhtml_transitional(attrs = {}, &block)
|
167
|
+
# self.tagset = Hpricot::XHTMLTransitional
|
168
|
+
xhtml_html(attrs, &block)
|
169
|
+
end
|
170
|
+
|
171
|
+
# Builds an html tag with XHTML 1.0 Strict doctype instead.
|
172
|
+
def xhtml_strict(attrs = {}, &block)
|
173
|
+
# self.tagset = Hpricot::XHTMLStrict
|
174
|
+
xhtml_html(attrs, &block)
|
175
|
+
end
|
176
|
+
|
177
|
+
private
|
178
|
+
|
179
|
+
def xhtml_html(attrs = {}, &block)
|
180
|
+
instruct! if @output_xml_instruction
|
181
|
+
doctype(:html, *@@default[:tagset].doctype)
|
182
|
+
tag!(:html, @@default[:root_attributes].merge(attrs), &block)
|
183
|
+
end
|
184
|
+
|
185
|
+
end
|
186
|
+
|
187
|
+
# Proxy that collects element options before the element is created.
# Builder#html_tag returns one when a tag method is called with no
# arguments and no block.  Methods called against the CssProxy object are
# added as element classes or IDs.
#
# See the README for examples.
class CssProxy < BlankSlate

  # Creates a CssProxy object for the tag +sym+ that +builder+ will create.
  def initialize(builder, sym)
    @builder = builder
    @sym = sym
    @attrs = {}
  end

  # Adds attributes to an element.  Bang methods set the :id attribute
  # (the name without its trailing "!").  Other methods add to the :class
  # attribute, separated by spaces.
  #
  # Once a call carries arguments or a block, the collected attributes are
  # appended to the arguments and the element is created through the
  # builder's <tt>tag!</tt>; until then the proxy itself is returned so
  # further calls can be chained.
  def method_missing(id_or_class, *args, &block)
    idc = id_or_class.to_s
    if idc =~ /!$/
      @attrs[:id] = $~.pre_match
    elsif @attrs[:class].nil?
      @attrs[:class] = idc
    else
      @attrs[:class] = "#{@attrs[:class]} #{idc}".strip
    end

    return self unless block || args.any?

    args.push(@attrs)
    @builder.tag!(@sym, *args, &block)
  end

end
|
216
|
+
end
|
@@ -0,0 +1,514 @@
|
|
1
|
+
module Hpricot
|
2
|
+
# Once you've matched a list of elements, you will often need to handle them as
|
3
|
+
# a group. Or you may want to perform the same action on each of them.
|
4
|
+
# Hpricot::Elements is an extension of Ruby's array class, with some methods
|
5
|
+
# added for altering elements contained in the array.
|
6
|
+
#
|
7
|
+
# If you need to create an element array from regular elements:
|
8
|
+
#
|
9
|
+
# Hpricot::Elements[ele1, ele2, ele3]
|
10
|
+
#
|
11
|
+
# Assuming that ele1, ele2 and ele3 contain element objects (Hpricot::Elem,
|
12
|
+
# Hpricot::Doc, etc.)
|
13
|
+
#
|
14
|
+
# == Continuing Searches
|
15
|
+
#
|
16
|
+
# Usually the Hpricot::Elements you're working on comes from a search you've
|
17
|
+
# done. Well, you can continue searching the list by using the same <tt>at</tt>
|
18
|
+
# and <tt>search</tt> methods you can use on plain elements.
|
19
|
+
#
|
20
|
+
# elements = doc.search("/div/p")
|
21
|
+
# elements = elements.search("/a[@href='http://hoodwink.d/']")
|
22
|
+
# elements = elements.at("img")
|
23
|
+
#
|
24
|
+
# == Altering Elements
|
25
|
+
#
|
26
|
+
# When you're altering elements in the list, your changes will be reflected in
|
27
|
+
# the document you started searching from.
|
28
|
+
#
|
29
|
+
# doc = Hpricot("That's my <b>spoon</b>, Tyler.")
|
30
|
+
# doc.at("b").swap("<i>fork</i>")
|
31
|
+
# doc.to_html
|
32
|
+
# #=> "That's my <i>fork</i>, Tyler."
|
33
|
+
#
|
34
|
+
# == Getting More Detailed
|
35
|
+
#
|
36
|
+
# If you can't find a method here that does what you need, you may need to
|
37
|
+
# loop through the elements and find a method in Hpricot::Container::Trav
|
38
|
+
# which can do what you need.
|
39
|
+
#
|
40
|
+
# For example, you may want to search for all the H3 header tags in a document
|
41
|
+
# and grab all the tags underneath the header, but not inside the header.
|
42
|
+
# A good method for this is <tt>next_sibling</tt>:
|
43
|
+
#
|
44
|
+
# doc.search("h3").each do |h3|
|
45
|
+
# while ele = h3.next_sibling
|
46
|
+
# ary << ele # stuff away all the elements under the h3
|
47
|
+
# end
|
48
|
+
# end
|
49
|
+
#
|
50
|
+
# Most of the useful element methods are in the mixins Hpricot::Traverse
|
51
|
+
# and Hpricot::Container::Trav.
|
52
|
+
class Elements < Array
|
53
|
+
|
54
|
+
# Searches this list for any elements (or children of these elements) matching
|
55
|
+
# the CSS or XPath expression +expr+. Root is assumed to be the element scanned.
|
56
|
+
#
|
57
|
+
# See Hpricot::Container::Trav.search for more.
|
58
|
+
def search(*expr,&blk)
|
59
|
+
Elements[*map { |x| x.search(*expr,&blk) }.flatten.uniq]
|
60
|
+
end
|
61
|
+
alias_method :/, :search
|
62
|
+
|
63
|
+
# Searches this list for the first element (or child of these elements) matching
# the CSS or XPath expression +expr+.  Root is assumed to be the element scanned.
#
# When +expr+ is an Integer the call falls through to the superclass +at+
# instead of running a search.
#
# See Hpricot::Container::Trav.at for more.
def at(expr, &blk)
  # Integer rather than Fixnum: Fixnum was folded into Integer in Ruby 2.4
  # and the constant was removed in Ruby 3.2, where referencing it raises
  # NameError on every call.
  if expr.kind_of? Integer
    super
  else
    search(expr, &blk)[0]
  end
end
|
74
|
+
alias_method :%, :at
|
75
|
+
|
76
|
+
# Convert this group of elements into a complete HTML fragment, returned as a
|
77
|
+
# string.
|
78
|
+
def to_html
|
79
|
+
map { |x| x.output("") }.join
|
80
|
+
end
|
81
|
+
alias_method :to_s, :to_html
|
82
|
+
|
83
|
+
# Returns an HTML fragment built of the contents of each element in this list.
#
# If an HTML +string+ is supplied, this method acts like inner_html=: the
# last argument is assigned to every element and returned.
def inner_html(*string)
  return collect { |node| node.inner_html }.join if string.empty?

  replacement = string.pop || nil
  self.inner_html = replacement
end
|
93
|
+
alias_method :html, :inner_html
|
94
|
+
alias_method :innerHTML, :inner_html
|
95
|
+
|
96
|
+
# Replaces the contents of each element in this list. Supply an HTML +string+,
|
97
|
+
# which is loaded into Hpricot objects and inserted into every element in this
|
98
|
+
# list.
|
99
|
+
def inner_html=(string)
|
100
|
+
each { |x| x.inner_html = string }
|
101
|
+
end
|
102
|
+
alias_method :html=, :inner_html=
|
103
|
+
alias_method :innerHTML=, :inner_html=
|
104
|
+
|
105
|
+
# Returns a string containing the text contents of each element in this list.
|
106
|
+
# All HTML tags are removed.
|
107
|
+
def inner_text
|
108
|
+
map { |x| x.inner_text }.join
|
109
|
+
end
|
110
|
+
alias_method :text, :inner_text
|
111
|
+
|
112
|
+
# Remove all elements in this list from the document which contains them.
|
113
|
+
#
|
114
|
+
# doc = Hpricot("<html>Remove this: <b>here</b></html>")
|
115
|
+
# doc.search("b").remove
|
116
|
+
# doc.to_html
|
117
|
+
# => "<html>Remove this: </html>"
|
118
|
+
#
|
119
|
+
def remove
|
120
|
+
each { |x| x.parent.children.delete(x) }
|
121
|
+
end
|
122
|
+
|
123
|
+
# Empty the elements in this list, by removing their insides.
|
124
|
+
#
|
125
|
+
# doc = Hpricot("<p> We have <i>so much</i> to say.</p>")
|
126
|
+
# doc.search("i").empty
|
127
|
+
# doc.to_html
|
128
|
+
# => "<p> We have <i></i> to say.</p>"
|
129
|
+
#
|
130
|
+
def empty
|
131
|
+
each { |x| x.inner_html = nil }
|
132
|
+
end
|
133
|
+
|
134
|
+
# Add to the end of the contents inside each element in this list.
|
135
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
136
|
+
def append(str = nil, &blk)
|
137
|
+
each { |x| x.html(x.children + x.make(str, &blk)) }
|
138
|
+
end
|
139
|
+
|
140
|
+
# Add to the start of the contents inside each element in this list.
|
141
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
142
|
+
def prepend(str = nil, &blk)
|
143
|
+
each { |x| x.html(x.make(str, &blk) + x.children) }
|
144
|
+
end
|
145
|
+
|
146
|
+
# Add some HTML just previous to each element in this list.
|
147
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
148
|
+
def before(str = nil, &blk)
|
149
|
+
each { |x| x.parent.insert_before x.make(str, &blk), x }
|
150
|
+
end
|
151
|
+
|
152
|
+
# Just after each element in this list, add some HTML.
|
153
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
154
|
+
def after(str = nil, &blk)
|
155
|
+
each { |x| x.parent.insert_after x.make(str, &blk), x }
|
156
|
+
end
|
157
|
+
|
158
|
+
# Wraps each element in the list inside the element created by HTML +str+.
|
159
|
+
# If more than one element is found in the string, Hpricot locates the
|
160
|
+
# deepest spot inside the first element.
|
161
|
+
#
|
162
|
+
# doc.search("a[@href]").
|
163
|
+
# wrap(%{<div class="link"><div class="link_inner"></div></div>})
|
164
|
+
#
|
165
|
+
# This code wraps every link on the page inside a +div.link+ and a +div.link_inner+ nest.
|
166
|
+
def wrap(str = nil, &blk)
|
167
|
+
each do |x|
|
168
|
+
wrap = x.make(str, &blk)
|
169
|
+
nest = wrap.detect { |w| w.respond_to? :children }
|
170
|
+
unless nest
|
171
|
+
raise "No wrapping element found."
|
172
|
+
end
|
173
|
+
x.parent.replace_child(x, wrap)
|
174
|
+
nest = nest.children.first until nest.empty?
|
175
|
+
nest.html([x])
|
176
|
+
end
|
177
|
+
end
|
178
|
+
|
179
|
+
# Gets and sets attributes on all matched elements.
#
# Pass in a +key+ on its own and this method will return the value
# assigned to that attribute for the first element.  Returns +nil+ when
# the list is empty.
#
#   doc.search("a").attr("href")
#     #=> "http://hacketyhack.net/"
#
# Or, pass in a +key+ and +value+.  This will set an attribute for all
# matched elements.
#
#   doc.search("p").attr("class", "basic")
#
# You may also use a Hash to set a series of attributes:
#
#   (doc/"a").attr(:class => "basic", :href => "http://hackety.org/")
#
# Lastly, a block can be used to rewrite an attribute based on the element
# it belongs to.  The block will pass in an element.  Return from the block
# the new value of the attribute.
#
#   records.attr("href") { |e| e['href'] + "#top" }
#
# This example adds a <tt>#top</tt> anchor to each link.
#
# The setter forms return the list itself.
def attr(key, value = nil, &blk)
  if value || blk
    each do |el|
      el.set_attribute(key, value || blk[el])
    end
    return self
  end

  if key.is_a? Hash
    key.each { |k, v| attr(k, v) }
    return self
  end

  # An empty list has no first element; return nil rather than calling
  # get_attribute on nil.
  first_el = self[0]
  first_el && first_el.get_attribute(key)
end
|
219
|
+
alias_method :set, :attr
|
220
|
+
|
221
|
+
# Adds the class to all matched elements.
#
#   (doc/"p").add_class("bacon")
#
# Now all paragraphs will have class="bacon".
#
# Items that do not respond to +get_attribute+ are skipped, a class already
# present is not repeated, and the list itself is returned.
def add_class(class_name)
  each do |node|
    if node.respond_to?(:get_attribute)
      current = node.get_attribute('class').to_s.split(" ")
      merged = (current + [class_name]).uniq
      node.set_attribute('class', merged.join(" "))
    end
  end
  self
end
|
234
|
+
|
235
|
+
# Remove an attribute from each of the matched elements.
#
#   (doc/"input").remove_attr("disabled")
#
# Items that do not respond to +remove_attribute+ are skipped.  Returns the
# list itself.
def remove_attr(name)
  each { |node| node.remove_attribute(name) if node.respond_to?(:remove_attribute) }
  self
end
|
246
|
+
|
247
|
+
# Removes a class from all matched elements.
#
#   (doc/"span").remove_class("lightgrey")
#
# Or, to remove all classes:
#
#   (doc/"span").remove_class
#
# Items that do not respond to +get_attribute+ are skipped.  Returns the
# list itself.
def remove_class(name = nil)
  each do |node|
    next unless node.respond_to?(:get_attribute)

    unless name
      # No name given: drop the whole class attribute.
      node.remove_attribute("class")
      next
    end

    remaining = node.get_attribute('class').to_s.split(" ") - [name]
    node.set_attribute('class', remaining.uniq.join(" "))
  end
  self
end
|
267
|
+
|
268
|
+
ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i # " (for emacs)
|
269
|
+
BRACK_RE = %r!(\[) *([^\]]*) *\]+!i
|
270
|
+
FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)!
|
271
|
+
CUST_RE = %r!(:)([a-zA-Z0-9\*_-]*)()!
|
272
|
+
CATCH_RE = %r!([:\.#]*)([a-zA-Z0-9\*_-]+)!
|
273
|
+
|
274
|
+
def self.filter(nodes, expr, truth = true)
|
275
|
+
until expr.empty?
|
276
|
+
_, *m = *expr.match(/^(?:#{ATTR_RE}|#{BRACK_RE}|#{FUNC_RE}|#{CUST_RE}|#{CATCH_RE})/)
|
277
|
+
break unless _
|
278
|
+
|
279
|
+
expr = $'
|
280
|
+
m.compact!
|
281
|
+
if m[0] == '@'
|
282
|
+
m[0] = "@#{m.slice!(2,1).join}"
|
283
|
+
end
|
284
|
+
|
285
|
+
if m[0] == '[' && m[1] =~ /^\d+$/
|
286
|
+
m = [":", "nth", m[1].to_i-1]
|
287
|
+
end
|
288
|
+
|
289
|
+
if m[0] == ":" && m[1] == "not"
|
290
|
+
nodes, = Elements.filter(nodes, m[2], false)
|
291
|
+
elsif "#{m[0]}#{m[1]}" =~ /^(:even|:odd)$/
|
292
|
+
new_nodes = []
|
293
|
+
nodes.each_with_index {|n,i| new_nodes.push(n) if (i % 2 == (m[1] == "even" ? 0 : 1)) }
|
294
|
+
nodes = new_nodes
|
295
|
+
elsif "#{m[0]}#{m[1]}" =~ /^(:first|:last)$/
|
296
|
+
nodes = [nodes.send(m[1])]
|
297
|
+
else
|
298
|
+
meth = "filter[#{m[0]}#{m[1]}]" unless m[0].empty?
|
299
|
+
if meth and Traverse.method_defined? meth
|
300
|
+
args = m[2..-1]
|
301
|
+
else
|
302
|
+
meth = "filter[#{m[0]}]"
|
303
|
+
if Traverse.method_defined? meth
|
304
|
+
args = m[1..-1]
|
305
|
+
end
|
306
|
+
end
|
307
|
+
args << -1
|
308
|
+
nodes = Elements[*nodes.find_all do |x|
|
309
|
+
args[-1] += 1
|
310
|
+
x.send(meth, *args) ? truth : !truth
|
311
|
+
end]
|
312
|
+
end
|
313
|
+
end
|
314
|
+
[nodes, expr]
|
315
|
+
end
|
316
|
+
|
317
|
+
# Given two elements, attempt to gather an Elements array of everything between
|
318
|
+
# (and including) those two elements.
|
319
|
+
def self.expand(ele1, ele2, excl=false)
|
320
|
+
ary = []
|
321
|
+
offset = excl ? -1 : 0
|
322
|
+
|
323
|
+
if ele1 and ele2
|
324
|
+
# let's quickly take care of siblings
|
325
|
+
if ele1.parent == ele2.parent
|
326
|
+
ary = ele1.parent.children[ele1.node_position..(ele2.node_position+offset)]
|
327
|
+
else
|
328
|
+
# find common parent
|
329
|
+
p, ele1_p = ele1, [ele1]
|
330
|
+
ele1_p.unshift p while p.respond_to?(:parent) and p = p.parent
|
331
|
+
p, ele2_p = ele2, [ele2]
|
332
|
+
ele2_p.unshift p while p.respond_to?(:parent) and p = p.parent
|
333
|
+
common_parent = ele1_p.zip(ele2_p).select { |p1, p2| p1 == p2 }.flatten.last
|
334
|
+
|
335
|
+
child = nil
|
336
|
+
if ele1 == common_parent
|
337
|
+
child = ele2
|
338
|
+
elsif ele2 == common_parent
|
339
|
+
child = ele1
|
340
|
+
end
|
341
|
+
|
342
|
+
if child
|
343
|
+
ary = common_parent.children[0..(child.node_position+offset)]
|
344
|
+
end
|
345
|
+
end
|
346
|
+
end
|
347
|
+
|
348
|
+
return Elements[*ary]
|
349
|
+
end
|
350
|
+
|
351
|
+
# Returns the nodes of this list that Elements.filter keeps for +expr+
# (the first item of the pair that Elements.filter returns).
def filter(expr)
  kept, _rest = Elements.filter(self, expr)
  kept
end
|
355
|
+
|
356
|
+
# Returns this list without the matching nodes.  When +expr+ is itself a
# Traverse node it is simply removed from the list; otherwise +expr+ is
# passed to Elements.filter with +truth+ set to false and the first item of
# the returned pair is used.
def not(expr)
  return self - [expr] if expr.is_a?(Traverse)

  kept, _rest = Elements.filter(self, expr, false)
  kept
end
|
364
|
+
|
365
|
+
private
|
366
|
+
# Copies every instance variable of +l+ onto +node+, overwriting variables
# of the same name.  Returns the list of variable names that were copied.
def copy_node(node, l)
  names = l.instance_variables
  names.each { |ivar| node.instance_variable_set(ivar, l.instance_variable_get(ivar)) }
end
|
371
|
+
|
372
|
+
end
|
373
|
+
|
374
|
+
module Traverse
|
375
|
+
def self.filter(tok, &blk)
|
376
|
+
define_method("filter[#{tok.is_a?(String) ? tok : tok.inspect}]", &blk)
|
377
|
+
end
|
378
|
+
|
379
|
+
filter '' do |name,i|
|
380
|
+
name == '*' || (self.respond_to?(:name) && self.name.downcase == name.downcase)
|
381
|
+
end
|
382
|
+
|
383
|
+
filter '#' do |id,i|
|
384
|
+
self.elem? and get_attribute('id').to_s == id
|
385
|
+
end
|
386
|
+
|
387
|
+
filter '.' do |name,i|
|
388
|
+
self.elem? and classes.include? name
|
389
|
+
end
|
390
|
+
|
391
|
+
filter :lt do |num,i|
|
392
|
+
self.position < num.to_i
|
393
|
+
end
|
394
|
+
|
395
|
+
filter :gt do |num,i|
|
396
|
+
self.position > num.to_i
|
397
|
+
end
|
398
|
+
|
399
|
+
nth = proc { |num,i| self.position == num.to_i }
|
400
|
+
nth_first = proc { |*a| self.position == 0 }
|
401
|
+
nth_last = proc { |*a| self == parent.children_of_type(self.name).last }
|
402
|
+
|
403
|
+
filter :nth, &nth
|
404
|
+
filter :eq, &nth
|
405
|
+
filter ":nth-of-type", &nth
|
406
|
+
|
407
|
+
filter :first, &nth_first
|
408
|
+
filter ":first-of-type", &nth_first
|
409
|
+
|
410
|
+
filter :last, &nth_last
|
411
|
+
filter ":last-of-type", &nth_last
|
412
|
+
|
413
|
+
filter :even do |num,i|
|
414
|
+
self.position % 2 == 0
|
415
|
+
end
|
416
|
+
|
417
|
+
filter :odd do |num,i|
|
418
|
+
self.position % 2 == 1
|
419
|
+
end
|
420
|
+
|
421
|
+
filter ':first-child' do |i|
|
422
|
+
self == parent.containers.first
|
423
|
+
end
|
424
|
+
|
425
|
+
filter ':nth-child' do |arg,i|
|
426
|
+
case arg
|
427
|
+
when 'even'; (parent.containers.index(self) + 1) % 2 == 0
|
428
|
+
when 'odd'; (parent.containers.index(self) + 1) % 2 == 1
|
429
|
+
else self == (parent.containers[arg.to_i - 1])
|
430
|
+
end
|
431
|
+
end
|
432
|
+
|
433
|
+
filter ":last-child" do |i|
|
434
|
+
self == parent.containers.last
|
435
|
+
end
|
436
|
+
|
437
|
+
filter ":nth-last-child" do |arg,i|
|
438
|
+
self == parent.containers[-1-arg.to_i]
|
439
|
+
end
|
440
|
+
|
441
|
+
filter ":nth-last-of-type" do |arg,i|
|
442
|
+
self == parent.children_of_type(self.name)[-1-arg.to_i]
|
443
|
+
end
|
444
|
+
|
445
|
+
filter ":only-of-type" do |arg,i|
|
446
|
+
parent.children_of_type(self.name).length == 1
|
447
|
+
end
|
448
|
+
|
449
|
+
filter ":only-child" do |arg,i|
|
450
|
+
parent.containers.length == 1
|
451
|
+
end
|
452
|
+
|
453
|
+
filter :parent do |*a|
|
454
|
+
containers.length > 0
|
455
|
+
end
|
456
|
+
|
457
|
+
filter :empty do |*a|
|
458
|
+
elem? && inner_html.length == 0
|
459
|
+
end
|
460
|
+
|
461
|
+
filter :root do |*a|
|
462
|
+
self.is_a? Hpricot::Doc
|
463
|
+
end
|
464
|
+
|
465
|
+
filter 'text' do |*a|
|
466
|
+
self.text?
|
467
|
+
end
|
468
|
+
|
469
|
+
filter 'comment' do |*a|
|
470
|
+
self.comment?
|
471
|
+
end
|
472
|
+
|
473
|
+
filter :contains do |arg, ignore|
|
474
|
+
html.include? arg
|
475
|
+
end
|
476
|
+
|
477
|
+
|
478
|
+
|
479
|
+
pred_procs =
|
480
|
+
{'text()' => proc { |ele, *_| ele.inner_text.strip },
|
481
|
+
'@' => proc { |ele, attr, *_| ele.get_attribute(attr).to_s if ele.elem? }}
|
482
|
+
|
483
|
+
oper_procs =
|
484
|
+
{'=' => proc { |a,b| a == b },
|
485
|
+
'!=' => proc { |a,b| a != b },
|
486
|
+
'~=' => proc { |a,b| a.split(/\s+/).include?(b) },
|
487
|
+
'|=' => proc { |a,b| a =~ /^#{Regexp::quote b}(-|$)/ },
|
488
|
+
'^=' => proc { |a,b| a.index(b) == 0 },
|
489
|
+
'$=' => proc { |a,b| a =~ /#{Regexp::quote b}$/ },
|
490
|
+
'*=' => proc { |a,b| idx = a.index(b) }}
|
491
|
+
|
492
|
+
pred_procs.each do |pred_n, pred_f|
|
493
|
+
oper_procs.each do |oper_n, oper_f|
|
494
|
+
filter "#{pred_n}#{oper_n}" do |*a|
|
495
|
+
qual = pred_f[self, *a]
|
496
|
+
oper_f[qual, a[-2]] if qual
|
497
|
+
end
|
498
|
+
end
|
499
|
+
end
|
500
|
+
|
501
|
+
filter 'text()' do |val,i|
|
502
|
+
self.children.grep(Hpricot::Text).detect { |x| x.content =~ /\S/ } if self.children
|
503
|
+
end
|
504
|
+
|
505
|
+
filter '@' do |attr,val,i|
|
506
|
+
self.elem? and has_attribute? attr
|
507
|
+
end
|
508
|
+
|
509
|
+
filter '[' do |val,i|
|
510
|
+
self.elem? and search(val).length > 0
|
511
|
+
end
|
512
|
+
|
513
|
+
end
|
514
|
+
end
|