hpricot 0.8.3-i386-mswin32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +104 -0
- data/COPYING +18 -0
- data/README.md +276 -0
- data/Rakefile +234 -0
- data/ext/fast_xs/FastXsService.java +1123 -0
- data/ext/fast_xs/extconf.rb +4 -0
- data/ext/fast_xs/fast_xs.c +210 -0
- data/ext/hpricot_scan/HpricotCss.java +850 -0
- data/ext/hpricot_scan/HpricotScanService.java +2099 -0
- data/ext/hpricot_scan/extconf.rb +9 -0
- data/ext/hpricot_scan/hpricot_common.rl +76 -0
- data/ext/hpricot_scan/hpricot_css.c +3511 -0
- data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
- data/ext/hpricot_scan/hpricot_css.rl +120 -0
- data/ext/hpricot_scan/hpricot_scan.c +7039 -0
- data/ext/hpricot_scan/hpricot_scan.h +79 -0
- data/ext/hpricot_scan/hpricot_scan.java.rl +1161 -0
- data/ext/hpricot_scan/hpricot_scan.rl +896 -0
- data/extras/hpricot.png +0 -0
- data/lib/fast_xs.rb +1 -0
- data/lib/fast_xs/1.8/fast_xs.so +0 -0
- data/lib/fast_xs/1.9/fast_xs.so +0 -0
- data/lib/hpricot.rb +26 -0
- data/lib/hpricot/blankslate.rb +63 -0
- data/lib/hpricot/builder.rb +216 -0
- data/lib/hpricot/elements.rb +514 -0
- data/lib/hpricot/htmlinfo.rb +691 -0
- data/lib/hpricot/inspect.rb +103 -0
- data/lib/hpricot/modules.rb +40 -0
- data/lib/hpricot/parse.rb +38 -0
- data/lib/hpricot/tag.rb +219 -0
- data/lib/hpricot/tags.rb +164 -0
- data/lib/hpricot/traverse.rb +839 -0
- data/lib/hpricot/xchar.rb +94 -0
- data/lib/hpricot_scan.rb +1 -0
- data/lib/hpricot_scan/1.8/hpricot_scan.so +0 -0
- data/lib/hpricot_scan/1.9/hpricot_scan.so +0 -0
- data/test/files/basic.xhtml +17 -0
- data/test/files/boingboing.html +2266 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/immob.html +400 -0
- data/test/files/pace_application.html +1320 -0
- data/test/files/tenderlove.html +16 -0
- data/test/files/uswebgen.html +220 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/files/why.xml +19 -0
- data/test/load_files.rb +7 -0
- data/test/nokogiri-bench.rb +64 -0
- data/test/test_alter.rb +96 -0
- data/test/test_builder.rb +37 -0
- data/test/test_parser.rb +457 -0
- data/test/test_paths.rb +25 -0
- data/test/test_preserved.rb +88 -0
- data/test/test_xml.rb +28 -0
- metadata +128 -0
data/extras/hpricot.png
ADDED
Binary file
|
data/lib/fast_xs.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "fast_xs/#{RUBY_VERSION.sub(/\.\d+$/, '')}/fast_xs"
|
Binary file
|
Binary file
|
data/lib/hpricot.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# == About hpricot.rb
|
2
|
+
#
|
3
|
+
# All of Hpricot's various parts are loaded when you use <tt>require 'hpricot'</tt>.
|
4
|
+
#
|
5
|
+
# * hpricot_scan: the scanner (a C extension for Ruby) which turns an HTML stream into tokens.
|
6
|
+
# * hpricot/parse.rb: uses the scanner to sort through tokens and give you back a complete document object.
|
7
|
+
# * hpricot/tag.rb: sets up objects for the various types of elements in an HTML document.
|
8
|
+
# * hpricot/modules.rb: categorizes the various elements using mixins.
|
9
|
+
# * hpricot/traverse.rb: methods for searching documents.
|
10
|
+
# * hpricot/elements.rb: methods for dealing with a group of elements as an Hpricot::Elements list.
|
11
|
+
# * hpricot/inspect.rb: methods for displaying documents in a readable form.
|
12
|
+
|
13
|
+
# If available, Nikolai's UTF-8 library will ease use of utf-8 documents.
|
14
|
+
# See http://git.bitwi.se/ruby-character-encodings.git/.
|
15
|
+
begin
|
16
|
+
require 'encoding/character/utf-8'
|
17
|
+
rescue LoadError
|
18
|
+
end
|
19
|
+
|
20
|
+
require 'hpricot_scan'
|
21
|
+
require 'hpricot/tag'
|
22
|
+
require 'hpricot/modules'
|
23
|
+
require 'hpricot/traverse'
|
24
|
+
require 'hpricot/inspect'
|
25
|
+
require 'hpricot/parse'
|
26
|
+
require 'hpricot/builder'
|
@@ -0,0 +1,63 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#--
|
3
|
+
# Copyright 2004 by Jim Weirich (jim@weirichhouse.org).
|
4
|
+
# All rights reserved.
|
5
|
+
|
6
|
+
# Permission is granted for use, copying, modification, distribution,
|
7
|
+
# and distribution of modified versions of this work as long as the
|
8
|
+
# above copyright notice is included.
|
9
|
+
#++
|
10
|
+
|
11
|
+
module Hpricot

  # BlankSlate provides an abstract base class with no predefined
  # methods (except for <tt>\_\_send__</tt> and <tt>\_\_id__</tt>).
  # BlankSlate is useful as a base class when writing classes that
  # depend upon <tt>method_missing</tt> (e.g. dynamic proxies).
  class BlankSlate
    class << self

      # Hide the method named +name+ in the BlankSlate class.  Don't
      # hide +instance_eval+ or any method beginning with "__".
      #
      # +name+ may be a String or a Symbol.  Method names are compared as
      # strings because +instance_methods+ returns Strings on Ruby 1.8 but
      # Symbols on 1.9 and later; comparing against <tt>name.to_s</tt>
      # directly would never match on newer interpreters and nothing
      # would be hidden.
      #
      # Returns the result of +undef_method+ when a method was hidden,
      # +nil+ otherwise.
      def hide(name)
        name = name.to_s
        return if name =~ /^(__|instance_eval)/
        undef_method name if instance_methods.any? { |m| m.to_s == name }
      end
    end

    # Strip every public/protected method inherited at definition time.
    instance_methods.each { |m| hide(m) }
  end
end
|
32
|
+
|
33
|
+
# Since Ruby is very dynamic, methods added to the ancestors of
|
34
|
+
# BlankSlate <em>after BlankSlate is defined</em> will show up in the
|
35
|
+
# list of available BlankSlate methods. We handle this by defining a
|
36
|
+
# hook in the Object and Kernel classes that will hide any newly defined
# methods from BlankSlate.
|
37
|
+
module Kernel
  class << self
    # Keep the previous hook reachable so it still runs first.
    alias_method :hpricot_slate_method_added, :method_added

    # Called whenever a method is added.  Chains to the previous hook,
    # then hides the new method in BlankSlate, but only when the
    # receiver is Kernel itself.
    def method_added(name)
      hpricot_slate_method_added(name)
      Hpricot::BlankSlate.hide(name) if self == Kernel
    end
  end
end
|
50
|
+
|
51
|
+
class Object
  class << self
    # Keep the previous hook reachable so it still runs first.
    alias_method :hpricot_slate_method_added, :method_added

    # Called whenever a method is added.  Chains to the previous hook,
    # then hides the new method in BlankSlate, but only when the
    # receiver is Object itself (the hook is also inherited by every
    # other class).
    def method_added(name)
      hpricot_slate_method_added(name)
      Hpricot::BlankSlate.hide(name) if self == Object
    end
  end
end
|
@@ -0,0 +1,216 @@
|
|
1
|
+
require 'hpricot/tags'
|
2
|
+
require 'fast_xs'
|
3
|
+
require 'hpricot/blankslate'
|
4
|
+
require 'hpricot/htmlinfo'
|
5
|
+
|
6
|
+
module Hpricot
|
7
|
+
# XML unescape
|
8
|
+
# Replaces XML/HTML character references in +str+ (converted with +to_s+)
# by the characters they stand for and returns the new string.
#
# * <tt>&name;</tt> is looked up in NamedCharacters; unknown names become "?".
# * <tt>&#NNN;</tt> is read as a decimal code point.
# * <tt>&#xHH;</tt> is read as a hexadecimal code point.
#
# All references are resolved in a single pass, so text produced by one
# substitution is never unescaped a second time (<tt>&amp;#65;</tt> yields
# <tt>&#65;</tt>, not <tt>A</tt>).
def self.uxs(str)
  str.to_s.gsub(/&(?:#x([0-9a-fA-F]+)|#(\d+)|(\w+));/) do
    code =
      if $1
        $1.hex
      elsif $2
        $2.to_i
      else
        NamedCharacters[$3] || 63 # 63 = "?" (query char)
      end
    [code].pack("U*")
  end
end
|
13
|
+
|
14
|
+
# Extends +ele+ with Builder, copies each entry of +assigns+ into an
# instance variable of the same name, evaluates the block in the
# context of +ele+ and returns +ele+.
def self.build(ele = Doc.new, assigns = {}, &blk)
  ele.extend Builder
  assigns.each { |name, val| ele.instance_variable_set("@#{name}", val) }
  ele.instance_eval(&blk)
  ele
end
|
22
|
+
|
23
|
+
module Builder
|
24
|
+
|
25
|
+
@@default = {
|
26
|
+
:indent => 0,
|
27
|
+
:output_helpers => true,
|
28
|
+
:output_xml_instruction => true,
|
29
|
+
:output_meta_tag => true,
|
30
|
+
:auto_validation => true,
|
31
|
+
:tagset => Hpricot::XHTMLTransitional,
|
32
|
+
:root_attributes => {
|
33
|
+
:xmlns => 'http://www.w3.org/1999/xhtml', :'xml:lang' => 'en', :lang => 'en'
|
34
|
+
}
|
35
|
+
}
|
36
|
+
|
37
|
+
def self.set(option, value)
|
38
|
+
@@default[option] = value
|
39
|
+
end
|
40
|
+
|
41
|
+
def add_child ele
|
42
|
+
ele.parent = self
|
43
|
+
self.children ||= []
|
44
|
+
self.children << ele
|
45
|
+
ele
|
46
|
+
end
|
47
|
+
|
48
|
+
# Write a +string+ to the HTML stream, making sure to escape it.
|
49
|
+
def text!(string)
|
50
|
+
add_child Text.new(string.fast_xs)
|
51
|
+
end
|
52
|
+
|
53
|
+
# Write a +string+ to the HTML stream without escaping it.
|
54
|
+
def text(string)
|
55
|
+
add_child Text.new(string)
|
56
|
+
nil
|
57
|
+
end
|
58
|
+
alias_method :<<, :text
|
59
|
+
alias_method :concat, :text
|
60
|
+
|
61
|
+
# Create a tag named +tag+. Other than the first argument which is the tag name,
|
62
|
+
# the arguments are the same as the tags implemented via method_missing.
|
63
|
+
# Builds the element +tag+, appends it to the receiver and returns it.
#
# Hash arguments become attributes; every other argument becomes a
# child: objects responding to +to_html+ are parsed with Hpricot.make,
# anything else truthy is converted with +to_s+, escaped and wrapped
# in a Text node.  +nil+ and +false+ arguments are skipped.  A block,
# if given, is evaluated against the new element to build its children.
#
# When both @auto_validation and @tagset are set, raises
# InvalidXhtmlError for an unknown element, an unknown attribute, or
# an id that was already used through this builder.
def tag!(tag, *args, &block)
  ele_id = nil
  if @auto_validation and @tagset
    if !@tagset.tagset.has_key?(tag)
      # The tagset lives in an instance variable; there is no +tagset+ method.
      raise InvalidXhtmlError, "no element `#{tag}' for #{@tagset.doctype}"
    elsif args.last.respond_to?(:to_hash)
      attrs = args.last.to_hash

      if @tagset.forms.include?(tag) and attrs[:id]
        attrs[:name] ||= attrs[:id]
      end

      attrs.each do |k, v|
        atname = k.to_s.downcase.intern
        # Match on the string form so Symbol keys such as :'xml:lang'
        # are recognised as namespaced on every Ruby version.
        unless k.to_s =~ /:/ or @tagset.tagset[tag].include? atname
          raise InvalidXhtmlError, "no attribute `#{k}' on #{tag} elements"
        end
        if atname == :id
          ele_id = v.to_s
          # @elements is created lazily; nothing else initialises it here.
          if (@elements ||= {}).has_key? ele_id
            raise InvalidXhtmlError, "id `#{ele_id}' already used (id's must be unique)."
          end
        end
      end
    end
  end

  # turn arguments into children or attributes
  childs = []
  attrs = args.grep(Hash)
  childs.concat((args - attrs).flatten.map do |x|
    if x.respond_to? :to_html
      Hpricot.make(x.to_html)
    elsif x
      Text.new(x.to_s.fast_xs)
    end
  end.flatten.compact) # compact drops the nils produced by nil/false arguments
  attrs = attrs.inject({}) do |hsh, ath|
    ath.each do |k, v|
      hsh[k] = v.to_s.fast_xs if v
    end
    hsh
  end

  # create the element itself
  tag = tag.to_s
  f = Elem.new(tag, attrs, childs, ETag.new(tag))

  # remember the id so a later duplicate can be detected
  (@elements ||= {})[ele_id] = f if ele_id

  # build children from the block
  if block
    build(f, &block)
  end

  add_child f
  f
end
|
119
|
+
|
120
|
+
def build(*a, &b)
|
121
|
+
Hpricot.build(*a, &b)
|
122
|
+
end
|
123
|
+
|
124
|
+
# Every HTML tag method goes through an html_tag call. So, calling <tt>div</tt> is equivalent
|
125
|
+
# to calling <tt>html_tag(:div)</tt>. All HTML tags in Hpricot's list are given generated wrappers
|
126
|
+
# for this method.
|
127
|
+
#
|
128
|
+
# If the @auto_validation setting is on, this method will check for many common mistakes which
|
129
|
+
# could lead to invalid XHTML.
|
130
|
+
# Common entry point for the generated per-tag methods.
#
# * With validation on (both @auto_validation and @tagset set), a
#   block given to a self-closing element raises InvalidXhtmlError.
# * With neither arguments nor a block, returns a CssProxy for +sym+.
# * Otherwise delegates to tag!.
#
# The @tagset check mirrors the guard used in tag!, so enabling
# @auto_validation without a tagset no longer fails on nil.
def html_tag(sym, *args, &block)
  if @auto_validation and @tagset and @tagset.self_closing.include?(sym) and block
    raise InvalidXhtmlError, "the `#{sym}' element is self-closing, please remove the block"
  elsif args.empty? and block.nil?
    CssProxy.new(self, sym)
  else
    tag!(sym, *args, &block)
  end
end
|
139
|
+
|
140
|
+
XHTMLTransitional.tags.each do |k|
|
141
|
+
class_eval %{
|
142
|
+
def #{k}(*args, &block)
|
143
|
+
html_tag(#{k.inspect}, *args, &block)
|
144
|
+
end
|
145
|
+
}
|
146
|
+
end
|
147
|
+
|
148
|
+
def doctype(target, pub, sys)
|
149
|
+
add_child DocType.new(target, pub, sys)
|
150
|
+
end
|
151
|
+
|
152
|
+
remove_method :head
|
153
|
+
|
154
|
+
# Builds a head tag. Adds a <tt>meta</tt> tag inside with Content-Type
|
155
|
+
# set to <tt>text/html; charset=utf-8</tt>.
|
156
|
+
# Builds a head element through tag!.  Inside it, a Content-Type meta
# tag is emitted when @output_meta_tag is set, then the caller's block
# (if any) is evaluated.  The block is optional: calling +head+ without
# one yields an otherwise empty head instead of raising.
def head(*args, &block)
  tag!(:head, *args) do
    tag!(:meta, "http-equiv" => "Content-Type", "content" => "text/html; charset=utf-8") if @output_meta_tag
    instance_eval(&block) if block
  end
end
|
162
|
+
|
163
|
+
# Builds an html tag. An XML 1.0 instruction and an XHTML 1.0 Transitional doctype
|
164
|
+
# are prepended. Also assumes <tt>:xmlns => "http://www.w3.org/1999/xhtml",
|
165
|
+
# :lang => "en"</tt>.
|
166
|
+
def xhtml_transitional(attrs = {}, &block)
|
167
|
+
# self.tagset = Hpricot::XHTMLTransitional
|
168
|
+
xhtml_html(attrs, &block)
|
169
|
+
end
|
170
|
+
|
171
|
+
# Builds an html tag with XHTML 1.0 Strict doctype instead.
|
172
|
+
def xhtml_strict(attrs = {}, &block)
|
173
|
+
# self.tagset = Hpricot::XHTMLStrict
|
174
|
+
xhtml_html(attrs, &block)
|
175
|
+
end
|
176
|
+
|
177
|
+
private
|
178
|
+
|
179
|
+
def xhtml_html(attrs = {}, &block)
|
180
|
+
instruct! if @output_xml_instruction
|
181
|
+
doctype(:html, *@@default[:tagset].doctype)
|
182
|
+
tag!(:html, @@default[:root_attributes].merge(attrs), &block)
|
183
|
+
end
|
184
|
+
|
185
|
+
end
|
186
|
+
|
187
|
+
# Class used by Markaby::Builder to store element options. Methods called
|
188
|
+
# against the CssProxy object are added as element classes or IDs.
|
189
|
+
#
|
190
|
+
# See the README for examples.
|
191
|
+
class CssProxy < BlankSlate

  # Remembers the builder and tag name this proxy collects attributes for.
  def initialize(builder, sym)
    @builder = builder
    @sym = sym
    @attrs = {}
  end

  # A method name ending in "!" sets the :id attribute (without the
  # bang); any other name is appended to the :class attribute.  Once
  # arguments or a block are supplied the element is built through the
  # builder's tag!; until then the proxy returns itself so calls chain.
  def method_missing(id_or_class, *args, &block)
    idc = id_or_class.to_s
    if idc =~ /!$/
      @attrs[:id] = $`
    else
      existing = @attrs[:class]
      @attrs[:class] = existing.nil? ? idc : "#{existing} #{idc}".strip
    end

    return self unless block or args.any?

    args.push(@attrs)
    @builder.tag!(@sym, *args, &block)
  end

end
|
216
|
+
end
|
@@ -0,0 +1,514 @@
|
|
1
|
+
module Hpricot
|
2
|
+
# Once you've matched a list of elements, you will often need to handle them as
|
3
|
+
# a group. Or you may want to perform the same action on each of them.
|
4
|
+
# Hpricot::Elements is an extension of Ruby's array class, with some methods
|
5
|
+
# added for altering elements contained in the array.
|
6
|
+
#
|
7
|
+
# If you need to create an element array from regular elements:
|
8
|
+
#
|
9
|
+
# Hpricot::Elements[ele1, ele2, ele3]
|
10
|
+
#
|
11
|
+
# Assuming that ele1, ele2 and ele3 contain element objects (Hpricot::Elem,
|
12
|
+
# Hpricot::Doc, etc.)
|
13
|
+
#
|
14
|
+
# == Continuing Searches
|
15
|
+
#
|
16
|
+
# Usually the Hpricot::Elements you're working on comes from a search you've
|
17
|
+
# done. Well, you can continue searching the list by using the same <tt>at</tt>
|
18
|
+
# and <tt>search</tt> methods you can use on plain elements.
|
19
|
+
#
|
20
|
+
# elements = doc.search("/div/p")
|
21
|
+
# elements = elements.search("/a[@href='http://hoodwink.d/']")
|
22
|
+
# elements = elements.at("img")
|
23
|
+
#
|
24
|
+
# == Altering Elements
|
25
|
+
#
|
26
|
+
# When you're altering elements in the list, your changes will be reflected in
|
27
|
+
# the document you started searching from.
|
28
|
+
#
|
29
|
+
# doc = Hpricot("That's my <b>spoon</b>, Tyler.")
|
30
|
+
# doc.at("b").swap("<i>fork</i>")
|
31
|
+
# doc.to_html
|
32
|
+
# #=> "That's my <i>fork</i>, Tyler."
|
33
|
+
#
|
34
|
+
# == Getting More Detailed
|
35
|
+
#
|
36
|
+
# If you can't find a method here that does what you need, you may need to
|
37
|
+
# loop through the elements and find a method in Hpricot::Container::Trav
|
38
|
+
# which can do what you need.
|
39
|
+
#
|
40
|
+
# For example, you may want to search for all the H3 header tags in a document
|
41
|
+
# and grab all the tags underneath the header, but not inside the header.
|
42
|
+
# A good method for this is <tt>next_sibling</tt>:
|
43
|
+
#
|
44
|
+
# doc.search("h3").each do |h3|
|
45
|
+
# while ele = h3.next_sibling
|
46
|
+
# ary << ele # stuff away all the elements under the h3
|
47
|
+
# end
|
48
|
+
# end
|
49
|
+
#
|
50
|
+
# Most of the useful element methods are in the mixins Hpricot::Traverse
|
51
|
+
# and Hpricot::Container::Trav.
|
52
|
+
class Elements < Array
|
53
|
+
|
54
|
+
# Searches this list for any elements (or children of these elements) matching
|
55
|
+
# the CSS or XPath expression +expr+. Root is assumed to be the element scanned.
|
56
|
+
#
|
57
|
+
# See Hpricot::Container::Trav.search for more.
|
58
|
+
def search(*expr,&blk)
|
59
|
+
Elements[*map { |x| x.search(*expr,&blk) }.flatten.uniq]
|
60
|
+
end
|
61
|
+
alias_method :/, :search
|
62
|
+
|
63
|
+
# Searches this list for the first element (or child of these elements) matching
|
64
|
+
# the CSS or XPath expression +expr+. Root is assumed to be the element scanned.
|
65
|
+
#
|
66
|
+
# See Hpricot::Container::Trav.at for more.
|
67
|
+
# With an Integer argument, behaves like the inherited +at+ (positional
# lookup).  With anything else, returns the first result of
# <tt>search(expr)</tt>.
#
# Integer is tested rather than Fixnum: Fixnum is absent from current
# Ruby versions, while Integer is Fixnum's superclass on the older
# versions, so Fixnum values still match.
def at(expr, &blk)
  if expr.kind_of? Integer
    super
  else
    search(expr, &blk)[0]
  end
end
|
74
|
+
alias_method :%, :at
|
75
|
+
|
76
|
+
# Convert this group of elements into a complete HTML fragment, returned as a
|
77
|
+
# string.
|
78
|
+
def to_html
|
79
|
+
map { |x| x.output("") }.join
|
80
|
+
end
|
81
|
+
alias_method :to_s, :to_html
|
82
|
+
|
83
|
+
# Returns an HTML fragment built of the contents of each element in this list.
|
84
|
+
#
|
85
|
+
# If a HTML +string+ is supplied, this method acts like inner_html=.
|
86
|
+
# Without arguments, joins the +inner_html+ of every element in the list.
# With an argument, assigns the last argument through +inner_html=+ and
# returns that argument.
def inner_html(*string)
  if string.empty?
    map { |x| x.inner_html }.join
  else
    self.inner_html = string.pop
  end
end
|
93
|
+
alias_method :html, :inner_html
|
94
|
+
alias_method :innerHTML, :inner_html
|
95
|
+
|
96
|
+
# Replaces the contents of each element in this list. Supply an HTML +string+,
|
97
|
+
# which is loaded into Hpricot objects and inserted into every element in this
|
98
|
+
# list.
|
99
|
+
def inner_html=(string)
|
100
|
+
each { |x| x.inner_html = string }
|
101
|
+
end
|
102
|
+
alias_method :html=, :inner_html=
|
103
|
+
alias_method :innerHTML=, :inner_html=
|
104
|
+
|
105
|
+
# Returns a string containing the text contents of each element in this list.
|
106
|
+
# All HTML tags are removed.
|
107
|
+
def inner_text
|
108
|
+
map { |x| x.inner_text }.join
|
109
|
+
end
|
110
|
+
alias_method :text, :inner_text
|
111
|
+
|
112
|
+
# Remove all elements in this list from the document which contains them.
|
113
|
+
#
|
114
|
+
# doc = Hpricot("<html>Remove this: <b>here</b></html>")
|
115
|
+
# doc.search("b").remove
|
116
|
+
# doc.to_html
|
117
|
+
# => "<html>Remove this: </html>"
|
118
|
+
#
|
119
|
+
def remove
|
120
|
+
each { |x| x.parent.children.delete(x) }
|
121
|
+
end
|
122
|
+
|
123
|
+
# Empty the elements in this list, by removing their insides.
|
124
|
+
#
|
125
|
+
# doc = Hpricot("<p> We have <i>so much</i> to say.</p>")
|
126
|
+
# doc.search("i").empty
|
127
|
+
# doc.to_html
|
128
|
+
# => "<p> We have <i></i> to say.</p>"
|
129
|
+
#
|
130
|
+
def empty
|
131
|
+
each { |x| x.inner_html = nil }
|
132
|
+
end
|
133
|
+
|
134
|
+
# Add to the end of the contents inside each element in this list.
|
135
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
136
|
+
def append(str = nil, &blk)
|
137
|
+
each { |x| x.html(x.children + x.make(str, &blk)) }
|
138
|
+
end
|
139
|
+
|
140
|
+
# Add to the start of the contents inside each element in this list.
|
141
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
142
|
+
def prepend(str = nil, &blk)
|
143
|
+
each { |x| x.html(x.make(str, &blk) + x.children) }
|
144
|
+
end
|
145
|
+
|
146
|
+
# Add some HTML just previous to each element in this list.
|
147
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
148
|
+
def before(str = nil, &blk)
|
149
|
+
each { |x| x.parent.insert_before x.make(str, &blk), x }
|
150
|
+
end
|
151
|
+
|
152
|
+
# Just after each element in this list, add some HTML.
|
153
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
154
|
+
def after(str = nil, &blk)
|
155
|
+
each { |x| x.parent.insert_after x.make(str, &blk), x }
|
156
|
+
end
|
157
|
+
|
158
|
+
# Wraps each element in the list inside the element created by HTML +str+.
|
159
|
+
# If more than one element is found in the string, Hpricot locates the
|
160
|
+
# deepest spot inside the first element.
|
161
|
+
#
|
162
|
+
# doc.search("a[@href]").
|
163
|
+
# wrap(%{<div class="link"><div class="link_inner"></div></div>})
|
164
|
+
#
|
165
|
+
# This code wraps every link on the page inside a +div.link+ and a +div.link_inner+ nest.
|
166
|
+
# For each element: builds the wrapper nodes with +make+, takes the
# first of them that responds to +children+ as the nesting root,
# substitutes the wrapper for the element in its parent, walks down
# through first children until an empty node is reached and places the
# element there.  Raises if none of the wrapper nodes can hold children.
def wrap(str = nil, &blk)
  each do |x|
    wrapper = x.make(str, &blk)
    nest = wrapper.detect { |w| w.respond_to? :children }
    raise "No wrapping element found." unless nest

    x.parent.replace_child(x, wrapper)
    nest = nest.children.first until nest.empty?
    nest.html([x])
  end
end
|
178
|
+
|
179
|
+
# Gets and sets attributes on all matched elements.
|
180
|
+
#
|
181
|
+
# Pass in a +key+ on its own and this method will return the string value
|
182
|
+
# assigned to that attribute for the first element. Or +nil+ if the
|
183
|
+
# attribute isn't found.
|
184
|
+
#
|
185
|
+
# doc.search("a").attr("href")
|
186
|
+
# #=> "http://hacketyhack.net/"
|
187
|
+
#
|
188
|
+
# Or, pass in a +key+ and +value+. This will set an attribute for all
|
189
|
+
# matched elements.
|
190
|
+
#
|
191
|
+
# doc.search("p").attr("class", "basic")
|
192
|
+
#
|
193
|
+
# You may also use a Hash to set a series of attributes:
|
194
|
+
#
|
195
|
+
# (doc/"a").attr(:class => "basic", :href => "http://hackety.org/")
|
196
|
+
#
|
197
|
+
# Lastly, a block can be used to rewrite an attribute based on the element
|
198
|
+
# it belongs to. The block will pass in an element. Return from the block
|
199
|
+
# the new value of the attribute.
|
200
|
+
#
|
201
|
+
# records.attr("href") { |e| e['href'] + "#top" }
|
202
|
+
#
|
203
|
+
# This example adds a <tt>#top</tt> anchor to each link.
|
204
|
+
#
|
205
|
+
# Getter/setter for attributes on the matched elements.
#
# * With a +value+ or a block, sets +key+ on every element (the block
#   receives the element and returns the new value); returns +self+.
# * With a Hash as +key+, applies each pair in turn; returns +self+.
# * Otherwise returns the attribute of the first element, or +nil+
#   when the list is empty.
def attr key, value = nil, &blk
  if value or blk
    each do |el|
      el.set_attribute(key, value || blk[el])
    end
    return self
  end
  if key.is_a? Hash
    key.each { |k, v| attr(k, v) }
    return self
  else
    first_el = self[0]
    # An empty list has no first element; answer nil instead of failing.
    return first_el && first_el.get_attribute(key)
  end
end
|
219
|
+
alias_method :set, :attr
|
220
|
+
|
221
|
+
# Adds the class to all matched elements.
|
222
|
+
#
|
223
|
+
# (doc/"p").add_class("bacon")
|
224
|
+
#
|
225
|
+
# Now all paragraphs will have class="bacon".
|
226
|
+
# Appends +class_name+ to the space-separated class attribute of every
# element that supports attributes, dropping duplicates.  Returns self.
def add_class class_name
  each do |el|
    if el.respond_to? :get_attribute
      current = el.get_attribute('class').to_s.split(" ")
      current.push(class_name)
      el.set_attribute('class', current.uniq.join(" "))
    end
  end
  self
end
|
234
|
+
|
235
|
+
# Remove an attribute from each of the matched elements.
|
236
|
+
#
|
237
|
+
# (doc/"input").remove_attr("disabled")
|
238
|
+
#
|
239
|
+
# Calls +remove_attribute(name)+ on every element that supports it.
# Returns self.
def remove_attr name
  each { |el| el.remove_attribute(name) if el.respond_to? :remove_attribute }
  self
end
|
246
|
+
|
247
|
+
# Removes a class from all matched elements.
|
248
|
+
#
|
249
|
+
# (doc/"span").remove_class("lightgrey")
|
250
|
+
#
|
251
|
+
# Or, to remove all classes:
|
252
|
+
#
|
253
|
+
# (doc/"span").remove_class
|
254
|
+
#
|
255
|
+
# With +name+, strips that class from the class attribute of every
# element that supports attributes; without it, removes the class
# attribute altogether.  Returns self.
def remove_class name = nil
  each do |el|
    next unless el.respond_to? :get_attribute
    unless name
      el.remove_attribute("class")
      next
    end
    kept = el.get_attribute('class').to_s.split(" ") - [name]
    el.set_attribute('class', kept.uniq.join(" "))
  end
  self
end
|
267
|
+
|
268
|
+
ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i # " (for emacs)
|
269
|
+
BRACK_RE = %r!(\[) *([^\]]*) *\]+!i
|
270
|
+
FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)!
|
271
|
+
CUST_RE = %r!(:)([a-zA-Z0-9\*_-]*)()!
|
272
|
+
CATCH_RE = %r!([:\.#]*)([a-zA-Z0-9\*_-]+)!
|
273
|
+
|
274
|
+
# Applies the leading filter tokens of +expr+ to +nodes+, one token at
# a time, until the expression is exhausted or no token matches.
#
# nodes:: the candidate list.
# expr::  selector text; consumed from the left.
# truth:: when false the filters are inverted (used for :not).
#
# Returns <tt>[remaining_nodes, unconsumed_expr]</tt>.
# Raises ArgumentError when a token names a filter that Traverse does
# not define.
def self.filter(nodes, expr, truth = true)
  until expr.empty?
    match = expr.match(/^(?:#{ATTR_RE}|#{BRACK_RE}|#{FUNC_RE}|#{CUST_RE}|#{CATCH_RE})/)
    break unless match

    expr = match.post_match
    m = match.captures
    m.compact!
    if m[0] == '@'
      m[0] = "@#{m.slice!(2,1).join}"
    end

    # A purely numeric bracket, e.g. [2], is a 1-based positional filter.
    if m[0] == '[' && m[1] =~ /^\d+$/
      m = [":", "nth", m[1].to_i-1]
    end

    if m[0] == ":" && m[1] == "not"
      nodes, = Elements.filter(nodes, m[2], false)
    elsif "#{m[0]}#{m[1]}" =~ /^(:even|:odd)$/
      new_nodes = []
      nodes.each_with_index {|n,i| new_nodes.push(n) if (i % 2 == (m[1] == "even" ? 0 : 1)) }
      nodes = new_nodes
    elsif "#{m[0]}#{m[1]}" =~ /^(:first|:last)$/
      nodes = [nodes.send(m[1])]
    else
      # Start every token from a clean slate; otherwise a method name or
      # argument list left over from the previous token could be reused.
      meth = args = nil
      meth = "filter[#{m[0]}#{m[1]}]" unless m[0].empty?
      if meth and Traverse.method_defined? meth
        args = m[2..-1]
      else
        meth = "filter[#{m[0]}]"
        if Traverse.method_defined? meth
          args = m[1..-1]
        end
      end
      unless args
        raise ArgumentError, "unsupported filter `#{m[0]}#{m[1]}' in selector"
      end
      # The last argument is a running index, bumped before each node is tested.
      args << -1
      nodes = Elements[*nodes.find_all do |x|
        args[-1] += 1
        x.send(meth, *args) ? truth : !truth
      end]
    end
  end
  [nodes, expr]
end
|
316
|
+
|
317
|
+
# Given two elements, attempt to gather an Elements array of everything between
|
318
|
+
# (and including) those two elements.
|
319
|
+
# Collects the run of sibling nodes bounded by +ele1+ and +ele2+.
#
# * Same parent: the parent's children from +ele1+ through +ele2+.
# * One element is the deepest shared ancestor of both: that
#   ancestor's children from the first one through the other element.
# * Anything else (including a nil argument): an empty list.
#
# With +excl+ true the closing element itself is left out.
# Always returns an Elements.
def self.expand(ele1, ele2, excl=false)
  return Elements[*[]] unless ele1 and ele2

  offset = excl ? -1 : 0

  if ele1.parent == ele2.parent
    from, upto = ele1.node_position, ele2.node_position + offset
    return Elements[*ele1.parent.children[from..upto]]
  end

  # Root-first chain of ancestors ending in the element itself.
  lineage = lambda do |ele|
    path, cur = [ele], ele
    path.unshift cur while cur.respond_to?(:parent) and cur = cur.parent
    path
  end

  shared = lineage[ele1].zip(lineage[ele2]).select { |p1, p2| p1 == p2 }
  common_parent = shared.flatten.last

  child =
    if ele1 == common_parent
      ele2
    elsif ele2 == common_parent
      ele1
    end

  ary = child ? common_parent.children[0..(child.node_position + offset)] : []
  Elements[*ary]
end
|
350
|
+
|
351
|
+
# Runs Elements.filter over this list and returns only the node list
# (the first element of its result pair).
def filter(expr)
  Elements.filter(self, expr)[0]
end
|
355
|
+
|
356
|
+
# Given a node (anything that is a Traverse), returns this list without
# it.  Given a selector, returns the nodes Elements.filter yields with
# the filters inverted.
def not(expr)
  return self - [expr] if expr.is_a? Traverse

  remaining, = Elements.filter(self, expr, false)
  remaining
end
|
364
|
+
|
365
|
+
private
|
366
|
+
# Copies every instance variable of +l+ onto +node+.
def copy_node(node, l)
  l.instance_variables.each { |ivar| node.instance_variable_set(ivar, l.instance_variable_get(ivar)) }
end
|
371
|
+
|
372
|
+
end
|
373
|
+
|
374
|
+
module Traverse
|
375
|
+
def self.filter(tok, &blk)
|
376
|
+
define_method("filter[#{tok.is_a?(String) ? tok : tok.inspect}]", &blk)
|
377
|
+
end
|
378
|
+
|
379
|
+
filter '' do |name,i|
|
380
|
+
name == '*' || (self.respond_to?(:name) && self.name.downcase == name.downcase)
|
381
|
+
end
|
382
|
+
|
383
|
+
filter '#' do |id,i|
|
384
|
+
self.elem? and get_attribute('id').to_s == id
|
385
|
+
end
|
386
|
+
|
387
|
+
filter '.' do |name,i|
|
388
|
+
self.elem? and classes.include? name
|
389
|
+
end
|
390
|
+
|
391
|
+
filter :lt do |num,i|
|
392
|
+
self.position < num.to_i
|
393
|
+
end
|
394
|
+
|
395
|
+
filter :gt do |num,i|
|
396
|
+
self.position > num.to_i
|
397
|
+
end
|
398
|
+
|
399
|
+
nth = proc { |num,i| self.position == num.to_i }
|
400
|
+
nth_first = proc { |*a| self.position == 0 }
|
401
|
+
nth_last = proc { |*a| self == parent.children_of_type(self.name).last }
|
402
|
+
|
403
|
+
filter :nth, &nth
|
404
|
+
filter :eq, &nth
|
405
|
+
filter ":nth-of-type", &nth
|
406
|
+
|
407
|
+
filter :first, &nth_first
|
408
|
+
filter ":first-of-type", &nth_first
|
409
|
+
|
410
|
+
filter :last, &nth_last
|
411
|
+
filter ":last-of-type", &nth_last
|
412
|
+
|
413
|
+
filter :even do |num,i|
|
414
|
+
self.position % 2 == 0
|
415
|
+
end
|
416
|
+
|
417
|
+
filter :odd do |num,i|
|
418
|
+
self.position % 2 == 1
|
419
|
+
end
|
420
|
+
|
421
|
+
filter ':first-child' do |i|
|
422
|
+
self == parent.containers.first
|
423
|
+
end
|
424
|
+
|
425
|
+
filter ':nth-child' do |arg,i|
|
426
|
+
case arg
|
427
|
+
when 'even'; (parent.containers.index(self) + 1) % 2 == 0
|
428
|
+
when 'odd'; (parent.containers.index(self) + 1) % 2 == 1
|
429
|
+
else self == (parent.containers[arg.to_i - 1])
|
430
|
+
end
|
431
|
+
end
|
432
|
+
|
433
|
+
filter ":last-child" do |i|
|
434
|
+
self == parent.containers.last
|
435
|
+
end
|
436
|
+
|
437
|
+
filter ":nth-last-child" do |arg,i|
|
438
|
+
self == parent.containers[-1-arg.to_i]
|
439
|
+
end
|
440
|
+
|
441
|
+
filter ":nth-last-of-type" do |arg,i|
|
442
|
+
self == parent.children_of_type(self.name)[-1-arg.to_i]
|
443
|
+
end
|
444
|
+
|
445
|
+
filter ":only-of-type" do |arg,i|
|
446
|
+
parent.children_of_type(self.name).length == 1
|
447
|
+
end
|
448
|
+
|
449
|
+
filter ":only-child" do |arg,i|
|
450
|
+
parent.containers.length == 1
|
451
|
+
end
|
452
|
+
|
453
|
+
filter :parent do |*a|
|
454
|
+
containers.length > 0
|
455
|
+
end
|
456
|
+
|
457
|
+
filter :empty do |*a|
|
458
|
+
elem? && inner_html.length == 0
|
459
|
+
end
|
460
|
+
|
461
|
+
filter :root do |*a|
|
462
|
+
self.is_a? Hpricot::Doc
|
463
|
+
end
|
464
|
+
|
465
|
+
filter 'text' do |*a|
|
466
|
+
self.text?
|
467
|
+
end
|
468
|
+
|
469
|
+
filter 'comment' do |*a|
|
470
|
+
self.comment?
|
471
|
+
end
|
472
|
+
|
473
|
+
filter :contains do |arg, ignore|
|
474
|
+
html.include? arg
|
475
|
+
end
|
476
|
+
|
477
|
+
|
478
|
+
|
479
|
+
pred_procs =
|
480
|
+
{'text()' => proc { |ele, *_| ele.inner_text.strip },
|
481
|
+
'@' => proc { |ele, attr, *_| ele.get_attribute(attr).to_s if ele.elem? }}
|
482
|
+
|
483
|
+
# Comparison operators usable inside attribute/text predicates.  Each
# proc receives the actual value +a+ and the expected value +b+ and
# returns a truthy result on a match.
#
# '|=' and '$=' anchor with \A and \z so they test the start and end of
# the whole value; ^ and $ would also match at line breaks inside it.
oper_procs =
  {'='  => proc { |a,b| a == b },
   '!=' => proc { |a,b| a != b },
   '~=' => proc { |a,b| a.split(/\s+/).include?(b) },
   '|=' => proc { |a,b| a =~ /\A#{Regexp::quote b}(-|\z)/ },
   '^=' => proc { |a,b| a.index(b) == 0 },
   '$=' => proc { |a,b| a =~ /#{Regexp::quote b}\z/ },
   '*=' => proc { |a,b| a.index(b) }}
|
491
|
+
|
492
|
+
pred_procs.each do |pred_n, pred_f|
|
493
|
+
oper_procs.each do |oper_n, oper_f|
|
494
|
+
filter "#{pred_n}#{oper_n}" do |*a|
|
495
|
+
qual = pred_f[self, *a]
|
496
|
+
oper_f[qual, a[-2]] if qual
|
497
|
+
end
|
498
|
+
end
|
499
|
+
end
|
500
|
+
|
501
|
+
filter 'text()' do |val,i|
|
502
|
+
self.children.grep(Hpricot::Text).detect { |x| x.content =~ /\S/ } if self.children
|
503
|
+
end
|
504
|
+
|
505
|
+
filter '@' do |attr,val,i|
|
506
|
+
self.elem? and has_attribute? attr
|
507
|
+
end
|
508
|
+
|
509
|
+
filter '[' do |val,i|
|
510
|
+
self.elem? and search(val).length > 0
|
511
|
+
end
|
512
|
+
|
513
|
+
end
|
514
|
+
end
|