hpricot 0.4-mswin32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +34 -0
- data/COPYING +18 -0
- data/README +6 -0
- data/Rakefile +166 -0
- data/ext/hpricot_scan/extconf.rb +6 -0
- data/ext/hpricot_scan/hpricot_scan.c +5964 -0
- data/ext/hpricot_scan/hpricot_scan.h +79 -0
- data/ext/hpricot_scan/hpricot_scan.rl +300 -0
- data/extras/mingw-rbconfig.rb +176 -0
- data/lib/hpricot.rb +6 -0
- data/lib/hpricot/elements.rb +292 -0
- data/lib/hpricot/htmlinfo.rb +672 -0
- data/lib/hpricot/inspect.rb +90 -0
- data/lib/hpricot/modules.rb +37 -0
- data/lib/hpricot/parse.rb +286 -0
- data/lib/hpricot/tag.rb +146 -0
- data/lib/hpricot/text.rb +115 -0
- data/lib/hpricot/traverse.rb +511 -0
- data/lib/hpricot_scan.so +0 -0
- data/test/files/basic.xhtml +17 -0
- data/test/files/boingboing.html +2266 -0
- data/test/files/immob.html +400 -0
- data/test/files/uswebgen.html +220 -0
- data/test/load_files.rb +7 -0
- data/test/test_parser.rb +141 -0
- metadata +72 -0
data/lib/hpricot/text.rb
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
require 'hpricot/modules'
|
2
|
+
require 'hpricot/raw_string'
|
3
|
+
require 'hpricot/htmlinfo'
|
4
|
+
require 'hpricot/encoder'
|
5
|
+
require 'hpricot/fstr'
|
6
|
+
require 'iconv'
|
7
|
+
|
8
|
+
module Hpricot
|
9
|
+
class Text
|
10
|
+
# :stopdoc:
class << self
  # Keep the stock constructor reachable under another name; Text.new is
  # redefined below as a type-dispatching factory.
  alias new_internal new
end
# :startdoc:

# Builds a Text from +arg+, which may be
# - a String holding literal character data,
# - another Hpricot::Text, or
# - a Hpricot::Location (converted with +to_node+ first).
#
# A String is literal text, so every '&' in it is escaped as "&amp;" before
# it is stored as rcdata; otherwise literal text such as "&lt;" would later
# be decoded as a character reference.
#
# Raises TypeError for any other kind of argument.
def Text.new(arg)
  arg = arg.to_node if Hpricot::Location === arg
  case arg
  when Text
    new_internal arg.rcdata, arg.normalized_rcdata
  when String
    escaped = arg.gsub(/&/, '&amp;')
    # Only swap in the escaped copy when escaping actually changed something.
    arg = escaped.freeze if arg != escaped
    new_internal arg
  else
    raise TypeError, "cannot initialize Text with #{arg.inspect}"
  end
end
|
28
|
+
|
29
|
+
# Stores the raw character data together with its normalized form.
# When +normalized_rcdata+ is omitted it is computed with internal_normalize.
def initialize(rcdata, normalized_rcdata=internal_normalize(rcdata)) # :notnew:
  init_raw_string
  @rcdata = rcdata && Hpricot.frozen_string(rcdata)
  # Reuse the same string object when both forms compare equal.
  @normalized_rcdata =
    if @rcdata == normalized_rcdata
      @rcdata
    else
      normalized_rcdata
    end
end
attr_reader :rcdata, :normalized_rcdata
|
35
|
+
|
36
|
+
# Returns a frozen, normalized copy of +rcdata+.
# - character references are decoded as much as possible.
# - a reference to '&' (code point 38) is kept escaped as "&#38;" so the
#   result never contains a bare ampersand that could start a new reference.
# - unknown names and out-of-range code points become '?'.
# - undecodable character references are converted to decimal numeric
#   character references.
def internal_normalize(rcdata)
  result = rcdata.gsub(/&(?:#([0-9]+)|#x([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));/o) {|s|
    u = nil
    if $1
      u = $1.to_i
    elsif $2
      u = $2.hex
    elsif $3
      u = NamedCharacters[$3]
    end
    if !u || u < 0 || 0x7fffffff < u
      '?'
    elsif u == 38 # '&' character.
      '&#38;'
    elsif u <= 0x7f
      [u].pack("C")
    else
      begin
        Iconv.conv(Encoder.internal_charset, 'UTF-8', [u].pack("U"))
      rescue Iconv::Failure
        # Not representable in the internal charset: keep it as a reference.
        "&##{u};"
      end
    end
  }
  Hpricot.frozen_string(result)
end
private :internal_normalize
|
65
|
+
|
66
|
+
# Hpricot::Text#to_s converts the text to a string.
# - decimal character references left in the normalized data are decoded
#   when they fall in the 0..0x7f range.
# - any other remaining decimal reference is converted to `?' character.
def to_s
  @normalized_rcdata.gsub(/&(?:#([0-9]+));/o) do |_reference|
    code = $1.to_i
    (0..0x7f).include?(code) ? [code].pack("C") : '?'
  end
end
|
79
|
+
|
80
|
+
# True when the normalized character data contains no characters.
def empty?
  @normalized_rcdata.size.zero?
end
|
83
|
+
|
84
|
+
# Returns a Text with leading and trailing blanks removed. Besides ordinary
# whitespace, literal "&nbsp;" references at either end are also removed.
# Returns +self+ when there is nothing to strip.
def strip
  stripped = @normalized_rcdata.dup
  stripped.sub!(/\A(?:\s|&nbsp;)+/, '')
  stripped.sub!(/(?:\s|&nbsp;)+\z/, '')
  return self if stripped == @normalized_rcdata
  stripped.freeze
  # The stripped data is already normalized, so it serves as both forms.
  Text.new_internal(stripped, stripped)
end
|
95
|
+
|
96
|
+
# Hpricot::Text.concat returns a text which is the concatenation of its
# arguments.
#
# Each argument should be one of the following.
# - String (treated as literal text: every '&' is escaped as "&amp;")
# - Hpricot::Text (its rcdata is appended unchanged)
# - Hpricot::Location which points to a Hpricot::Text
def Text.concat(*args)
  rcdata = ''
  args.each {|arg|
    arg = arg.to_node if Hpricot::Location === arg
    if Text === arg
      rcdata << arg.rcdata
    else
      # Literal text must not introduce character references of its own.
      rcdata << arg.gsub(/&/, '&amp;')
    end
  }
  new_internal rcdata
end
|
114
|
+
end
|
115
|
+
end
|
@@ -0,0 +1,511 @@
|
|
1
|
+
require 'hpricot/elements'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
module Hpricot
|
5
|
+
module Traverse
  # Node-kind predicates: each one tests +self+ against the matching
  # Trav module.
  def doc?
    Doc::Trav === self
  end

  def elem?
    Elem::Trav === self
  end

  def text?
    Text::Trav === self
  end

  def xmldecl?
    XMLDecl::Trav === self
  end

  def doctype?
    DocType::Trav === self
  end

  def procins?
    ProcIns::Trav === self
  end

  def comment?
    Comment::Trav === self
  end

  def bogusetag?
    BogusETag::Trav === self
  end

  # Serializes the node by calling +output+ with a fresh empty string.
  def to_html
    output("")
  end
  alias_method :to_s, :to_html

  # Follows +indexes+ one level at a time, starting at +self+, and returns
  # the node reached. With no indexes it returns +self+.
  def get_subnode(*indexes)
    indexes.inject(self) { |node, index| node.get_subnode_internal(index) }
  end
end
|
28
|
+
|
29
|
+
module Container::Trav
|
30
|
+
# Returns the children that are themselves containers.
def containers
  children.select { |child| Container::Trav === child }
end
|
33
|
+
# Replaces the child +old+ with +new+, which may be a single node or an
# array of nodes.
def replace_child(old, new)
  position = children.index(old)
  replacement = [*new]
  children[position, 1] = replacement
end
|
36
|
+
# Inserts +nodes+ (one node, or an array of nodes, possibly nested)
# immediately before the child +ele+, keeping the given order.
# When +ele+ is not a child the nodes go to the front of the children.
# Returns +nodes+.
def insert_before(nodes, ele)
  # Resolve the position once and splice everything in a single step, so an
  # array keeps its order even when +ele+ is missing (inserting the nodes
  # one by one at index 0 would reverse them).
  position = children.index(ele) || 0
  children[position, 0] = Array === nodes ? nodes.flatten : [nodes]
  nodes
end
|
44
|
+
# Inserts +nodes+ (one node, or an array of nodes, possibly nested)
# immediately after the child +ele+, keeping the given order.
# When +ele+ is not a child the nodes are appended.
# Returns +nodes+.
def insert_after(nodes, ele)
  # Splice the whole list at once: inserting the nodes one by one right
  # after +ele+ would leave them in reverse order.
  found = children.index(ele)
  position = found ? found + 1 : children.length
  children[position, 0] = Array === nodes ? nodes.flatten : [nodes]
  nodes
end
|
53
|
+
# Returns the serialized children joined into one string.
def inner_html
  pieces = children.map { |child| child.output("") }
  pieces.join
end
|
56
|
+
alias_method :innerHTML, :inner_html
|
57
|
+
# Replaces the children of this node.
# - String or IO: the content is parsed with Hpricot.parse and the parsed
#   document's children are used.
# - Array: used as the new children as-is.
# - nil: the children are cleared.
# Any other value leaves the children untouched.
def inner_html=(inner)
  if String === inner || IO === inner
    self.children = Hpricot.parse(inner).children
  elsif Array === inner
    self.children = inner
  elsif inner.nil?
    self.children = []
  end
end
|
67
|
+
alias_method :innerHTML=, :inner_html=
|
68
|
+
# Evaluates the path/selector +expr+ against this node.
#
# The expression is consumed from left to right. Each pass handles one
# leading step and then hands the remainder to Elements.filter:
# - ".."       moves every current node to its parent
# - ">" or "/" moves to the container children
# - "+"        moves to the next sibling container
# - "~"        moves to all following sibling containers
# - "|" or "," closes the current alternative and restarts from +self+
# - "#id"      selects the element with that id
# - "name"     selects descendant elements with that tag name
# - "*", ".cls" or an empty name select every descendant element
#   (".cls" keeps only those whose class attribute lists +cls+)
#
# With a block, each match is yielded and +self+ is returned; otherwise an
# Elements list of the matches is returned.
# Raises ArgumentError when a pass cannot consume any of the expression.
def search(expr, &blk)
  nodes = [self]
  done = []
  expr = expr.to_s
  until expr.empty?
    before = expr
    expr = clean_path(expr)
    expr.gsub!(%r!^//!, '')
    # Only whitespace (or a bare "//") was left: nothing more to select.
    break if expr.empty?

    case expr
    when %r!^/?\.\.!
      expr = $'
      nodes.map! { |node| node.parent }
    when %r!^[>/]!
      expr = $'
      nodes = Elements[*nodes.map { |node| node.containers }.flatten]
    when %r!^\+!
      expr = $'
      nodes.map! do |node|
        siblings = node.parent.containers
        siblings[siblings.index(node)+1]
      end
      nodes.compact!
    when %r!^~!
      expr = $'
      nodes.map! do |node|
        siblings = node.parent.containers
        siblings[(siblings.index(node)+1)..-1]
      end
      nodes.flatten!
    when %r!^[|,]!
      expr = " #$'"
      nodes.shift if nodes.first == self
      done += nodes
      nodes = [self]
    else
      m = expr.match %r!^([#.]?)([a-z0-9\\*_-]*)!i
      expr = $'
      # MatchData cannot be assigned to, so work on plain locals.
      prefix, name = m[1], m[2]
      if prefix == '#'
        oid = get_element_by_id(name)
        nodes = oid ? [oid] : []
      else
        wildcard = name == "" || name == "*" || prefix == "."
        ret = []
        nodes.each do |node|
          if wildcard
            node.traverse_element { |e| ret << e }
          else
            ret += [*node.get_elements_by_tag_name(name)]
          end
        end
        if prefix == "."
          # The class name has been consumed from expr, so apply it here.
          ret = ret.select { |e|
            e.get_attribute('class').to_s.split(/\s+/).include?(name)
          }
        end
        nodes = ret
      end
    end

    nodes, expr = Elements.filter(nodes, expr)
    # Neither the step above nor the filter consumed anything: looping
    # again could never terminate.
    if expr == before
      raise ArgumentError, "cannot parse selector at #{expr.inspect}"
    end
  end
  nodes = done + nodes.flatten.uniq
  if blk
    nodes.each(&blk)
    self
  else
    Elements[*nodes]
  end
end
|
133
|
+
alias_method :/, :search
|
134
|
+
|
135
|
+
# Runs +search+ with the same arguments and returns +first+ of its result.
def at(expr, &blk)
  found = search(expr, &blk)
  found.first
end
|
138
|
+
alias_method :%, :at
|
139
|
+
|
140
|
+
# Returns a copy of +path+ with whitespace removed at the start and end of
# every line.
def clean_path(path)
  edge_whitespace = /^\s+|\s+$/
  path.gsub(edge_whitespace, '')
end
|
143
|
+
|
144
|
+
# +each_child+ passes every child to the given block, in order.
# It always returns nil.
def each_child(&block) # :yields: child_node
  kids = children
  kids.each(&block)
  nil
end
|
149
|
+
|
150
|
+
# +each_child_with_index+ passes every child and its position to the given
# block, in order. It always returns nil.
def each_child_with_index(&block) # :yields: child_node, index
  kids = children
  kids.each_with_index(&block)
  nil
end
|
155
|
+
|
156
|
+
# +find_element+ returns the first element that +traverse_element+ yields
# for the given universal names.
# It returns nil if not found.
def find_element(*names)
  traverse_element(*names) do |element|
    # Leaving the method here stops the traversal at the first match.
    return element
  end
  nil
end
|
163
|
+
|
164
|
+
# +traverse_element+ traverses elements in the tree.
|
165
|
+
# It yields elements in depth first order.
|
166
|
+
#
|
167
|
+
# If _names_ are empty, it yields all elements.
|
168
|
+
# If non-empty _names_ are given, it should be list of universal names.
|
169
|
+
#
|
170
|
+
# A nested element is yielded in depth first order as follows.
|
171
|
+
#
|
172
|
+
# t = Hpricot('<a id=0><b><a id=1 /></b><c id=2 /></a>')
|
173
|
+
# t.traverse_element("a", "c") {|e| p e}
|
174
|
+
# # =>
|
175
|
+
# {elem <a id="0"> {elem <b> {emptyelem <a id="1">} </b>} {emptyelem <c id="2">} </a>}
|
176
|
+
# {emptyelem <a id="1">}
|
177
|
+
# {emptyelem <c id="2">}
|
178
|
+
#
|
179
|
+
# Universal names are specified as follows.
|
180
|
+
#
|
181
|
+
# t = Hpricot(<<'End')
|
182
|
+
# <html>
|
183
|
+
# <meta name="robots" content="index,nofollow">
|
184
|
+
# <meta name="author" content="Who am I?">
|
185
|
+
# </html>
|
186
|
+
# End
|
187
|
+
# t.traverse_element("{http://www.w3.org/1999/xhtml}meta") {|e| p e}
|
188
|
+
# # =>
|
189
|
+
# {emptyelem <{http://www.w3.org/1999/xhtml}meta name="robots" content="index,nofollow">}
|
190
|
+
# {emptyelem <{http://www.w3.org/1999/xhtml}meta name="author" content="Who am I?">}
|
191
|
+
#
|
192
|
+
def traverse_element(*names, &block) # :yields: element
  if names.empty?
    traverse_all_element(&block)
  else
    # The names are handed over as a hash used as a set.
    name_set = names.inject({}) { |set, name| set[name] = true; set }
    traverse_some_element(name_set, &block)
  end
  nil
end
|
202
|
+
|
203
|
+
# Returns the whitespace-separated names in the +class+ attribute as an
# array (empty when the attribute is missing or blank).
def classes
  raw = get_attribute('class').to_s
  raw.strip.split(/\s+/)
end
|
206
|
+
|
207
|
+
# Returns the first element, in traversal order, whose +id+ attribute
# equals +id+ after +to_s+, or nil when no element matches.
def get_element_by_id(id)
  traverse_all_element do |candidate|
    value = candidate.get_attribute('id')
    next unless value
    return candidate if value.to_s == id
  end
  nil
end
|
215
|
+
|
216
|
+
# Collects, in traversal order, every element whose name is one of the
# given tags, either bare or qualified with the XHTML namespace.
# Returns an Elements list.
def get_elements_by_tag_name(*a)
  wanted = a.map { |tag| [tag, "{http://www.w3.org/1999/xhtml}#{tag}"] }.flatten
  matches = Elements[]
  traverse_element(*wanted) { |element| matches << element }
  matches
end
|
223
|
+
|
224
|
+
# Visits the XHTML elements that can carry a hyperlink and yields
# (element, attribute name, attribute value) for every link-bearing
# attribute that is present on them.
def each_hyperlink_attribute
  traverse_element(
    '{http://www.w3.org/1999/xhtml}a',
    '{http://www.w3.org/1999/xhtml}area',
    '{http://www.w3.org/1999/xhtml}link',
    '{http://www.w3.org/1999/xhtml}img',
    '{http://www.w3.org/1999/xhtml}object',
    '{http://www.w3.org/1999/xhtml}q',
    '{http://www.w3.org/1999/xhtml}blockquote',
    '{http://www.w3.org/1999/xhtml}ins',
    '{http://www.w3.org/1999/xhtml}del',
    '{http://www.w3.org/1999/xhtml}form',
    '{http://www.w3.org/1999/xhtml}input',
    '{http://www.w3.org/1999/xhtml}head',
    '{http://www.w3.org/1999/xhtml}base',
    '{http://www.w3.org/1999/xhtml}script') do |elem|
    # Pick the attributes that hold links for this kind of element.
    attrs =
      case elem.name
      when %r{\{http://www.w3.org/1999/xhtml\}(?:base|a|area|link)\z}i
        ['href']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:img)\z}i
        ['src', 'longdesc', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:object)\z}i
        ['classid', 'codebase', 'data', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:q|blockquote|ins|del)\z}i
        ['cite']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:form)\z}i
        ['action']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:input)\z}i
        ['src', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:head)\z}i
        ['profile']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:script)\z}i
        ['src', 'for']
      end
    attrs.each do |attr|
      hyperlink = elem.get_attribute(attr)
      yield elem, attr, hyperlink if hyperlink
    end
  end
end
private :each_hyperlink_attribute
|
266
|
+
|
267
|
+
# +each_hyperlink_uri+ traverses hyperlinks such as the HTML href attribute
# of the A element.
#
# It yields the hyperlink value and a URI for each hyperlink.
#
# The URI objects are resolved against a base URI, which is given by the
# HTML BASE element or by the argument ((|base_uri|)); a BASE element found
# in the document replaces the argument. Without any base, each hyperlink
# is parsed on its own.
# +each_hyperlink_uri+ doesn't yield the href of the BASE element.
def each_hyperlink_uri(base_uri=nil) # :yields: hyperlink, uri
  base_uri = URI.parse(base_uri) if String === base_uri
  links = []
  # First pass: collect the links and pick up a BASE href if there is one.
  each_hyperlink_attribute do |elem, attr, hyperlink|
    if %r{\{http://www.w3.org/1999/xhtml\}(?:base)\z}i =~ elem.name
      base_uri = URI.parse(hyperlink.to_s)
    else
      links << hyperlink
    end
  end
  # Second pass: the base is final now, so every link can be resolved.
  links.each do |hyperlink|
    target = hyperlink.to_s
    uri = base_uri ? base_uri + target : URI.parse(target)
    yield hyperlink, uri
  end
end
|
291
|
+
|
292
|
+
# +each_hyperlink+ traverses hyperlinks such as the HTML href attribute
# of the A element.
#
# It yields the value of each hyperlink attribute.
#
# Note that +each_hyperlink+ also yields the href attribute of the BASE
# element.
def each_hyperlink # :yields: text
  each_hyperlink_attribute {|elem, attr, hyperlink|
    yield hyperlink
  }
end
|
304
|
+
|
305
|
+
# +each_uri+ traverses hyperlinks such as the HTML href attribute
# of the A element.
#
# It yields only the URI for each hyperlink.
#
# The URI objects are produced by +each_hyperlink_uri+, which receives
# the argument ((|base_uri|)) unchanged.
def each_uri(base_uri=nil) # :yields: URI
  each_hyperlink_uri(base_uri) do |_hyperlink, uri|
    yield uri
  end
end
|
315
|
+
end
|
316
|
+
|
317
|
+
# :stopdoc:
|
318
|
+
module Doc::Trav
  # A document yields nothing itself; it forwards the traversal to each of
  # its children.
  def traverse_all_element(&block)
    children.each do |child|
      child.traverse_all_element(&block)
    end
  end
end
|
323
|
+
|
324
|
+
module Elem::Trav
  # Yields this element first, then continues into each child.
  def traverse_all_element(&block)
    yield self
    children.each do |child|
      child.traverse_all_element(&block)
    end
  end
end
|
330
|
+
|
331
|
+
module Leaf::Trav
  # A leaf yields nothing during element traversal, so the body is
  # intentionally empty.
  def traverse_all_element
  end
end
|
335
|
+
|
336
|
+
module Doc::Trav
  # A document yields nothing itself; it forwards the name-restricted
  # traversal to each of its children.
  def traverse_some_element(name_set, &block)
    children.each do |child|
      child.traverse_some_element(name_set, &block)
    end
  end
end
|
341
|
+
|
342
|
+
module Elem::Trav
  # Yields this element when its name is in +name_set+, then continues into
  # each child regardless of whether this element matched.
  def traverse_some_element(name_set, &block)
    yield self if name_set.include?(self.name)
    children.each do |child|
      child.traverse_some_element(name_set, &block)
    end
  end
end
|
348
|
+
|
349
|
+
module Leaf::Trav
  # A leaf yields nothing during element traversal, so the body is
  # intentionally empty.
  def traverse_some_element(name_set)
  end
end
|
353
|
+
# :startdoc:
|
354
|
+
|
355
|
+
module Traverse
  # +traverse_text+ passes the given block on to +traverse_text_internal+,
  # which yields the texts in the tree. It always returns nil.
  def traverse_text(&block) # :yields: text
    traverse_text_internal(&block)
    nil
  end
end
|
362
|
+
|
363
|
+
# :stopdoc:
|
364
|
+
module Container::Trav
  # A container has no text of its own here; it forwards the text traversal
  # to each child.
  def traverse_text_internal(&block)
    each_child do |child|
      child.traverse_text_internal(&block)
    end
  end
end
|
369
|
+
|
370
|
+
module Leaf::Trav
  # Default for leaves: yields nothing. Text::Trav supplies its own version
  # that yields the text node.
  def traverse_text_internal
  end
end
|
374
|
+
|
375
|
+
module Text::Trav
  # A text node is what the text traversal is looking for: yield it.
  def traverse_text_internal
    yield self
  end
end
|
380
|
+
# :startdoc:
|
381
|
+
|
382
|
+
module Container::Trav
|
383
|
+
# +filter+ rebuilds the tree without some components.
#
#   node.filter {|descendant_node| predicate } -> node
#   loc.filter {|descendant_loc| predicate } -> node
#
# +filter+ yields each node except the top node.
# If the given block returns false, the corresponding node is dropped.
# If the given block returns true, the corresponding node is retained and,
# when it is an element, its inner nodes are examined in the same way.
#
# +filter+ returns the result of +subst_subnode+ called on +to_node+, so
# it doesn't return a location object even if self is a location object.
#
def filter(&block)
  subst = {}
  each_child_with_index do |descendant, i|
    # Every child index gets an entry: nil marks a dropped child.
    subst[i] =
      if !yield(descendant)
        nil
      elsif descendant.elem?
        descendant.filter(&block)
      else
        descendant
      end
  end
  to_node.subst_subnode(subst)
end
|
411
|
+
end
|
412
|
+
|
413
|
+
module Doc::Trav
|
414
|
+
# +title+ searches for the title and returns its extracted text.
# It returns nil if not found.
#
# +title+ searches the following information.
#
# - <title>...</title> in HTML
# - <title>...</title> in RSS
def title
  candidates = [
    'title',
    '{http://www.w3.org/1999/xhtml}title',
    '{http://purl.org/rss/1.0/}title',
    '{http://my.netscape.com/rdf/simple/0.9/}title'
  ]
  element = find_element(*candidates)
  element ? element.extract_text : element
end
|
428
|
+
|
429
|
+
# +author+ searches for the author and returns it as a stripped value.
# It returns nil if not found.
#
# +author+ searches the following information, in this order, and returns
# the first non-empty value.
#
# - <meta name="author" content="author-name"> in HTML
# - <link rev="made" title="author-name"> in HTML
# - <dc:creator>author-name</dc:creator> in RSS
# - <dc:publisher>author-name</dc:publisher> in RSS
def author
  # HTML sources: [bare name, qualified name, key attribute, expected key
  # value, attribute that carries the author].
  html_sources = [
    ['meta', '{http://www.w3.org/1999/xhtml}meta', 'name', 'author', 'content'],
    ['link', '{http://www.w3.org/1999/xhtml}link', 'rev', 'made', 'title']
  ]
  html_sources.each do |bare, qualified, key_attr, key_value, value_attr|
    traverse_element(bare, qualified) do |e|
      begin
        next unless e.fetch_attr(key_attr).downcase == key_value
        author = e.fetch_attribute(value_attr).strip
        return author if !author.empty?
      rescue IndexError
        # An IndexError skips this element.
      end
    end
  end

  if channel = find_element('{http://purl.org/rss/1.0/}channel')
    rss_sources = [
      '{http://purl.org/dc/elements/1.1/}creator',
      '{http://purl.org/dc/elements/1.1/}publisher'
    ]
    rss_sources.each do |dc_name|
      channel.traverse_element(dc_name) do |e|
        begin
          author = e.extract_text.strip
          return author if !author.empty?
        rescue IndexError
        end
      end
    end
  end

  nil
end
|
478
|
+
|
479
|
+
end
|
480
|
+
|
481
|
+
module Doc::Trav
|
482
|
+
# Returns the single top-level element of the document.
# Raises Hpricot::Error when there is no top-level element or more than one.
def root
  top_elements = children.select { |child| child.elem? }
  case top_elements.length
  when 0
    raise Hpricot::Error, "no element"
  when 1
    top_elements[0]
  else
    raise Hpricot::Error, "multiple top elements"
  end
end
|
489
|
+
end
|
490
|
+
|
491
|
+
module Elem::Trav
|
492
|
+
# Tells whether an attribute called +name+ (compared as a String) is set.
# The result is falsy when the element has no attribute hash at all.
def has_attribute?(name)
  attrs = self.attributes
  attrs && attrs.has_key?(name.to_s)
end
|
495
|
+
# Returns the value stored for attribute +name+ (looked up as a String).
# The result is nil when the attribute is missing, and falsy when the
# element has no attribute hash.
def get_attribute(name)
  attrs = self.attributes
  attrs && attrs[name.to_s]
end
|
498
|
+
alias_method :[], :get_attribute
|
499
|
+
# Stores +val+ under attribute +name+ (as a String key), creating the
# attribute hash first when the element has none. Returns +val+.
def set_attribute(name, val)
  self.attributes = {} unless self.attributes
  self.attributes[name.to_s] = val
end
|
503
|
+
alias_method :[]=, :set_attribute
|
504
|
+
# Removes attribute +name+ and returns its former value, or nil when the
# attribute is not set.
def remove_attribute(name)
  if has_attribute? name
    # Attribute keys are Strings; delete with the same key form that
    # has_attribute? checks, so Symbol names are removed as well.
    self.attributes.delete(name.to_s)
  end
end
|
509
|
+
end
|
510
|
+
|
511
|
+
end
|