rubyful_soup 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +12 -0
- data/lib/rubyful_soup.rb +925 -0
- data/tests/rubyful_soup_tests.rb +431 -0
- metadata +52 -0
data/CHANGELOG
ADDED
data/lib/rubyful_soup.rb
ADDED
@@ -0,0 +1,925 @@
|
|
1
|
+
#Rubyful Soup
|
2
|
+
#Elixir and Tonic
|
3
|
+
#"The Screen-Scraper's Friend"
|
4
|
+
#v1.0.1
|
5
|
+
#http://www.crummy.com/software/RubyfulSoup/
|
6
|
+
#
|
7
|
+
#Rubyful Soup is a port to the Ruby language and idiom of the Python
|
8
|
+
#library Beautiful Soup.
|
9
|
+
#See http://www.crummy.com/software/BeautifulSoup/ for details on the original.
|
10
|
+
|
11
|
+
#This library requires the sgml-parser library, written by Takahiro
|
12
|
+
#Maebashi. The easiest way to get it is to install the "htmltools"
|
13
|
+
#gem.
|
14
|
+
require 'rubygems'
|
15
|
+
require 'sgml-parser'
|
16
|
+
|
17
|
+
#UTF-8 voodoo--does this really work?
|
18
|
+
$KCODE = 'u'
|
19
|
+
require 'jcode'
|
20
|
+
|
21
|
+
#This code makes SGMLParser able to parse XML with namespaces.
|
22
|
+
#Reopens SGMLParser so that tag names may contain ':' (XML namespaces).
#The constant is only swapped out when the loaded parser already
#defines Tagfind; otherwise the class is left untouched.
class SGMLParser
  namespace_aware_tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/
  if const_defined?(:Tagfind)
    remove_const(:Tagfind)
    const_set(:Tagfind, namespace_aware_tagfind)
  end
end
|
28
|
+
|
29
|
+
#Behaviour shared by everything that can sit in a parse tree (tags and
#text nodes): links to the parent, to the neighbouring siblings and to
#the items parsed immediately before/after, plus finders that walk
#those links.
module PageElement

  attr_reader :parser
  attr_accessor :parent, :previous_parsed, :next_parsed, :previous_sibling
  attr_accessor :next_sibling

  #Records the parent and the previously parsed element and clears the
  #forward and sibling links. If the parent already has children, its
  #current last child becomes this element's previous sibling (and this
  #element becomes that child's next sibling). The element is NOT added
  #to the parent's contents here; callers do that separately.
  def setup(parent=nil, previous_parsed=nil)
    @parent = parent
    @previous_parsed = previous_parsed
    @next_parsed = nil
    @previous_sibling = nil
    @next_sibling = nil
    if @parent and not @parent.contents.empty?
      @previous_sibling = @parent.contents[-1]
      @previous_sibling.next_sibling = self
    end
  end

  #A bunch of different iterators over a parsed document. Each one
  #follows a single link repeatedly, yielding every element reached
  #(never the starting element itself) until the link is nil.
  {
    #Iterates in parse order over the rest of the items in this document.
    :next_parsed_items => :next_parsed,

    #Iterates in reverse parse order over all previously parsed items in
    #this document.
    :previous_parsed_items => :previous_parsed,

    #Iterates in parse order over all subsequent siblings of this item.
    :next_siblings => :next_sibling,

    #Iterates in reverse parse order over all prior siblings of this item.
    :previous_siblings => :previous_sibling,

    #Iterates upwards through the parentage of this item.
    :parents => :parent
  }.each do |iterator_name, link|
    class_eval %{
      def #{iterator_name}
        i = self
        while i
          i = i.#{link}
          yield i if i
        end
      end
    }
  end

  #Finder pairs built on the iterators above. The singular form returns
  #the first match (or nil); the plural form returns every match.
  #Both accept the same name/args/block criteria as fetch. The singular
  #form passes :limit => 1 on a copy of args, so the symbol key that
  #fetch reads is used and the caller's hash is not modified.
  [ #Returns first item/all items matching the given criteria and
    #appearing after this PageElement in the document.
    [:find_next, :find_all_next, 'next_parsed_items'],

    #Returns first item/all items matching the given criteria and
    #appearing before this PageElement in the document.
    [:find_previous, :find_all_previous, 'previous_parsed_items'],

    #Returns the nearest sibling/all siblings of this PageElement matching
    #the given criteria and appearing before this PageElement in
    #the document.
    [:find_previous_sibling, :find_previous_siblings, 'previous_siblings'],

    #Returns the nearest sibling/all siblings of this PageElement matching
    #the given criteria and appearing after this PageElement in
    #the document
    [:find_next_sibling, :find_next_siblings, 'next_siblings'],

    #Returns the nearest parent/all parents of this PageElement matching
    #the given criteria.
    [:find_parent, :find_parents, 'parents'],
  ].each do |singular, plural, method_name|
    class_eval %{
      def #{singular}(name=nil, args={}, &block)
        fetch(method('#{method_name}'), name, args.merge({:limit => 1}), block)[0]
      end

      def #{plural}(name=nil, args={}, &block)
        fetch(method('#{method_name}'), name, args, block)
      end
    }
  end

  protected

  #Returns a list of items matching the given criteria, obtained by
  #iterating over the given iterator.
  #
  #iterator:: a callable that yields candidate items to a block
  #name::     tag-name criterion (anything PageElement.matches accepts),
  #           or nil to accept any tag
  #args::     options hash read with symbol keys:
  #           :attrs (hash of attribute criteria; a non-hash value is
  #           treated as a criterion for the 'class' attribute),
  #           :limit (stop after this many matches),
  #           :text (criterion for text nodes; when given, tags never match)
  #block::    optional Proc; when present it alone decides whether an
  #           item matches
  def fetch(iterator, name, args, block)
    attrs = args[:attrs]
    limit = args[:limit]
    text = args[:text]

    attrs ||= {}
    attrs = {'class' => attrs} unless attrs.respond_to? :keys

    bucket = []
    catch(:stop_iteration) do
      iterator.call do |item|
        match = false
        if block
          match = true if block.call(item)
        elsif item.is_a? Tag
          #A tag matches if its name matches and its attributes line up.
          if not text and (not name or PageElement.matches(item, name))
            match = true
            attrs.each_pair do |attr, matchAgainst|
              unless PageElement.matches(item[attr], matchAgainst)
                match = false
                break
              end
            end
          end
        elsif text
          #A text matches if its string value matches the given text
          #criterion.
          match = PageElement.matches(item, text)
        end
        if match
          bucket.push(item)
          throw :stop_iteration if limit and bucket.length >= limit
        end
      end
    end
    return bucket
  end

  #Used to tell whether a Tag or a NavigableString "matches" some data
  #structure.
  #
  #chunk::        a Tag, a string, an Array of page elements, or any
  #               object (converted with to_s)
  #how_to_match:: a Proc (called with the chunk), a Regexp, an Array of
  #               accepted strings, a Hash whose non-nil entries are
  #               accepted, or anything else compared via to_s
  def PageElement.matches(chunk, how_to_match)
    # If given a list of items, return true if the list contains a
    # text element that matches.
    if chunk.is_a? Array
      chunk.each do |tag|
        return true if tag.is_a? NavigableString and matches(tag, how_to_match)
      end
      return false
    elsif how_to_match.is_a? Proc
      return how_to_match.call(chunk)
    elsif chunk.is_a? Tag
      #Custom match methods take the tag as an argument, but all other
      #ways of matching match the tag name as a string
      chunk = chunk.name
    end

    #From here on everything is compared as a string
    chunk = chunk.to_s unless chunk.is_a? String

    if how_to_match.is_a? Regexp
      return how_to_match.match(chunk) != nil
    elsif how_to_match.is_a? Array
      return how_to_match.find {|x| x == chunk} != nil
    elsif how_to_match.is_a? Hash
      return how_to_match[chunk] != nil
    else
      #It's just a string
      return how_to_match.to_s == chunk
    end
  end

end

#Everything a tag can do: hold a name, attributes and child elements,
#render itself, and search its descendants. Mixed into Tag and into the
#parser classes (whose instances act as the root of the tree).
module TagModule

  include Enumerable
  include PageElement

  attr_accessor :name, :contents, :attrs, :string

  #I tried to have Tag subclass Method, but it killed the
  #whole thing. Maybe I should just leave well enough alone.
  #
  #def arity
  # return methods('find_all').arity
  #end
  #
  #def call(*args)
  # return find_all(*args)
  #end
  #
  #def to_proc
  # return methods('find_all').to_proc
  #end

  #parser::   object answering self_closing_tag?(name); used when rendering
  #name::     tag name
  #attrs::    attribute hash (nil means no attributes)
  #parent::   enclosing element, if any
  #previous:: element parsed immediately before this one, if any
  def initialize(parser, name, attrs=nil, parent=nil, previous=nil)
    @hidden = false
    @parser = parser
    @name = name
    attrs ||= {}
    @attrs = attrs
    @contents = []
    setup(parent, previous)
  end

  #soup.title_tag or soup.title is the same as soup.find('title')
  #NOTE: every otherwise-undefined method name is treated as a tag name,
  #so a misspelled method call returns nil instead of raising.
  def method_missing(name, *args)
    name = name.to_s
    if name[-4...name.length] == '_tag'
      name = name[0...name.length-4]
    end
    return find(name, *args)
  end

  #TODO: is there a mixin for Hash?

  #Returns the value of attribute k, or nil if the tag has none.
  def [](k)
    return @attrs[k]
  end

  #Sets attribute k to v.
  def []=(k, v)
    @attrs[k] = v
  end

  #Removes attribute k and returns its former value.
  def delete(k)
    @attrs.delete(k)
  end

  #True if the tag has an attribute named k.
  def has_key?(k)
    return @attrs.has_key?(k)
  end

  #End things that would go away if there was a mixin for Hash.

  #Yields each direct child in order (this is what Enumerable builds on).
  def each
    @contents.each { |x| yield x }
  end

  #Number of direct children.
  def length
    return contents.length
  end
  alias size length

  #Asks the parser whether tags with this name are self-closing.
  def self_closing?
    return @parser.self_closing_tag?(@name)
  end

  #Adds the given tag to the contents of this tag
  def append(tag)
    @contents.push(tag)
  end

  def to_str
    return to_s
  end

  #Renders this tag and its contents as a pretty-printed string.
  def prettify
    return to_s(true)
  end

  def inspect
    to_s
  end

  #Renders this tag and its contents as a string. NOTE: since REXML
  #consumes whitespace, this method is not certain to reproduce the
  #whitespace present in the original string.
  #
  #show_structure_indent:: nil/false renders without added whitespace;
  #                        true starts pretty-printing at depth 0; an
  #                        integer is the depth passed down by the
  #                        enclosing tag.
  def to_s(show_structure_indent=nil)
    #Attributes whose value is nil/false are left out of the output.
    attrs = []
    @attrs.each { |k,v| attrs.push("#{k}=\"#{v}\"") if v }
    if self_closing?
      close = ' /'
      closeTag = nil
    else
      close = nil
      closeTag = "</#{name}>"
    end
    indent_increment = show_structure_indent==true ? 0 : show_structure_indent
    if show_structure_indent
      #A hidden element (the document root) adds no nesting level.
      indent_increment += 1 unless @hidden
    end
    contents = render_contents(indent_increment)
    space = "\n #{' ' * indent_increment}" if show_structure_indent
    if @hidden
      s = contents
    else
      s = []
      attribute_string = ''
      unless attrs.empty?
        attribute_string = ' ' + attrs.join(' ')
      end
      s.push(space) if show_structure_indent
      s.push("<#{@name}#{attribute_string}#{close}>")
      s.push(contents)
      s.push(space) if closeTag and show_structure_indent
      s.push(closeTag)
      s = s.join('')
    end
    return s
  end

  #Renders the contents of this tag as a string. Child tags are rendered
  #with the given indent; when pretty-printing, one trailing newline is
  #chomped from each rendered child.
  def render_contents(show_structure_indent=nil)
    s=[]
    @contents.each do |c|
      text = nil
      if c.is_a? Tag
        text = c.to_s(show_structure_indent)
      else
        text = c.to_s
      end
      if text
        if show_structure_indent
          text.chomp!
        end
        s.push(text)
      end
    end
    return s.join('')
  end

  #Yields every descendant of this tag in document order, using an
  #explicit stack of [tag, next child index] pairs instead of recursion.
  #A block may throw :stop_iteration to end the walk early.
  def recursive_children
    stack = [[self, 0]]
    catch(:stop_iteration) do
      until stack.empty?
        tag, start = stack.pop
        next unless tag.is_a? TagModule
        (start...tag.contents.length).each do |i|
          a = tag.contents[i]
          yield a
          if a.is_a? TagModule and not tag.contents.empty? and i < tag.contents.length
            #Remember where to resume in this tag, then descend into the child.
            stack.push([tag, i+1])
            stack.push([a, 0])
            break
          end
        end
      end
    end
  end

  #Iterates over the direct children of this Tag.
  def children
    catch(:stop_iteration) { @contents.each { |x| yield x } }
  end

  #Convenience method to retrieve the first piece of text matching the
  #given criteria. 'text' can be a string, a regular expression object,
  #a Proc that takes a string and returns whether or not the
  #string 'matches', etc. Always searches all descendants.
  def find_text(text=nil, &block)
    args = { :text => text, :limit => 1}
    fetch(method('recursive_children'), nil, args, block)[0]
  end

  #Convenience method to retrieve all pieces of text matching the
  #given criteria. 'text' can be a string, a regular expression object,
  #a callable that takes a string and returns whether or not the
  #string 'matches', etc.
  #Args: :limit :recursive
  def find_all_text(text=nil, args={}, &block)
    #fetch reads the symbol key :text; merge keeps the caller's hash intact.
    criteria = args.merge({:text => text})
    iterator = method(criteria[:recursive] == false ? 'children' : 'recursive_children')
    fetch(iterator, nil, criteria, block)
  end

  #Extracts a list of Tag objects that match the given criteria. You
  #can specify the name of the Tag and any attributes you want the Tag
  #to have.
  #
  #The value of a key-value pair in the 'attrs' map can be a string, a
  #list of strings, a regular expression object, or a Proc object that
  #takes a string and returns whether or not the string matches for
  #some custom definition of 'matches'. The same is true of the tag
  #name, except that a Proc object will be passed the Tag object instead
  #of just a string.
  #Args: :attrs :text :limit :recursive
  def find_all(name=nil, args={}, &block)
    iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
    fetch(iterator, name, args, block)
  end

  #Returns the first Tag or NavigableString object that matches the
  #given criteria, or nil. Takes much the same arguments as fetch; any
  #:limit supplied by the caller is overridden with 1.
  #args: :attrs :text :recursive
  def find(name=nil, args={}, &block)
    criteria = args.merge({:limit => 1})
    iterator = method(criteria[:recursive] == false ? 'children' : 'recursive_children')
    fetch(iterator, name, criteria, block)[0]
  end
end

#A markup element in the parse tree.
class Tag
  include TagModule
end

#A run of text in the parse tree: a String that also carries the
#PageElement navigation links.
class NavigableString < String
  include PageElement
end
|
421
|
+
|
422
|
+
#This class contains the basic parser and fetch code. It defines
|
423
|
+
#a parser that knows nothing about tag behavior except for the
|
424
|
+
#following:
|
425
|
+
#
|
426
|
+
#You can't close a tag without closing all the tags it encloses.
|
427
|
+
#That is, "<foo><bar></foo>" actually means
|
428
|
+
#"<foo><bar></bar></foo>".
|
429
|
+
#
|
430
|
+
#[Another possible explanation is "<foo><bar /></foo>", but since
|
431
|
+
# this class defines no self_closing_tags, it will never use that
|
432
|
+
# explanation.]
|
433
|
+
#
|
434
|
+
#This class is useful for parsing XML or made-up markup languages,
|
435
|
+
#or when BeautifulSoup makes an assumption counter to what you were
|
436
|
+
#expecting.
|
437
|
+
#Generic parser: feeds text through SGMLParser and builds a tree of Tag
#and NavigableString objects, with the parser instance itself acting as
#the (hidden) root element.
#
#NOTE: the tag maps below are class variables, so they are shared with
#every subclass; a subclass that calls replace on one of them changes
#it for this class as well.
class BeautifulStoneSoup < SGMLParser
  include TagModule

  #As a public service we will by default silently replace MS smart quotes
  #and similar characters with their HTML or ASCII equivalents.
  #Replacements are entities (or plain ASCII where there is none) so that
  #a substitution can never introduce a raw '<' or '>' into the markup.
  @@ms_chars = { "\x80" => '&euro;',
                 "\x81" => ' ',
                 "\x82" => '&sbquo;',
                 "\x83" => '&fnof;',
                 "\x84" => '&bdquo;',
                 "\x85" => '&hellip;',
                 "\x86" => '&dagger;',
                 "\x87" => '&Dagger;',
                 "\x88" => '&caret;',
                 "\x89" => '%',
                 "\x8A" => '&Scaron;',
                 "\x8B" => '&lt;',
                 "\x8C" => '&OElig;',
                 "\x8D" => '?',
                 "\x8E" => 'Z',
                 "\x8F" => '?',
                 "\x90" => '?',
                 "\x91" => '&lsquo;',
                 "\x92" => '&rsquo;',
                 "\x93" => '&ldquo;',
                 "\x94" => '&rdquo;',
                 "\x95" => '&bull;',
                 "\x96" => '&ndash;',
                 "\x97" => '&mdash;',
                 "\x98" => '&tilde;',
                 "\x99" => '&trade;',
                 "\x9a" => '&scaron;',
                 "\x9b" => '&gt;',
                 "\x9c" => '&oelig;',
                 "\x9d" => '?',
                 "\x9e" => 'z',
                 "\x9f" => '&Yuml;'}

  #[pattern, fix] pairs applied to incoming text before parsing. A String
  #fix is a gsub replacement; anything else is called with the match.
  #  1. <foo/>      => <foo></foo>
  #  2. <! DOCTYPE> => <!DOCTYPE> (whitespace after '<!' removed)
  #  3. bytes \x80-\x9f => looked up in @@ms_chars
  @@parser_massage = [[/<([^<>]*)\/>/, '<\1></\1>'],
                      [/<!\s+([^<>]*)>/, '<!\1>'],
                      [/([\x80-\x9f])/m, proc { |m| @@ms_chars[m]}]
                     ]

  @@rootTagName = '[document]'

  @@nestable_tags = {}
  @@reset_nesting_tags = {}
  @@quoteTags = {}
  @@self_closing_tags = {}

  attr_accessor :hidden

  #True if tags with the given name never have contents or an end tag.
  def self_closing_tag?(tag)
    @@self_closing_tags.has_key?(tag)
  end

  #text:: markup to parse (nil to feed text later)
  #Args:
  #  :avoid_parser_problems       true (default) applies @@parser_massage;
  #                               false disables massaging; an Enumerable
  #                               of [pattern, fix] pairs is used instead
  #                               of the defaults
  #  :initial_text_is_everything  unless false, the document is closed
  #                               (see done) right after feeding text
  def initialize(text, args={})
    super(self, @@rootTagName)
    @quote_stack = []
    @hidden = 1
    reset

    #Only an absent/nil option falls back to the default; an explicit
    #false must survive so that massaging can be switched off.
    massage = args[:avoid_parser_problems]
    massage = true if massage.nil?
    massage = @@parser_massage if massage and not massage.is_a? Enumerable
    @avoid_parser_problems = massage

    feed(text) if text != nil
    done if args[:initial_text_is_everything] != false
  end

  #Massages the text (when enabled) and hands it to SGMLParser#feed.
  #The massage runs on a copy, so the caller's string is not modified.
  def feed(text)
    if @avoid_parser_problems
      text = text.dup
      @avoid_parser_problems.each do |re, fix|
        if fix.is_a? String
          text.gsub!(re, fix)
        else
          text.gsub!(re) { |x| fix.call(x) }
        end
      end
    end
    super(text)
  end

  #Two soups (or a soup and anything else) are equal when they render
  #to the same string.
  def ==(anObject)
    return anObject.to_s == to_s
  end

  #Flushes pending text and closes every tag that is still open.
  def done
    end_text
    pop_tag while @currentTag.name != @@rootTagName
  end

  #Resets the underlying parser and the text/tag bookkeeping, then
  #pushes this object as the root of the tag stack.
  def reset
    super
    @currentText = []
    @currentTag = nil
    @tag_stack = []
    push_tag(self)
  end

  #Appends tag to the current tag (if there is one) and makes it current.
  def push_tag(tag)
    @currentTag.append(tag) if @currentTag
    @tag_stack.push(tag)
    @currentTag = @tag_stack[-1]
  end

  #Closes the current tag and returns the tag that is current afterwards.
  def pop_tag
    @tag_stack.pop

    # Tags with just one string-owning child get the child as a
    # 'string' property, so that soup.tag.string is shorthand for
    # soup.tag.contents[0]
    if @currentTag.contents.length == 1 and @currentTag.contents[0].is_a? NavigableString
      @currentTag.string = @currentTag.contents[0]
    end

    @currentTag = @tag_stack[-1] unless @tag_stack.empty?
    @currentTag
  end

  # StreamListener implementation

  #Called for each start tag. attrs is a list of [name, value] pairs;
  #one pair of surrounding double quotes is stripped from each value.
  def unknown_starttag(name, attrs)
    attrs = attrs.inject({}) do |m, pair|
      key, value = pair
      if value and value[0, 1] == '"' and value[-1, 1] == '"'
        value = value[1..-2]
      end
      m[key] = value
      m
    end
    unless @quote_stack.empty?
      #This is not a real tag: we are inside a quote tag, so the markup
      #is passed through as text.
      attr_string = ''
      attrs.each { |k,v| attr_string << " #{k}=\"#{v}\"" }
      handle_data("<#{name}#{attr_string}>")
      return
    end

    end_text
    self_closing = @@self_closing_tags.has_key?(name)
    smart_pop(name) unless self_closing
    tag = Tag.new(self, name, attrs, @currentTag, @previous_parsed)
    @previous_parsed.next_parsed = tag if @previous_parsed
    @previous_parsed = tag
    push_tag(tag)
    pop_tag if self_closing
    if @@quoteTags.has_key?(name)
      @quote_stack.push(name)
    end
  end

  #Called for each end tag.
  def unknown_endtag(name)
    #Ignore tag_end calls for self-closing tags; they were
    #closed in the tag_start call.
    #TODO: still necessary?
    return if @@self_closing_tags.has_key?(name)

    if not @quote_stack.empty? and @quote_stack[-1] != name
      #This is not a real end tag.
      handle_data("</#{name}>")
      return
    end
    end_text
    pop_to_tag(name)
    @quote_stack.pop if not @quote_stack.empty? and @quote_stack[-1] == name
  end

  #Buffers a piece of text until end_text turns the buffer into a node.
  def handle_data(data)
    @currentText.push(data)
  end

  #Propagate comments right through.
  def handle_comment(data)
    handle_data("<!--#{data}-->")
  end

  def handle_special(data)
    handle_data("<#{data}>")
  end

  #Numeric character references are kept verbatim: &#NNN;
  def unknown_charref(ref)
    handle_data("&##{ref};")
  end

  #Named entity references are kept verbatim: &name;
  def unknown_entityref(ref)
    handle_data("&#{ref};")
  end

  def attlistdecl(element_name, attributes, raw_content)
    handle_data("<!ATTLIST #{raw_content}>")
  end

  def cdata(content)
    handle_data("<![CDATA[#{content}]]>")
  end

  ###

  def doctype(*args)
    content = args.join(' ')
    ##{name} #{pub_sys}#{long_name}#{url}
    #long_name = ' "#{long_name}"' if long_name
    #url = ' "#{url}"' if url
    handle_data("<!DOCTYPE #{content}>")
  end

  def elementdecl(content)
    handle_data("<!ELEMENT #{content}>")
  end

  #Intentionally a no-op.
  def entity(content)

  end

  def entitydecl(content)
    handle_data("<!ENTITY #{content.join(' ')}>")
  end

  def instruction(name, instruction)
    handle_data("<?#{name} #{instruction}?>")
  end

  def notationdecl(content)
    handle_data("<!NOTATION #{content}>")
  end

  #Re-emits the XML declaration; encoding and standalone are included
  #only when given.
  def xmldecl(version, encoding, standalone)
    encoding = " encoding=\"#{encoding}\"" if encoding
    standalone = " standalone=\"#{standalone}\"" if standalone
    handle_data("<?xml version=\"#{version}\"#{encoding}#{standalone}?>")
  end

  #Called when we're done collecting some text, declarations, etc.
  #Turns the buffered text into a NavigableString under the current tag.
  #Whitespace-only text collapses to a single newline (if it contained
  #one) or a single space.
  def end_text
    currentText = @currentText.join('')
    unless currentText.empty?
      if currentText.strip.empty?
        if currentText =~ /\n/
          currentText = "\n"
        else
          currentText = ' '
        end
      end
      currentText = NavigableString.new(currentText)
      currentText.setup(@currentTag, @previous_parsed)
      @previous_parsed.next_parsed = currentText if @previous_parsed
      @previous_parsed = currentText
      @currentTag.contents.push(currentText)
    end
    @currentText = []
  end

  # Helper methods

  private

  #Pops the tag stack up to and including the most recent
  #instance of the given tag. If inclusive_pop is false, pops the tag
  #stack up to but *not* including the most recent instance of
  #the given tag. Returns the result of the last pop (nil if nothing
  #was popped).
  def pop_to_tag(name, inclusive_pop=true)
    return if name == @@rootTagName

    num_pops = 0
    (@tag_stack.length - 1).downto(0) do |i|
      if name == @tag_stack[i].name
        num_pops = @tag_stack.length - i
        break
      end
    end
    num_pops -= 1 unless inclusive_pop

    mostRecentTag = nil
    num_pops.times { mostRecentTag = pop_tag }
    mostRecentTag
  end

  #We need to pop up to the previous tag of this type, unless
  #one of this tag's nesting reset triggers comes between this
  #tag and the previous tag of this type, OR unless this tag is a
  #generic nesting trigger and another generic nesting trigger
  #comes between this tag and the previous tag of this type.
  #
  #Examples:
  # <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
  # <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
  # <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
  # <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
  #
  # <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
  # <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
  # <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
  def smart_pop(name)
    nesting_reset_triggers = @@nestable_tags[name]
    is_nestable = !nesting_reset_triggers.nil?
    is_reset_nesting = @@reset_nesting_tags.has_key?(name)
    popTo = nil
    inclusive = true
    #Walk the open tags from innermost to outermost.
    @tag_stack.reverse_each do |open_tag|
      if (open_tag.nil? or open_tag.name == name) and not is_nestable
        #Non-nestable tags get popped to the top or to their
        #last occurrence.
        popTo = name
        break
      end
      if (is_nestable and nesting_reset_triggers.include?(open_tag.name)) or
         (not is_nestable and is_reset_nesting and @@reset_nesting_tags.has_key?(open_tag.name))
        #If we encounter one of the nesting reset triggers
        #peculiar to this tag, or we encounter another tag
        #that causes nesting to reset, pop up to but not
        #including that tag.
        popTo = open_tag.name
        inclusive = false
        break
      end
    end
    pop_to_tag(popTo, inclusive) if popTo
  end

  protected

  #Turns a list of maps, lists, or scalars into a single map.
  #Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
  #of lists and partial maps.
  def BeautifulStoneSoup.build_tag_map(default, *args)
    built = args.inject({}) do |m, portion|
      if portion.is_a? Hash
        #It's a map. Merge it.
        portion.each_pair { |k,v| m[k] = v }
      elsif portion.is_a? Array
        #It's a list. Map each item to the default.
        portion.each { |k| m[k] = default }
      else
        #It's a scalar. Map it to the default.
        m[portion] = default
      end
      m
    end
  end
end

#This parser knows the following facts about HTML:
#
#* Some tags have no closing tag and should be interpreted as being
# closed as soon as they are encountered.
#
#* The text inside some tags (ie. 'script') may contain tags which
# are not really part of the document and which should be parsed
# as text, not tags. If you want to parse the text as tags, you can
# always fetch it and parse it explicitly.
#
#* Tag nesting rules:
#
# Most tags can't be nested at all. For instance, the occurrence of
# a <p> tag should implicitly close the previous <p> tag.
#
# <p>Para1<p>Para2
# should be transformed into:
# <p>Para1</p><p>Para2
#
# Some tags can be nested arbitrarily. For instance, the occurrence
# of a <blockquote> tag should _not_ implicitly close the previous
# <blockquote> tag.
#
# Alice said: <blockquote>Bob said: <blockquote>Blah
# should NOT be transformed into:
# Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
#
# Some tags can be nested, but the nesting is reset by the
# interposition of other tags. For instance, a <tr> tag should
# implicitly close the previous <tr> tag within the same <table>,
# but not close a <tr> tag in another table.
#
# <table><tr>Blah<tr>Blah
# should be transformed into:
# <table><tr>Blah</tr><tr>Blah
# but,
# <tr>Blah<table><tr>Blah
# should NOT be transformed into
# <tr>Blah<table></tr><tr>Blah
#
#Differing assumptions about tag nesting rules are a major source
#of problems with the BeautifulSoup class. If BeautifulSoup is not
#treating as nestable a tag your page author treats as nestable,
#try writing a subclass.
class BeautifulSoup < BeautifulStoneSoup

  @@self_closing_tags.replace(build_tag_map(nil, ['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame']))

  #Tags whose contents are passed through as text. The parser consults
  #@@quoteTags, so that is the map that has to be filled in;
  #@@quote_tags is kept as a second name for the same map.
  @@quoteTags.replace({'script' => nil})
  @@quote_tags = @@quoteTags

  #According to the HTML standard, each of these inline tags can
  #contain another tag of the same type. Furthermore, it's common
  #to actually use these tags this way.
  @@nestable_inline_tags = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center']

  #According to the HTML standard, these block tags can contain
  #another tag of the same type. Furthermore, it's common
  #to actually use these tags this way.
  @@nestable_block_tags = ['blockquote', 'div', 'fieldset', 'ins', 'del']

  #Lists can contain other lists, but there are restrictions.
  @@nestable_list_tags = { 'ol' => [],
                           'ul' => [],
                           'li' => ['ul', 'ol'],
                           'dl' => [],
                           'dd' => ['dl'],
                           'dt' => ['dl'] }

  #Tables can contain other tables, but there are restrictions.
  @@nestable_table_tags = {'table' => ['tr', 'td'],
                           'tr' => ['table'],
                           'td' => ['tr'],
                           'th' => ['tr'],
                          }

  @@non_nestable_block_tags = ['address', 'form', 'p', 'pre']

  #If one of these tags is encountered, all tags up to the next tag of
  #this type are popped.
  @@reset_nesting_tags.replace(build_tag_map(nil, @@nestable_block_tags, 'noscript', @@non_nestable_block_tags,
                                             @@nestable_list_tags, @@nestable_table_tags))

  @@nestable_tags.replace(build_tag_map([], @@nestable_inline_tags, @@nestable_block_tags, @@nestable_list_tags, @@nestable_table_tags))

end
|
882
|
+
|
883
|
+
# This class will push a tag with only a single string child into
|
884
|
+
# the tag's parent as an attribute. The attribute's name is the tag
|
885
|
+
# name, and the value is the string child. An example should give
|
886
|
+
# the flavor of the change:
|
887
|
+
#
|
888
|
+
# <foo><bar>baz</bar></foo>
|
889
|
+
# =>
|
890
|
+
# <foo bar="baz"><bar>baz</bar></foo>
|
891
|
+
#
|
892
|
+
# You can then access fooTag['bar'] instead of fooTag.barTag.string.
|
893
|
+
#
|
894
|
+
# This is, of course, useful for scraping structures that tend to
|
895
|
+
# use subelements instead of attributes, such as SOAP messages. Note
|
896
|
+
# that it modifies its input, so don't print the modified version
|
897
|
+
# out.
|
898
|
+
class BeautifulSOAP < BeautifulStoneSoup
  #Before closing a tag, copies its text into the enclosing tag as an
  #attribute named after the closing tag, provided the closing tag is a
  #Tag holding exactly one NavigableString and the enclosing tag has no
  #such attribute yet. Does nothing (and returns nil) when only the root
  #is left on the stack.
  def pop_tag
    return unless @tag_stack.size > 1

    closing = @tag_stack[-1]
    enclosing = @tag_stack[-2]
    holds_single_string = closing.is_a?(Tag) &&
                          closing.contents.size == 1 &&
                          closing.contents[0].is_a?(NavigableString)
    if holds_single_string && !enclosing[closing.name]
      enclosing[closing.name] = closing.contents[0]
    end
    super
  end
end
|
911
|
+
|
912
|
+
#Enterprise class names! It has come to our attention that some people
|
913
|
+
#think the names of the Rubyful Soup parser classes are too silly
|
914
|
+
#and "unprofessional" for use in enterprise screen-scraping. We feel
|
915
|
+
#your pain! For such-minded folk, the Rubyful Soup Consortium And
|
916
|
+
#Rootin' Tootin' Texas Delicatessen recommends renaming this file to
|
917
|
+
#"RobustParser.rb" (or, in cases of extreme enterprisitude,
|
918
|
+
#"RobustParserBeanInterface.class") and using the following
|
919
|
+
#enterprise-friendly class aliases:
|
920
|
+
#Each "alias" is really an empty subclass of the corresponding parser,
#so it inherits all behavior but is a distinct class.
class RobustXMLParser < BeautifulStoneSoup
end

class RobustHTMLParser < BeautifulSoup
end

class SimplifyingSOAPParser < BeautifulSOAP
end
|
923
|
+
|
924
|
+
#When this file is executed directly (rather than required), parse
#everything ARGF supplies and print the prettified document.
if $0 == __FILE__
  print BeautifulSoup.new(ARGF.read).prettify
end
|
925
|
+
|
@@ -0,0 +1,431 @@
|
|
1
|
+
#Unit tests for Rubyful Soup.
|
2
|
+
#
|
3
|
+
#These tests make sure the Rubyful Soup works as it should. If you
|
4
|
+
#find a bug in Rubyful Soup, the best way to express it is as a test
|
5
|
+
#case like this that fails.
|
6
|
+
|
7
|
+
require 'test/unit'
|
8
|
+
require 'rubygems'
|
9
|
+
require 'rubyful_soup'
|
10
|
+
|
11
|
+
class SoupTest < Test::Unit::TestCase

  #Parses +toParse+ with the parser class +c+ and asserts that the
  #result of to_s(false) equals +rep+. When +rep+ is omitted (nil),
  #the markup is expected to come back unchanged.
  def assert_soup_equals(toParse, rep=nil, c=BeautifulStoneSoup)
    wanted = rep.nil? ? toParse : rep
    rendered = c.new(toParse).to_s(false)
    assert_equal(rendered, wanted)
  end

  #Deliberately empty test; the original note says it exists to
  #"shut the compiler up" (presumably so this base class is not
  #reported as having no tests).
  def test_null
  end

end
|
27
|
+
|
28
|
+
#Tests the various ways of fetching tags from a soup.
|
29
|
+
class ToteThatTag < SoupTest

  #Builds the fixture shared by every test in this class: two <a>
  #tags, two <b> tags and one namespaced <abc:d> tag.
  def setup
    ml = %{
<a id="x">1</a>
<a id="a">2</a>
<b id="b">3</b>
<b id="x">4</b>
<abc:d width="100">5</abc:d>}
    @soup = BeautifulStoneSoup.new(ml)
  end

  #Fetching by tag name, including a namespaced name, and stepping
  #from one match to the next with find_next.
  def test_fetch_by_name
    matching = @soup.find_all('a')
    assert_equal(matching.length, 2)
    assert_equal(matching[0].name, 'a')
    assert_equal(matching[0], @soup.find('a'))
    assert_equal(@soup.find('abc:d').contents.length, 1)

    firstB = @soup.find('b')
    nextB = firstB.find_next('b')
    assert_equal(nextB.contents[0], '4')
    assert_equal(nextB['id'], 'x')
  end

  #A block passed to find_all/find_text acts as the match criterion.
  def test_fetch_by_block
    a = @soup.find_all('a')
    b = @soup.find_all do |x|
      x.is_a? Tag and x.name == 'a'
    end
    assert_equal(a,b)

    a = @soup.find_text('3')
    b = @soup.find_text do |x|
      x.is_a? NavigableString and x == '3'
    end
    assert_equal(a,b)

    #Tags whose name equals their id: <a id="a"> and <b id="b">.
    matching = @soup.find_all do |x|
      x.respond_to?('name') and x.name == x['id']
    end
    assert_equal(matching.length, 2)
    assert_equal(matching[0].name, 'a')
  end

  #Fetching by attribute value, including a nil value and a
  #non-string (Integer) value.
  def test_fetch_by_attribute
    matching = @soup.find_all(nil, :attrs=>{'id' => 'x'})
    assert_equal(matching.length, 2)
    assert_equal(matching[0].name, 'a')
    assert_equal(matching[1].name, 'b')

    assert_equal(@soup.find_all(nil, :attrs=>{'id' => nil}).length, 1)

    assert_equal(@soup.find_all(nil, :attrs=>{'width' => 100}).length, 1)
  end

  #soup.b and soup.b_tag are both shorthand for soup.find('b').
  def test_tag_name_as_method
    firstB = @soup.find('b')
    assert_equal(firstB, @soup.b)
    assert_equal(firstB, @soup.b_tag)
  end

  #An array of names matches any tag bearing one of those names.
  def test_fetch_by_list
    matching = @soup.find_all(['a', 'abc:d'])
    assert_equal(matching.length, 3)
  end

  #A hash of names matches any tag whose name is a key.
  def test_fetch_by_hash
    matching = @soup.find_all({'a' => true, 'b' => true})
    assert_equal(matching.length, 4)
  end

  #A regexp is matched against tag names: 'a', 'a' and 'abc:d'.
  def test_fetch_by_re
    r = /a.*/
    assert_equal(@soup.find_all(r).length, 3)
  end

  #A Proc object passed as the name acts as the match criterion.
  def test_fetch_by_method
    #The block must NOT use an explicit `return': inside Proc.new that
    #returns from this test method the first time the proc is called,
    #silently skipping every assertion below.
    matcher = Proc.new { |x| x.name == x['id'] }
    matching = @soup.find_all(matcher)
    assert_equal(matching.length, 2)
    assert_equal(matching[0].name, 'a')
  end

end
|
117
|
+
|
118
|
+
#Testing the integrity of the parse tree.
|
119
|
+
class FollowThatTag < SoupTest

  #Four <b> tags, none of them closed.
  @@PROXIMITY_TEST = BeautifulStoneSoup.new('<b id="1"><b id="2"><b id="3"><b id="4">')

  #Three closed blockquotes, each holding one child blockquote, plus
  #a fourth, unclosed blockquote.
  @@SIBLING_TEST = BeautifulStoneSoup.new('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')

  #find_parents filters ancestors by name and attributes; find_parent
  #returns a single matching ancestor.
  def test_parents
    markup = '<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah</b></ul></ul></ul>'
    bold = BeautifulSoup.new(markup).find('b')
    assert_equal(bold.find_parents('ul', :attrs=>{'id' => 'foo'}).length, 2)
    assert_equal(bold.find_parent('ul')['a'], 'b')
  end

  #Forward navigation from blockquote 2: find_next follows parse
  #order, find_next_sibling(s) stay on the same level.
  def test_next_sibling
    name = 'blockquote'
    start = @@SIBLING_TEST.find(name, :attrs=>{'id' => 2})
    assert_equal(start.find_next(name)['id'], '2.1')
    assert_equal(start.find_next_sibling(name)['id'], '3')
    assert_equal(start.find_next_sibling(name)['id'], '3')
    assert_equal(start.find_next_siblings(name).length, 2)
    assert_equal(start.find_next_siblings(name, :attrs=>{'id' => 4}).length, 1)
  end

  #Backward navigation from blockquote 3, mirroring test_next_sibling.
  def test_previous_sibling
    name = 'blockquote'
    start = @@SIBLING_TEST.find(name, :attrs=>{'id' => 3})
    assert_equal(start.find_previous(name)['id'], '2.1')
    assert_equal(start.find_previous_sibling(name)['id'], '2')
    assert_equal(start.find_previous_sibling(name)['id'], '2')
    assert_equal(start.find_previous_siblings(name).length, 2)
    assert_equal(start.find_previous_siblings(name, :attrs=>{'id' => 1}).length, 1)
  end

  #The same navigation methods work when starting from a text node.
  def test_text_navigation
    markup = 'Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh'
    baz = BeautifulSoup.new(markup).find_text('Baz')
    assert_equal(baz.find_parent("i")['id'], '1')
    assert_equal(baz.find_next(nil, :text=> 'Blee'), 'Blee')
    assert_equal(baz.find_next_sibling(nil, :text=>'Blee'), 'Blee')
    assert_equal(baz.find_next_sibling(nil, :text=>'Blargh'), nil)
    assert_equal(baz.find_next_sibling('hr')['id'], '1')
  end

end
|
165
|
+
|
166
|
+
#Tests the next_sibling and previous_sibling navigation.
|
167
|
+
class SiblingRivalry < SoupTest

  #Walks next_sibling/previous_sibling links between tags and the
  #text nodes that sit beside them.
  def test_siblings
    soup = BeautifulSoup.new("<ul><li>1<p>A</p>B</li><li>2</li><li>3</li></ul>")

    second_item = soup.find('li').next_sibling
    assert_equal(second_item.name, 'li')
    assert_equal(second_item.string, '2')

    assert_equal(soup.find_text('1').next_sibling.name, 'p')

    after_para = soup.find('p').next_sibling
    assert_equal(after_para, 'B')

    #Stepping back and then forward again must land on the same node.
    round_trip = soup.find('p').next_sibling.previous_sibling.next_sibling
    assert_equal(round_trip, 'B')
  end
end
|
180
|
+
|
181
|
+
#Tests the various built-in functions of Tag objects.
|
182
|
+
class TagsAreObjectsToo < SoupTest

  #A <top> tag with three children: text, a <b> tag, text.
  @@SOUP = BeautifulSoup.new('<top id="1">1<b>2</b>3</top>')

  #length counts the tag's children.
  def test_length
    assert_equal(@@SOUP.top.length, 3)
  end

  #Indexing a tag with a string looks up an attribute.
  def test_hash_lookup
    assert_equal(@@SOUP.top['id'], "1")
  end

  #each yields the tag's children in order.
  def test_iterator
    seen = []
    @@SOUP.top.each { |child| seen << child }
    assert_equal(seen.length, 3)
    assert_equal(seen[2], "3")
  end

end
|
204
|
+
|
205
|
+
#Tests the use of 'string' as an alias for a tag's only content.
|
206
|
+
class StringEmUp < SoupTest

  #A tag whose only child is text exposes that text as +string+.
  def test_string
    soup = BeautifulSoup.new('<b>foo</b>')
    assert_equal(soup.b.string, 'foo')
  end

  #A tag with several children has no +string+.
  def test_lack_of_string
    soup = BeautifulSoup.new("<b>f<i>e</i>o</b>")
    assert_equal(soup.b.string, nil)
  end
end
|
218
|
+
|
219
|
+
#Tests the limit argument.
|
220
|
+
class ThatsMyLimit < SoupTest

  #Without :limit every match is returned; with it, at most that many.
  def test_basic_limits
    markup = '<br id="1" /><br id="1" /><br id="1" /><br id="1" />'
    soup = BeautifulSoup.new(markup)
    assert_equal(soup.find_all('br').length, 4)
    assert_equal(soup.find_all('br', :limit=> 2).length, 2)
  end
end
|
228
|
+
|
229
|
+
#Testing the modification of the tree.
|
230
|
+
class WriteOnlyCode < SoupTest

  #Replacing an entry of +contents+ changes the rendered markup.
  def test_replace_contents
    soup = BeautifulSoup.new('<a>foo</a>')
    replacement = NavigableString.new('bar')
    soup.a.contents[0] = replacement
    assert_equal(soup.render_contents, '<a>bar</a>')
  end

  #Attributes can be overwritten, cleared by assigning nil, added,
  #and removed with delete.
  def test_modify_attributes
    soup = BeautifulSoup.new('<a id="1"></a>')
    anchor = soup.find('a')

    anchor['id'] = 2
    assert_equal(soup.render_contents, '<a id="2"></a>')
    anchor['id'] = nil
    assert_equal(soup.render_contents, '<a></a>')

    anchor['id2'] = 'foo'
    assert_equal(soup.render_contents, '<a id2="foo"></a>')
    anchor.delete('id2')
    assert_equal(soup.render_contents, '<a></a>')
  end

  #Makes sure tags don't step on each others' toes: setting an
  #attribute on one new Tag must not show up on another.
  def test_new_tag_
    soup = BeautifulSoup.new('')
    anchor = Tag.new(soup, 'a')
    list = Tag.new(soup, 'ol')
    anchor["href"] = "http://foo.com/"
    assert_equal(list["href"], nil)
  end
end
|
262
|
+
|
263
|
+
#Our operators do it all! Call now!
|
264
|
+
class OperatorOverload < SoupTest

  #Referencing a tag name as a method delegates to find, both in the
  #plain (.b) and the suffixed (.b_tag) spelling.
  def test_tag_name_as_find
    markup = '<b id="1">foo<i>bar</i></b><b>Red herring</b>'
    soup = BeautifulSoup.new(markup)

    assert_equal(soup.b.i, soup.find('b').find('i'))
    assert_equal(soup.b.i.string, 'bar')
    assert_equal(soup.b['id'], '1')
    assert_equal(soup.b.contents[0], 'foo')
    #A tag name that is absent from the document yields nil.
    assert(soup.a == nil)

    #The .foo_tag variant of .foo.
    assert_equal(soup.b_tag.i_tag.string, 'bar')
    assert_equal(soup.b.i_tag.string, 'bar')
    assert_equal(soup.find('b').find('i'), soup.b_tag.i_tag)
  end
end
|
281
|
+
|
282
|
+
#Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!
|
283
|
+
class NestableEgg < SoupTest

  #A <p> inside a <blockquote> stays nested there; a later unclosed
  #<p> ends up as a direct child of the document.
  def test_para_inside_blockquote
    markup = '<blockquote><p><b>Foo</b></p></blockquote><p>Bar'
    soup = BeautifulSoup.new(markup)
    assert_equal(soup.blockquote.p.b.string, 'Foo')
    assert_equal(soup.blockquote.b.string, 'Foo')
    assert_equal(soup.find('p', :recursive=>false).string, 'Bar')
  end

  #A table placed inside a <td> of another table is kept nested.
  def test_nested_tables
    markup = %{<table id="1"><tr><td>Here's another table:
<table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>}
    soup = BeautifulSoup.new(markup)
    assert_equal(soup.table.table.td.string, 'Juicy text')
    assert_equal(soup.find_all('table').length, 2)
    assert_equal(soup.table.find_all('table').length, 1)

    #Inner table -> td -> tr -> outer table.
    inner = soup.find('table', :attrs=>{'id' => 2})
    assert_equal(inner.parent.parent.parent.name, 'table')
  end

  #A table placed directly inside a <tr> is still reachable through
  #table.tr.table.tr.
  def test_bad_nested_tables
    markup = "<table><tr><table><tr id='nested'></tr></table></tr></table>"
    soup = BeautifulSoup.new(markup)
    assert_equal(soup.table.tr.table.tr['id'], 'nested')
  end
end
|
308
|
+
|
309
|
+
|
310
|
+
#Here we test cleanup of text that breaks an unaltered parser or is just
|
311
|
+
#obnoxious.
|
312
|
+
class CleanupOnAisleFour < SoupTest

  #A self-closing tag written as <br/> is rendered as <br />.
  def test_self_closing_tag
    line_break = BeautifulStoneSoup.new("Foo<br/>Bar").find('br')
    assert_equal(line_break.to_s, '<br />')
    assert_soup_equals('<p>test1<br/>test2</p>',
                       '<p>test1<br />test2</p>')
  end

  #Only checks that mismatched closing tags parse without raising.
  def test_bad_closing_tags
    BeautifulStoneSoup.new("<a>Foo<b>Bar</a>")
  end

  #Only checks that a closing tag with no opener parses without raising.
  def test_premature_closing_tag
    BeautifulStoneSoup.new("</b><a>Foo<b>Bar</a>")
  end

  #A malformed doctype is passed through unchanged.
  def test_bad_doctype
    assert_soup_equals("<!DOCTYPE foo='bar'>")
  end

  #Whitespace right after "<!" is dropped.
  def test_whitespace_in_declaration
    assert_soup_equals('<! DOCTYPE>', '<!DOCTYPE>')
  end

  def test_JunkInDeclaration
    assert_soup_equals('<! Foo = -8>a', '<!Foo = -8>a')
  end

  #An unterminated declaration is kept as text and the following
  #tag is still parsed (and closed).
  def test_incomplete_declaration
    assert_soup_equals('a<!b <p>c', 'a<!b <p>c</p>')
  end

  def test_valid_but_bogus_declaration
    assert_soup_equals('<! Foo >a', '<!Foo >a')
  end

  #This fails for a totally bogus reason! I can't figure it out.
  #def test_smart_quotes_not_so_smart_anymore_FAILS
  # assert_soup_equals("\x91Foo\x92", '‘Foo’')
  #end

  #def test_incomplete_declaration_at_endFAILS
  # assert_soup_equals('a<!b')
  #end

end
|
359
|
+
|
360
|
+
#Verifies that the parser treats multiple feed calls the same as one
|
361
|
+
#big feed call only if constructed with
|
362
|
+
#:initial_text_is_everything => false.
|
363
|
+
class KeepOnParsing < SoupTest

  #Feeding a document in two pieces equals parsing it in one go only
  #when the parser was built with :initial_text_is_everything => false.
  def test_multiple_parse_calls
    head = '<foo>bah<bar>'
    tail = 'blee</bar></foo>'

    whole = BeautifulSoup.new(head + tail)

    default_soup = BeautifulSoup.new(head)
    default_soup.feed(tail)

    open_soup = BeautifulSoup.new(head, :initial_text_is_everything => false)
    open_soup.feed(tail)

    assert_not_equal(whole, default_soup)
    assert_equal(whole, open_soup)
  end
end
|
378
|
+
|
379
|
+
#Verifies that BeautifulSOAP parser works.
|
380
|
+
class SOAPMeUp < SoupTest

  #A child tag holding a single string is mirrored onto its parent as
  #an attribute.
  def test_basic_soap
    markup = "<foo><bar>baz</bar></foo>"
    parsed = BeautifulSOAP.new(markup)
    assert_equal(parsed.to_s, %{<foo bar="baz"><bar>baz</bar></foo>})
  end

  #An attribute the parent already has is left alone.
  def test_dont_overwrite_existing_attr
    markup = %{<foo bar="don't kill me!"><bar>baz</bar></foo>}
    parsed = BeautifulSOAP.new(markup)
    assert_equal(parsed.to_s, markup)
  end
end
|
393
|
+
|
394
|
+
#The Unicode test suite has not yet been ported because I haven't
|
395
|
+
#figured out how Ruby does Unicode.
|
396
|
+
|
397
|
+
# class UnicodeRed < SoupTest
|
398
|
+
# "Makes sure Unicode works."
|
399
|
+
|
400
|
+
# def setUp
|
401
|
+
# text = 'foo<b>bar</b>'
|
402
|
+
# @soup = BeautifulStoneSoup
|
403
|
+
# @soup.feed(text)
|
404
|
+
|
405
|
+
# def test_BasicUnicode
|
406
|
+
# import types
|
407
|
+
# sType = types.StringType
|
408
|
+
# uType = types.UnicodeType
|
409
|
+
|
410
|
+
# u = u'\3100'
|
411
|
+
# #It starts out ASCII...
|
412
|
+
# assert_equal(type(@soup.renderContents), sType)
|
413
|
+
# assert_equal(type(@soup.prettify), sType)
|
414
|
+
# #But you can have unicode if you want.
|
415
|
+
# assert_equal(type(unicode(@soup)), uType)
|
416
|
+
|
417
|
+
# #Add a Unicode character and it's Unicode.
|
418
|
+
# @soup.feed(u)
|
419
|
+
# assert_equal(type(@soup.renderContents), uType)
|
420
|
+
# assert_equal(type(@soup.prettify), uType)
|
421
|
+
# #But you can have ASCII if you want.
|
422
|
+
# assert_equal(type(str(@soup)), sType)
|
423
|
+
|
424
|
+
# #The part without any Unicode is still ASCII.
|
425
|
+
# assert_equal(type(@soup.b.prettify), sType)
|
426
|
+
|
427
|
+
# #But if you add a Unicode character it'll become Unicode.
|
428
|
+
# @soup.b['foo'] = u'\3100'
|
429
|
+
# assert_equal(type(@soup.b.prettify), uType)
|
430
|
+
|
431
|
+
|
metadata
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.4
|
3
|
+
specification_version: 1
|
4
|
+
name: rubyful_soup
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 1.0.1
|
7
|
+
date: 2005-10-21
|
8
|
+
summary: An HTML/XML parser that handles bad markup and provides tree traversal methods.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: leonardr@segfault.org
|
12
|
+
homepage: http://www.crummy.com/software/RubyfulSoup/
|
13
|
+
rubyforge_project:
|
14
|
+
description: "Rubyful Soup is a *ML parser that makes screen-scraping easy. It won't choke on
|
15
|
+
bad markup, and it's easy to locate the part of a document you want."
|
16
|
+
autorequire:
|
17
|
+
default_executable:
|
18
|
+
bindir: bin
|
19
|
+
has_rdoc: true
|
20
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
21
|
+
requirements:
|
22
|
+
-
|
23
|
+
- ">"
|
24
|
+
- !ruby/object:Gem::Version
|
25
|
+
version: 0.0.0
|
26
|
+
version:
|
27
|
+
platform: ruby
|
28
|
+
authors:
|
29
|
+
- Leonard Richardson
|
30
|
+
files:
|
31
|
+
- lib/rubyful_soup.rb
|
32
|
+
- tests/rubyful_soup_tests.rb
|
33
|
+
- CHANGELOG
|
34
|
+
test_files:
|
35
|
+
- tests/rubyful_soup_tests.rb
|
36
|
+
rdoc_options: []
|
37
|
+
extra_rdoc_files:
|
38
|
+
- CHANGELOG
|
39
|
+
executables: []
|
40
|
+
extensions: []
|
41
|
+
requirements: []
|
42
|
+
dependencies:
|
43
|
+
- !ruby/object:Gem::Dependency
|
44
|
+
name: htmltools
|
45
|
+
version_requirement:
|
46
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
47
|
+
requirements:
|
48
|
+
-
|
49
|
+
- ">"
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: 0.0.0
|
52
|
+
version:
|