rubyful_soup 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -0
- data/lib/rubyful_soup.rb +925 -0
- data/tests/rubyful_soup_tests.rb +431 -0
- metadata +52 -0
data/CHANGELOG
ADDED
data/lib/rubyful_soup.rb
ADDED
@@ -0,0 +1,925 @@
|
|
1
|
+
#Rubyful Soup
|
2
|
+
#Elixir and Tonic
|
3
|
+
#"The Screen-Scraper's Friend"
|
4
|
+
#v1.0.1
|
5
|
+
#http://www.crummy.com/software/RubyfulSoup/
|
6
|
+
#
|
7
|
+
#Rubyful Soup is a port to the Ruby language and idiom of the Python
|
8
|
+
#library Beautiful Soup.
|
9
|
+
#See http://www.crummy.com/software/BeautifulSoup/ for details on the original.
|
10
|
+
|
11
|
+
#This library requires the sgml-parser library, written by Takahiro
|
12
|
+
#Maebashi. The easiest way to get it is to install the "htmltools"
|
13
|
+
#gem.
|
14
|
+
require 'rubygems'
|
15
|
+
require 'sgml-parser'
|
16
|
+
|
17
|
+
#UTF-8 voodoo--does this really work?
|
18
|
+
$KCODE = 'u'
|
19
|
+
require 'jcode'
|
20
|
+
|
21
|
+
#This code makes SGMLParser able to parse XML with namespaces.
|
22
|
+
#Reopens SGMLParser so that tag names may contain ':' (and '-', '_', '.'),
#which lets the parser cope with namespaced XML. The pattern is only
#swapped in when the parser already defines a Tagfind constant.
class SGMLParser
  if const_defined?(:Tagfind)
    remove_const(:Tagfind)
    const_set(:Tagfind, /[a-zA-Z][-_.:a-zA-Z0-9]*/)
  end
end
|
28
|
+
|
29
|
+
module PageElement
|
30
|
+
|
31
|
+
attr_reader :parser
|
32
|
+
attr_accessor :parent, :previous_parsed, :next_parsed, :previous_sibling
|
33
|
+
attr_accessor :next_sibling
|
34
|
+
|
35
|
+
#Wires this element into the tree: records its parent and the element
#parsed just before it, clears the forward links, and (when the parent
#already has children) links itself as the next sibling of the parent's
#current last child.
def setup(parent=nil, previous_parsed=nil)
  @parent, @previous_parsed = parent, previous_parsed
  @next_parsed = @previous_sibling = @next_sibling = nil
  return unless @parent and not @parent.contents.empty?

  @previous_sibling = @parent.contents[-1]
  @previous_sibling.next_sibling = self
end
|
46
|
+
|
47
|
+
#A bunch of different iterators over a parsed document.
|
48
|
+
#Each pair is [iterator method to define, link accessor it follows].
#The generated method yields every element reached by repeatedly
#following the link, starting after (never including) the receiver.
[
  #Parse order, everything after this item in the document.
  [:next_parsed_items, :next_parsed],

  #Reverse parse order, everything parsed before this item.
  [:previous_parsed_items, :previous_parsed],

  #Parse order, all subsequent siblings of this item.
  [:next_siblings, :next_sibling],

  #Reverse parse order, all prior siblings of this item.
  [:previous_siblings, :previous_sibling],

  #Upwards through the ancestors of this item.
  [:parents, :parent]
].each do |iterator_name, link|
  class_eval <<-RUBY
    def #{iterator_name}
      node = self
      while (node = node.#{link})
        yield node
      end
    end
  RUBY
end
|
75
|
+
|
76
|
+
#Generates a "first match" and an "all matches" finder for each of the
#iterators above. Every finder takes an optional tag name, an options
#hash (:attrs, :text, :limit) and/or a block used as the match predicate.
[ #Returns first item/all items matching the given criteria and
  #appearing after this PageElement in the document.
  [:find_next, :find_all_next, 'next_parsed_items'],

  #Returns first item/all items matching the given criteria and
  #appearing before this PageElement in the document.
  [:find_previous, :find_all_previous, 'previous_parsed_items'],

  #Returns the nearest sibling/all siblings of this PageElement matching
  #the given criteria and appearing before this PageElement in
  #the document.
  [:find_previous_sibling, :find_previous_siblings, 'previous_siblings'],

  #Returns the nearest sibling/all siblings of this PageElement matching
  #the given criteria and appearing after this PageElement in
  #the document
  [:find_next_sibling, :find_next_siblings, 'next_siblings'],

  #Returns the nearest parent/all parents of this PageElement matching
  #the given criteria.
  [:find_parent, :find_parents, 'parents'],
].each do |singular, plural, method_name|
  class_eval %{
    def #{singular}(name=nil, args={}, &block)
      #fetch reads the limit from the :limit symbol key; merge also keeps
      #the caller's hash untouched.
      fetch(method('#{method_name}'), name, args.merge(:limit => 1), block)[0]
    end

    def #{plural}(name=nil, args={}, &block)
      fetch(method('#{method_name}'), name, args, block)
    end
  }
end
|
109
|
+
|
110
|
+
protected
|
111
|
+
|
112
|
+
#Returns a list of items matching the given criteria, obtained by
|
113
|
+
#iterating over the given iterator.
|
114
|
+
#Collects the items produced by +iterator+ that satisfy the criteria.
#
#iterator:: a callable that yields candidate items to the block it is given
#name::     tag-name criterion (anything PageElement.matches accepts), or nil
#args::     options hash; reads :attrs, :limit and :text
#block::    optional predicate; when present it alone decides each match
#
#A non-hash :attrs value is shorthand for {'class' => value}. Iteration
#stops early once :limit matches have been gathered. Returns an Array.
def fetch(iterator, name, args, block)
  limit = args[:limit]
  text = args[:text]
  attrs = args[:attrs] || {}
  attrs = {'class' => attrs} unless attrs.respond_to? :keys

  results = []
  catch(:stop_iteration) do
    iterator.call do |item|
      matched =
        if block
          block.call(item)
        elsif item.is_a? Tag
          #A tag matches if its name matches and its attributes line up.
          !text && (!name || PageElement.matches(item, name)) &&
            attrs.all? { |attr, wanted| PageElement.matches(item[attr], wanted) }
        elsif text
          #A text node matches if its string value matches the criterion.
          PageElement.matches(item, text)
        end
      next unless matched

      results.push(item)
      throw :stop_iteration if limit and results.length >= limit
    end
  end
  results
end
|
156
|
+
|
157
|
+
#Used to tell whether a Tag or a NavigableString "matches" some data
|
158
|
+
#structure.
|
159
|
+
#Decides whether +chunk+ (a Tag, a string, or an Array of page elements)
#satisfies +how_to_match+.
#
#* An Array chunk matches when any NavigableString in it matches.
#* A Proc criterion is called with the chunk itself (so it sees the Tag).
#* Otherwise a Tag is reduced to its name and anything else to a string,
#  which is then tested against a Regexp, an Array of candidates, a Hash
#  (non-nil value for that key), or compared to the criterion's to_s.
def PageElement.matches(chunk, how_to_match)
  if chunk.is_a? Array
    return chunk.any? do |tag|
      tag.is_a? NavigableString and matches(tag, how_to_match)
    end
  end
  return how_to_match.call(chunk) if how_to_match.is_a? Proc

  #Custom match methods take the tag as an argument, but all other
  #ways of matching match the tag name as a string
  chunk = chunk.name if chunk.is_a? Tag
  chunk = chunk.to_s unless chunk.is_a? String

  case how_to_match
  when Regexp
    !how_to_match.match(chunk).nil?
  when Array
    how_to_match.any? { |candidate| candidate == chunk }
  when Hash
    !how_to_match[chunk].nil?
  else
    #It's just a string
    how_to_match.to_s == chunk
  end
end
|
192
|
+
|
193
|
+
end
|
194
|
+
|
195
|
+
module TagModule
|
196
|
+
|
197
|
+
include Enumerable
|
198
|
+
include PageElement
|
199
|
+
|
200
|
+
attr_accessor :name, :contents, :attrs, :string
|
201
|
+
|
202
|
+
#I tried to have Tag subclass Method, but it killed the
|
203
|
+
#whole thing. Maybe I should just leave well enough alone.
|
204
|
+
#
|
205
|
+
#def arity
|
206
|
+
# return methods('find_all').arity
|
207
|
+
#end
|
208
|
+
#
|
209
|
+
#def call(*args)
|
210
|
+
# return find_all(*args)
|
211
|
+
#end
|
212
|
+
#
|
213
|
+
#def to_proc
|
214
|
+
# return methods('find_all').to_proc
|
215
|
+
#end
|
216
|
+
|
217
|
+
#Builds an empty, visible tag.
#
#parser::   the parser that owns this tag (consulted by self_closing?)
#name::     the tag name
#attrs::    attribute hash; nil means "no attributes"
#parent::   enclosing tag, if any
#previous:: the element parsed immediately before this one, if any
def initialize(parser, name, attrs=nil, parent=nil, previous=nil)
  @parser, @name = parser, name
  @hidden = false
  @attrs = attrs || {}
  @contents = []
  setup(parent, previous)
end
|
226
|
+
|
227
|
+
#soup.title_tag or soup.title is the same as soup.find('title')
|
228
|
+
def method_missing(name, *args)
|
229
|
+
#puts "Missing method #{name}"
|
230
|
+
name = name.to_s
|
231
|
+
if name[-4...name.length] == '_tag'
|
232
|
+
name = name[0...name.length-4]
|
233
|
+
end
|
234
|
+
return find(name, *args)
|
235
|
+
end
|
236
|
+
|
237
|
+
#TODO: is there a mixin for Hash?
|
238
|
+
#Hash-style access to this tag's attributes.

#Returns the value of attribute +k+, or nil when it is not set.
def [](k)
  @attrs[k]
end

#Sets attribute +k+ to +v+.
def []=(k, v)
  @attrs[k] = v
end

#Removes attribute +k+ and returns its former value (nil if absent).
def delete(k)
  @attrs.delete(k)
end

#True when attribute +k+ is present. (The previous body called the
#non-existent Hash#has_key and always raised NoMethodError.)
def has_key?(k)
  @attrs.has_key?(k)
end
|
253
|
+
|
254
|
+
#End things that would go away if there was a mixin for Hash.
|
255
|
+
|
256
|
+
#Yields each direct child in order (this is what Enumerable builds on).
def each
  @contents.each do |child|
    yield child
  end
end

#Number of direct children.
def length
  contents.length
end
alias size length

#Asks the owning parser whether this tag's name is self-closing.
def self_closing?
  @parser.self_closing_tag?(@name)
end

#Adds the given tag to the contents of this tag
def append(tag)
  @contents << tag
end
|
273
|
+
|
274
|
+
#Implicit string conversion: same markup as to_s.
def to_str
  to_s
end

#Renders this tag and its contents as a pretty-printed string.
def prettify
  to_s(true)
end

#Shows the rendered markup rather than Ruby's default object dump.
def inspect
  to_s
end
|
286
|
+
|
287
|
+
#Renders this tag and its contents as a string. NOTE: since REXML
|
288
|
+
#consumes whitespace, this method is not certain to reproduce the
|
289
|
+
#whitespace present in the original string.
|
290
|
+
#Renders this tag and everything inside it as markup.
#
#show_structure_indent:: nil/false for compact output; true to start
#                        pretty-printing at depth 0; an Integer to
#                        pretty-print at that depth.
#
#Attributes whose value is nil/false are omitted. A hidden tag
#contributes only its rendered contents, with no markup of its own.
def to_s(show_structure_indent=nil)
  rendered_attrs = []
  @attrs.each { |key, value| rendered_attrs << "#{key}=\"#{value}\"" if value }

  void = self_closing?
  slash = void ? ' /' : nil
  closing = void ? nil : "</#{name}>"

  #A visible tag pushes its children one level deeper.
  depth = (show_structure_indent == true) ? 0 : show_structure_indent
  depth += 1 if show_structure_indent and not @hidden

  inner = render_contents(depth)
  pad = show_structure_indent ? "\n #{' ' * depth}" : nil
  return inner if @hidden

  attr_text = rendered_attrs.empty? ? '' : ' ' + rendered_attrs.join(' ')
  pieces = []
  pieces << pad if show_structure_indent
  pieces << "<#{@name}#{attr_text}#{slash}>"
  pieces << inner
  pieces << pad if closing and show_structure_indent
  pieces << closing
  pieces.join('')
end
|
323
|
+
|
324
|
+
#Renders the contents of this tag as a string.
|
325
|
+
#Renders the children of this tag, concatenated, as a string.
#Tags are rendered with the given indent setting; anything else via its
#own to_s. When indenting, one trailing newline is chomped from each
#rendered piece.
def render_contents(show_structure_indent=nil)
  pieces = []
  @contents.each do |child|
    rendered = child.is_a?(Tag) ? child.to_s(show_structure_indent) : child.to_s
    next unless rendered
    rendered.chomp! if show_structure_indent
    pieces << rendered
  end
  pieces.join('')
end
|
343
|
+
|
344
|
+
#Yields every descendant of this tag in document (depth-first,
#pre-order) order, using an explicit stack rather than recursion. Each
#stack entry is [tag, index of the next child to visit]. The caller's
#block may throw :stop_iteration to end the walk early.
def recursive_children
  pending = [[self, 0]]
  catch(:stop_iteration) do
    until pending.empty?
      node, offset = pending.pop
      next unless node.is_a? TagModule

      (offset...node.contents.length).each do |idx|
        child = node.contents[idx]
        yield child
        if child.is_a? TagModule and not node.contents.empty? and idx < node.contents.length
          #Descend into the child now; resume this node afterwards.
          pending.push([node, idx + 1])
          pending.push([child, 0])
          break
        end
      end
    end
  end
end
|
361
|
+
|
362
|
+
#Iterates over the direct children of this Tag.
|
363
|
+
#Yields each direct child of this tag; the block may throw
#:stop_iteration to finish early.
def children
  catch(:stop_iteration) do
    @contents.each { |child| yield child }
  end
end
|
366
|
+
|
367
|
+
#Convenience method to retrieve the first piece of text matching the
|
368
|
+
#given criteria. 'text' can be a string, a regular expression object,
|
369
|
+
#a Proc that takes a string and returns whether or not the
|
370
|
+
#string 'matches', etc.
|
371
|
+
#Returns the first piece of text matching +text+ (a string, Regexp,
#Proc, ...) or the first item accepted by the block, or nil. No options
#are accepted here, so the search always covers all descendants.
def find_text(text=nil, &block)
  criteria = { :text => text, :limit => 1 }
  fetch(method('recursive_children'), nil, criteria, block)[0]
end
|
376
|
+
|
377
|
+
#Convenience method to retrieve all pieces of text matching the
|
378
|
+
#given criteria. 'text' can be a string, a regular expression object,
|
379
|
+
#a callable that takes a string and returns whether or not the
|
380
|
+
#string 'matches', etc.
|
381
|
+
#Args: :limit
|
382
|
+
#Returns every piece of text matching +text+ (a string, Regexp, Proc,
#...), or every item accepted by the block.
#Args: :limit, :recursive (false restricts the search to direct children)
#
#The criterion is passed under the :text symbol key, which is the key
#fetch reads; the string key 'text' used previously was ignored, so the
#filter never applied. The caller's hash is left unmodified.
def find_all_text(text=nil, args={}, &block)
  criteria = args.merge(:text => text)
  iterator = method(criteria[:recursive] == false ? 'children' : 'recursive_children')
  fetch(iterator, nil, criteria, block)
end
|
387
|
+
|
388
|
+
#Extracts a list of Tag objects that match the given criteria. You
|
389
|
+
#can specify the name of the Tag and any attributes you want the Tag
|
390
|
+
#to have.
|
391
|
+
#
|
392
|
+
#The value of a key-value pair in the 'attrs' map can be a string, a
|
393
|
+
#list of strings, a regular expression object, or a Proc object that
|
394
|
+
#takes a string and returns whether or not the string matches for
|
395
|
+
#some custom definition of 'matches'. The same is true of the tag
|
396
|
+
#name, except that a Proc object will be passed the Tag object instead
|
397
|
+
#of just a string.
|
398
|
+
#Args: :attrs :text :limit :recursive
|
399
|
+
#Returns all Tag (or text) objects below this tag that match the
#criteria; see fetch for how name, args and the block are interpreted.
#Args: :attrs :text :limit :recursive
def find_all(name=nil, args={}, &block)
  walker = args[:recursive] == false ? 'children' : 'recursive_children'
  fetch(method(walker), name, args, block)
end
|
403
|
+
|
404
|
+
#Returns the first Tag or NavigableString object that matches the
|
405
|
+
#given criteria. Takes much the same arguments as fetch.
|
406
|
+
#args: :attrs :text :limit :recursive
|
407
|
+
#Returns the first Tag or NavigableString that matches the criteria, or
#nil. Takes the same arguments as find_all; :limit is forced to 1 (and
#is written into the args hash that was passed in).
#args: :attrs :text :limit :recursive
def find(name=nil, args={}, &block)
  args[:limit] = 1
  walker = args[:recursive] == false ? 'children' : 'recursive_children'
  fetch(method(walker), name, args, block).first
end
|
412
|
+
end
|
413
|
+
|
414
|
+
#An ordinary markup element; all of its behaviour comes from TagModule.
class Tag
  include TagModule
end
|
417
|
+
|
418
|
+
#A run of text in the document: a String that also carries the
#PageElement navigation links and finders.
class NavigableString < String
  include PageElement
end
|
421
|
+
|
422
|
+
#This class contains the basic parser and fetch code. It defines
|
423
|
+
#a parser that knows nothing about tag behavior except for the
|
424
|
+
#following:
|
425
|
+
#
|
426
|
+
#You can't close a tag without closing all the tags it encloses.
|
427
|
+
#That is, "<foo><bar></foo>" actually means
|
428
|
+
#"<foo><bar></bar></foo>".
|
429
|
+
#
|
430
|
+
#[Another possible explanation is "<foo><bar /></foo>", but since
|
431
|
+
# this class defines no self_closing_tags, it will never use that
|
432
|
+
# explanation.]
|
433
|
+
#
|
434
|
+
#This class is useful for parsing XML or made-up markup languages,
|
435
|
+
#or when BeautifulSoup makes an assumption counter to what you were
|
436
|
+
#expecting.
|
437
|
+
class BeautifulStoneSoup < SGMLParser
|
438
|
+
include TagModule
|
439
|
+
|
440
|
+
#As a public service we will by default silently replace MS smart quotes
|
441
|
+
#and similar characters with their HTML or ASCII equivalents.
|
442
|
+
#Replacement text for each Windows-1252 byte in the 0x80-0x9f range.
#Every key must be a double-quoted literal so that "\xNN" denotes the
#single byte; the first key used to be single-quoted ('\x80', four
#literal characters), so byte 0x80 was never found in the table.
@@ms_chars = { "\x80" => '€',
               "\x81" => ' ',
               "\x82" => '‚',
               "\x83" => 'ƒ',
               "\x84" => '„',
               "\x85" => '…',
               "\x86" => '†',
               "\x87" => '‡',
               "\x88" => '⁁',
               "\x89" => '%',
               "\x8A" => 'Š',
               "\x8B" => '<',
               "\x8C" => 'Œ',
               "\x8D" => '?',
               "\x8E" => 'Z',
               "\x8F" => '?',
               "\x90" => '?',
               "\x91" => '‘',
               "\x92" => '’',
               "\x93" => '“',
               "\x94" => '”',
               "\x95" => '•',
               "\x96" => '–',
               "\x97" => '—',
               "\x98" => '˜',
               "\x99" => '™',
               "\x9a" => 'š',
               "\x9b" => '>',
               "\x9c" => 'œ',
               "\x9d" => '?',
               "\x9e" => 'z',
               "\x9f" => 'Ÿ'}

#[pattern, fix] pairs applied to incoming markup by feed. A String fix
#is a gsub replacement template; anything else is called with the
#matched text and must return the replacement.
@@parser_massage = [[/<([^<>]*)\/>/, '<\1></\1>'],
                    [/<!\s+([^<>]*)>/, '<!\1>'],
                    [/([\x80-\x9f])/m, proc { |m| @@ms_chars[m]}]
                   ]

#Name given to the parser object itself, which acts as the root tag.
@@rootTagName = '[document]'

#Tag behaviour tables. They are empty here (this parser assumes nothing
#about any tag); subclasses fill them in.
@@nestable_tags = {}
@@reset_nesting_tags = {}
@@quoteTags = {}
@@self_closing_tags = {}
|
486
|
+
|
487
|
+
attr_accessor :hidden
|
488
|
+
|
489
|
+
#True when +tag+ (a tag name) is listed in the self-closing tag table.
def self_closing_tag?(tag)
  @@self_closing_tags.key?(tag)
end
|
492
|
+
|
493
|
+
#Args: :initial_text_is_everything, :avoid_parser_problems
|
494
|
+
#Creates a parser that is also the hidden root tag of the document.
#
#text:: markup to parse, or nil to feed it later
#Args:
#  :avoid_parser_problems::      true/omitted/nil applies the default
#                                massage rules; false disables massaging;
#                                an Enumerable of [pattern, fix] pairs
#                                replaces the defaults.
#  :initial_text_is_everything:: pass false to keep the document open for
#                                further feed calls; otherwise it is
#                                closed immediately.
def initialize(text, args={})
  super(self, @@rootTagName)
  @quote_stack = []
  @hidden = 1
  reset

  #"|| true" could never yield false, so massaging could not be switched
  #off. Only an absent/nil option falls back to the default now.
  massage = args[:avoid_parser_problems]
  massage = true if massage.nil?
  if massage and not massage.is_a? Enumerable
    massage = @@parser_massage
  end
  @avoid_parser_problems = massage

  feed(text) if text != nil
  done if args[:initial_text_is_everything] != false
end
|
507
|
+
|
508
|
+
#Feeds more markup to the parser, first applying the configured massage
#rules (if any). Each rule is a [pattern, fix] pair: a String fix is used
#as a gsub replacement template, anything else is called with the
#matched text and returns the replacement.
#
#The rules are applied to a private copy, so the caller's string is
#never modified and frozen strings are accepted.
def feed(text)
  if @avoid_parser_problems
    text = text.dup
    @avoid_parser_problems.each do |re, fix|
      if fix.is_a? String
        text.gsub!(re, fix)
      else
        text.gsub!(re) { |x| fix.call(x) }
      end
    end
  end
  super(text)
end
|
524
|
+
|
525
|
+
#Two soups (or a soup and anything else) are equal when their string
#renderings are equal.
def ==(anObject)
  anObject.to_s == to_s
end
|
528
|
+
|
529
|
+
#Finishes the document: flushes any pending text, then closes every tag
#that is still open, leaving only the root on the stack.
def done
  end_text
  until @currentTag.name == @@rootTagName
    pop_tag
  end
end
|
533
|
+
|
534
|
+
#Resets the underlying parser and this object's own parse state, then
#installs the parser itself as the bottom (root) entry of the tag stack.
def reset
  super
  @tag_stack = []
  @currentTag = nil
  @currentText = []
  push_tag(self)
end
|
541
|
+
|
542
|
+
#Opens +tag+: appends it to the currently open tag (if there is one)
#and makes it the new current tag on top of the stack.
def push_tag(tag)
  @currentTag.append(tag) if @currentTag
  @tag_stack << tag
  @currentTag = tag
end
|
548
|
+
|
549
|
+
#Closes the current tag and returns the tag that is current afterwards.
#
#A tag whose only child is a NavigableString gets that child as its
#'string' property, so soup.tag.string is shorthand for
#soup.tag.contents[0].
def pop_tag
  @tag_stack.pop

  children = @currentTag.contents
  if children.length == 1 and children[0].is_a? NavigableString
    @currentTag.string = children[0]
  end

  @currentTag = @tag_stack.last unless @tag_stack.empty?
  @currentTag
end
|
563
|
+
|
564
|
+
# StreamListener implementation
|
565
|
+
|
566
|
+
#Parser callback for a start tag.
#
#name::  the tag name
#attrs:: list of [attribute, value] pairs; a value still wrapped in
#        double quotes has them stripped
#
#While a quote tag is open the start tag is not real markup: it is
#re-emitted as text. Otherwise pending text is flushed, implicit closes
#are performed (smart_pop), and a new Tag is created, linked into parse
#order and pushed; self-closing tags are popped again straight away.
def unknown_starttag(name, attrs)
  attrs = attrs.inject({}) do |map, pair|
    key, value = pair
    value = value[1..-2] if value[0] == ?" and value[-1] == ?"
    map[key] = value
    map
  end

  unless @quote_stack.empty?
    #This is not a real tag. Double-quoted strings are required for the
    #interpolation; the single-quoted originals emitted placeholder text.
    attr_text = attrs.map { |k, v| " #{k}=\"#{v}\"" }.join('')
    handle_data("<#{name}#{attr_text}>")
    return
  end

  end_text
  self_closing = @@self_closing_tags.has_key?(name)
  smart_pop(name) unless self_closing
  tag = Tag.new(self, name, attrs, @currentTag, @previous_parsed)
  @previous_parsed.next_parsed = tag if @previous_parsed
  @previous_parsed = tag
  push_tag(tag)
  pop_tag if self_closing
  @quote_stack.push(name) if @@quoteTags.has_key?(name)
end
|
598
|
+
|
599
|
+
#Parser callback for an end tag.
#
#End tags of self-closing tags are ignored (those tags were closed when
#they were opened). Inside a quote tag, any end tag other than the one
#that closes the quote is re-emitted as text. Otherwise pending text is
#flushed and the stack is popped down to the matching open tag.
def unknown_endtag(name)
  return if @@self_closing_tags.has_key?(name)

  if not @quote_stack.empty? and @quote_stack[-1] != name
    #This is not a real end tag. Double quotes are needed here: the
    #single-quoted original emitted the text '</#{name}>' verbatim.
    handle_data("</#{name}>")
    return
  end
  end_text
  pop_to_tag(name)
  @quote_stack.pop if not @quote_stack.empty? and @quote_stack[-1] == name
end
|
616
|
+
|
617
|
+
#Buffers a chunk of text; end_text later turns the buffer into a
#NavigableString.
def handle_data(data)
  @currentText << data
end
|
620
|
+
|
621
|
+
#Propagate comments right through.
|
622
|
+
#Propagate comments right through: the comment body is re-wrapped in
#its delimiters and buffered as text. (The body used to interpolate an
#undefined name, +comment+, instead of the +data+ parameter.)
def handle_comment(data)
  handle_data("<!--#{data}-->")
end
|
625
|
+
|
626
|
+
#Passes a special construct through as text, wrapped in angle brackets.
def handle_special(data)
  markup = "<#{data}>"
  handle_data(markup)
end
|
629
|
+
|
630
|
+
#Passes an unrecognised numeric character reference through as text.
#
#The first '#' is literal and the second begins the interpolation:
#written as "&#{ref};" Ruby treats the '#' as part of the interpolation
#marker and the output loses it ("&8217;" instead of "&#8217;").
#NOTE(review): assumes the parser hands over the reference without its
#leading '#' -- confirm against sgml-parser.
def unknown_charref(ref)
  handle_data("&##{ref};")
end
|
633
|
+
|
634
|
+
#Passes an unrecognised entity reference through as text, restoring the
#'&' and ';' delimiters. (The body used to interpolate an undefined
#name, +content+, after a '%' and emitted no terminator.)
def unknown_entityref(ref)
  handle_data("&#{ref};")
end
|
637
|
+
|
638
|
+
#Passes an ATTLIST declaration through as text. Only the raw content is
#used; the element name and attribute list are ignored.
def attlistdecl(element_name, attributes, raw_content)
  declaration = "<!ATTLIST #{raw_content}>"
  handle_data(declaration)
end
|
641
|
+
|
642
|
+
#Passes a CDATA section through as text. The terminator is the full
#"]]>" sequence; the closing '>' used to be missing.
def cdata(content)
  handle_data("<![CDATA[#{content}]]>")
end
|
645
|
+
|
646
|
+
###
|
647
|
+
|
648
|
+
#Declaration pass-throughs: each re-emits its declaration as text so it
#survives in the parsed document.

#DOCTYPE: the pieces are joined with single spaces.
def doctype(*args)
  handle_data("<!DOCTYPE #{args.join(' ')}>")
end

#ELEMENT declaration.
def elementdecl(content)
  declaration = "<!ELEMENT #{content}>"
  handle_data(declaration)
end

#Entity callback; deliberately does nothing.
def entity(content)
end

#ENTITY declaration; +content+ is a list whose items are joined with
#single spaces.
def entitydecl(content)
  declaration = "<!ENTITY #{content.join(' ')}>"
  handle_data(declaration)
end

#Processing instruction.
def instruction(name, instruction)
  markup = "<?#{name} #{instruction}>"
  handle_data(markup)
end

#NOTATION declaration.
def notationdecl(content)
  declaration = "<!NOTATION #{content}>"
  handle_data(declaration)
end
|
675
|
+
|
676
|
+
#Passes an XML declaration through as text.
#
#version::    value for the version attribute
#encoding::   encoding name, or nil to omit the attribute
#standalone:: appended verbatim after the encoding
#
#Double-quoted strings are required for the interpolation; the
#single-quoted originals emitted the placeholder text literally.
#NOTE(review): the declaration is closed with ">" as before; whether
#+standalone+ is expected to carry the trailing "?" is not visible here.
def xmldecl(version, encoding, standalone)
  encoding = " encoding=\"#{encoding}\"" if encoding
  handle_data("<?xml version=\"#{version}\"#{encoding}#{standalone}>")
end
|
680
|
+
|
681
|
+
#Called when we're done collecting some text, declarations, etc.
|
682
|
+
#Called when we're done collecting some text, declarations, etc.
#
#Joins the buffered chunks; if anything was collected it becomes a
#NavigableString that is linked into parse order and appended to the
#current tag. A whitespace-only run collapses to a single newline (if
#it contained one) or a single space. The buffer is always cleared.
def end_text
  pending = @currentText.join('')
  unless pending.empty?
    pending = (pending =~ /\n/ ? "\n" : ' ') if pending.strip.empty?

    node = NavigableString.new(pending)
    node.setup(@currentTag, @previous_parsed)
    @previous_parsed.next_parsed = node if @previous_parsed
    @previous_parsed = node
    @currentTag.contents.push(node)
  end
  @currentText = []
end
|
701
|
+
|
702
|
+
# Helper methods
|
703
|
+
|
704
|
+
private
|
705
|
+
|
706
|
+
#Pops the tag stack up to and including the most recent
|
707
|
+
#instance of the given tag. If inclusivePop is false, pops the tag
|
708
|
+
#stack up to but *not* including the most recent instance of
|
709
|
+
#the given tag.
|
710
|
+
#Pops the tag stack up to and including the most recent open tag named
#+name+. With inclusive_pop false, that tag itself is left open. Nothing
#is popped for the root tag name or when no such tag is open. Returns
#the tag that is current after the last pop, or nil if nothing popped.
def pop_to_tag(name, inclusive_pop=true)
  return if name == @@rootTagName

  #Depth of the match counted from the top of the stack (1 = top).
  depth = 0
  @tag_stack.reverse.each_with_index do |open_tag, offset|
    if open_tag.name == name
      depth = offset + 1
      break
    end
  end
  depth -= 1 unless inclusive_pop

  most_recent = nil
  depth.times { most_recent = pop_tag }
  most_recent
end
|
729
|
+
|
730
|
+
#We need to pop up to the previous tag of this type, unless
|
731
|
+
#one of this tag's nesting reset triggers comes between this
|
732
|
+
#tag and the previous tag of this type, OR unless this tag is a
|
733
|
+
#generic nesting trigger and another generic nesting trigger
|
734
|
+
#comes between this tag and the previous tag of this type.
|
735
|
+
#
|
736
|
+
#Examples:
|
737
|
+
# <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
|
738
|
+
# <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
|
739
|
+
# <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
|
740
|
+
# <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
|
741
|
+
#
|
742
|
+
# <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
|
743
|
+
# <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
|
744
|
+
# <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
|
745
|
+
#Performs the implicit closes required before opening a tag called
#+name+, by walking the open tags from innermost to outermost:
#
#* A non-nestable tag closes its most recent open occurrence.
#* Reaching one of this tag's nesting reset triggers (or, for a tag with
#  no triggers of its own that is itself a reset-nesting tag, reaching
#  any other reset-nesting tag) pops up to but not including it.
#
#Nothing is popped when neither condition is met.
def smart_pop(name)
  nesting_reset_triggers = @@nestable_tags[name]
  is_nestable = !nesting_reset_triggers.nil?
  is_reset_nesting = @@reset_nesting_tags.has_key?(name)
  pop_to = nil
  inclusive = true

  @tag_stack.reverse.each do |open_tag|
    #nil? rather than "== nil": the root entry is the parser itself,
    #whose == renders the whole document to a string on every call.
    if (open_tag.nil? or open_tag.name == name) and not is_nestable
      #Non-nestable tags get popped to the top or to their
      #last occurrence.
      pop_to = name
      break
    end
    if (is_nestable and nesting_reset_triggers.include?(open_tag.name)) or
       (!is_nestable and is_reset_nesting and @@reset_nesting_tags.has_key?(open_tag.name))
      #If we encounter one of the nesting reset triggers
      #peculiar to this tag, or we encounter another tag
      #that causes nesting to reset, pop up to but not
      #including that tag.
      pop_to = open_tag.name
      inclusive = false
      break
    end
  end
  pop_to_tag(pop_to, inclusive) if pop_to
end
|
774
|
+
|
775
|
+
protected
|
776
|
+
|
777
|
+
#Turns a list of maps, lists, or scalars into a single map.
|
778
|
+
#Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
|
779
|
+
#of lists and partial maps.
|
780
|
+
#Flattens a mix of maps, lists and scalars into one map.
#
#default:: value given to every key that comes from a list or a scalar
#args::    Hashes are merged in as they are; Array items and bare
#          scalars become keys mapped to +default+
#
#Later arguments overwrite earlier ones. Returns the new Hash.
def BeautifulStoneSoup.build_tag_map(default, *args)
  built = {}
  args.each do |portion|
    case portion
    when Hash
      built.update(portion)
    when Array
      portion.each { |key| built[key] = default }
    else
      built[portion] = default
    end
  end
  built
end
|
795
|
+
end
|
796
|
+
|
797
|
+
#This parser knows the following facts about HTML:
|
798
|
+
#
|
799
|
+
#* Some tags have no closing tag and should be interpreted as being
|
800
|
+
# closed as soon as they are encountered.
|
801
|
+
#
|
802
|
+
#* The text inside some tags (ie. 'script') may contain tags which
|
803
|
+
# are not really part of the document and which should be parsed
|
804
|
+
# as text, not tags. If you want to parse the text as tags, you can
|
805
|
+
# always fetch it and parse it explicitly.
|
806
|
+
#
|
807
|
+
#* Tag nesting rules:
|
808
|
+
#
|
809
|
+
# Most tags can't be nested at all. For instance, the occurrence of
|
810
|
+
# a <p> tag should implicitly close the previous <p> tag.
|
811
|
+
#
|
812
|
+
# <p>Para1<p>Para2
|
813
|
+
# should be transformed into:
|
814
|
+
# <p>Para1</p><p>Para2
|
815
|
+
#
|
816
|
+
# Some tags can be nested arbitrarily. For instance, the occurrence
|
817
|
+
# of a <blockquote> tag should _not_ implicitly close the previous
|
818
|
+
# <blockquote> tag.
|
819
|
+
#
|
820
|
+
# Alice said: <blockquote>Bob said: <blockquote>Blah
|
821
|
+
# should NOT be transformed into:
|
822
|
+
# Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
|
823
|
+
#
|
824
|
+
# Some tags can be nested, but the nesting is reset by the
|
825
|
+
# interposition of other tags. For instance, a <tr> tag should
|
826
|
+
# implicitly close the previous <tr> tag within the same <table>,
|
827
|
+
# but not close a <tr> tag in another table.
|
828
|
+
#
|
829
|
+
# <table><tr>Blah<tr>Blah
|
830
|
+
# should be transformed into:
|
831
|
+
# <table><tr>Blah</tr><tr>Blah
|
832
|
+
# but,
|
833
|
+
# <tr>Blah<table><tr>Blah
|
834
|
+
# should NOT be transformed into
|
835
|
+
# <tr>Blah<table></tr><tr>Blah
|
836
|
+
#
|
837
|
+
#Differing assumptions about tag nesting rules are a major source
|
838
|
+
#of problems with the BeautifulSoup class. If BeautifulSoup is not
|
839
|
+
#treating as nestable a tag your page author treats as nestable,
|
840
|
+
#try writing a subclass.
|
841
|
+
class BeautifulSoup < BeautifulStoneSoup

  #Tags that never have a closing tag and are closed as soon as they
  #are opened.
  @@self_closing_tags.replace(build_tag_map(nil, ['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame']))

  #Tags whose contents are kept as text rather than parsed as markup.
  #This fills the @@quoteTags table that the parser consults; assigning a
  #separate @@quote_tags variable created a table nothing ever read.
  @@quoteTags.replace('script' => nil)

  #According to the HTML standard, each of these inline tags can
  #contain another tag of the same type. Furthermore, it's common
  #to actually use these tags this way.
  @@nestable_inline_tags = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center']

  #According to the HTML standard, these block tags can contain
  #another tag of the same type. Furthermore, it's common
  #to actually use these tags this way.
  @@nestable_block_tags = ['blockquote', 'div', 'fieldset', 'ins', 'del']

  #Lists can contain other lists, but there are restrictions.
  @@nestable_list_tags = { 'ol' => [],
                           'ul' => [],
                           'li' => ['ul', 'ol'],
                           'dl' => [],
                           'dd' => ['dl'],
                           'dt' => ['dl'] }

  #Tables can contain other tables, but there are restrictions.
  @@nestable_table_tags = {'table' => ['tr', 'td'],
                           'tr' => ['table'],
                           'td' => ['tr'],
                           'th' => ['tr'],
                          }

  @@non_nestable_block_tags = ['address', 'form', 'p', 'pre']

  #If one of these tags is encountered, all tags up to the next tag of
  #this type are popped.
  @@reset_nesting_tags.replace(build_tag_map(nil, @@nestable_block_tags, 'noscript', @@non_nestable_block_tags,
                                             @@nestable_list_tags, @@nestable_table_tags))

  @@nestable_tags.replace(build_tag_map([], @@nestable_inline_tags, @@nestable_block_tags, @@nestable_list_tags, @@nestable_table_tags))

end
|
882
|
+
|
883
|
+
# This class will push a tag with only a single string child into
|
884
|
+
# the tag's parent as an attribute. The attribute's name is the tag
|
885
|
+
# name, and the value is the string child. An example should give
|
886
|
+
# the flavor of the change:
|
887
|
+
#
|
888
|
+
# <foo><bar>baz</bar></foo>
|
889
|
+
# =>
|
890
|
+
# <foo bar="baz"><bar>baz</bar></foo>
|
891
|
+
#
|
892
|
+
# You can then access fooTag['bar'] instead of fooTag.barTag.string.
|
893
|
+
#
|
894
|
+
# This is, of course, useful for scraping structures that tend to
|
895
|
+
# use subelements instead of attributes, such as SOAP messages. Note
|
896
|
+
# that it modifies its input, so don't print the modified version
|
897
|
+
# out.
|
898
|
+
class BeautifulSOAP < BeautifulStoneSoup
  # Before the innermost open tag is popped, copy its text up to the
  # entry beneath it on the tag stack as an attribute named after the
  # popped tag. The copy only happens when the top entry is a Tag
  # whose sole child is a NavigableString and the entry beneath it
  # has no truthy value under that attribute name yet.
  #
  # Returns nil without popping anything unless the stack holds more
  # than one entry; otherwise returns whatever the inherited pop_tag
  # returns.
  def pop_tag
    return nil unless @tag_stack.size > 1

    closing = @tag_stack[-1]
    enclosing = @tag_stack[-2]
    lone_string = closing.is_a?(Tag) &&
                  closing.contents.size == 1 &&
                  closing.contents[0].is_a?(NavigableString)
    if lone_string && !enclosing[closing.name]
      enclosing[closing.name] = closing.contents[0]
    end
    super
  end
end
|
911
|
+
|
912
|
+
#Enterprise class names! It has come to our attention that some people
|
913
|
+
#think the names of the Rubyful Soup parser classes are too silly
|
914
|
+
#and "unprofessional" for use in enterprise screen-scraping. We feel
|
915
|
+
#your pain! For such-minded folk, the Rubyful Soup Consortium And
|
916
|
+
#Rootin' Tootin' Texas Delicatessen recommends renaming this file to
|
917
|
+
#"RobustParser.rb" (or, in cases of extreme enterprisitude,
|
918
|
+
#"RobustParserBeanInterface.class") and using the following
|
919
|
+
#enterprise-friendly class aliases:
|
920
|
+
#Each enterprise-friendly name is an empty subclass of the parser it
#stands in for, so it inherits that parser's behaviour unchanged.
class RobustXMLParser < BeautifulStoneSoup
end

class RobustHTMLParser < BeautifulSoup
end

class SimplifyingSOAPParser < BeautifulSOAP
end
|
923
|
+
|
924
|
+
#When this file is executed directly (rather than required), parse
#everything read from ARGF and print the prettified document.
if $0 == __FILE__
  print BeautifulSoup.new(ARGF.read).prettify
end
|
925
|
+
|
@@ -0,0 +1,431 @@
|
|
1
|
+
#Unit tests for Rubyful Soup.
|
2
|
+
#
|
3
|
+
#These tests make sure the Rubyful Soup works as it should. If you
|
4
|
+
#find a bug in Rubyful Soup, the best way to express it is as a test
|
5
|
+
#case like this that fails.
|
6
|
+
|
7
|
+
require 'test/unit'
|
8
|
+
require 'rubygems'
|
9
|
+
require 'rubyful_soup'
|
10
|
+
|
11
|
+
class SoupTest < Test::Unit::TestCase

  #Parses +toParse+ with the parser class +c+ and asserts that the
  #resulting document renders, via to_s(false), as +rep+. When +rep+
  #is nil the document is expected to render as the very text it was
  #parsed from.
  def assert_soup_equals(toParse, rep=nil, c=BeautifulStoneSoup)
    rep = toParse if rep.nil?
    #Test::Unit's assert_equal takes the expected value first; passing
    #it in that order keeps failure messages correctly labelled.
    assert_equal(rep, c.new(toParse).to_s(false))
  end

  #Null test to shut the compiler up.
  def test_null
  end

end
|
27
|
+
|
28
|
+
#Tests the various ways of fetching tags from a soup.
|
29
|
+
class ToteThatTag < SoupTest

  #Builds the small five-tag document that every test in this class
  #searches.
  def setup
    #The leading empty element makes the document start with a
    #newline, and each tag sits on its own line.
    ml = ['',
          '<a id="x">1</a>',
          '<a id="a">2</a>',
          '<b id="b">3</b>',
          '<b id="x">4</b>',
          '<abc:d width="100">5</abc:d>'].join("\n")
    @soup = BeautifulStoneSoup.new(ml)
  end

  #Lookup by tag name, including a namespaced name, and find_next.
  def test_fetch_by_name
    matching = @soup.find_all('a')
    assert_equal(2, matching.length)
    assert_equal('a', matching[0].name)
    assert_equal(@soup.find('a'), matching[0])
    assert_equal(1, @soup.find('abc:d').contents.length)

    first_b = @soup.find('b')
    next_b = first_b.find_next('b')
    assert_equal('4', next_b.contents[0])
    assert_equal('x', next_b['id'])
  end

  #Lookup driven by a block instead of a name or text argument.
  def test_fetch_by_block
    by_name = @soup.find_all('a')
    by_block = @soup.find_all do |x|
      x.is_a?(Tag) && x.name == 'a'
    end
    assert_equal(by_name, by_block)

    by_text = @soup.find_text('3')
    by_text_block = @soup.find_text do |x|
      x.is_a?(NavigableString) && x == '3'
    end
    assert_equal(by_text, by_text_block)

    #Only elements that have a name can be compared against their id.
    matching = @soup.find_all do |x|
      x.respond_to?('name') && x.name == x['id']
    end
    assert_equal(2, matching.length)
    assert_equal('a', matching[0].name)
  end

  #Lookup by attribute value, including nil and non-string values.
  def test_fetch_by_attribute
    matching = @soup.find_all(nil, :attrs => {'id' => 'x'})
    assert_equal(2, matching.length)
    assert_equal('a', matching[0].name)
    assert_equal('b', matching[1].name)

    #This assertion appeared twice in a row; one copy is enough.
    assert_equal(1, @soup.find_all(nil, :attrs => {'id' => nil}).length)

    assert_equal(1, @soup.find_all(nil, :attrs => {'width' => 100}).length)
  end

  #soup.b and soup.b_tag are both shorthand for find('b').
  def test_tag_name_as_method
    first_b = @soup.find('b')
    assert_equal(first_b, @soup.b)
    assert_equal(first_b, @soup.b_tag)
  end

  #Lookup by a list of acceptable tag names.
  def test_fetch_by_list
    matching = @soup.find_all(['a', 'abc:d'])
    assert_equal(3, matching.length)
  end

  #Lookup by a hash whose keys are the acceptable tag names.
  def test_fetch_by_hash
    matching = @soup.find_all({'a' => true, 'b' => true})
    assert_equal(4, matching.length)
  end

  #Lookup by a regular expression applied to tag names.
  def test_fetch_by_re
    r = /a.*/
    assert_equal(3, @soup.find_all(r).length)
  end

  #Lookup by a Proc passed as the name argument.
  def test_fetch_by_method
    #The block must not use `return`: inside Proc.new a `return` exits
    #the method that created the proc, which would end this test on
    #the first call, before any assertion had run.
    name_is_id = Proc.new { |x| x.name == x['id'] }
    matching = @soup.find_all(name_is_id)
    assert_equal(2, matching.length)
    assert_equal('a', matching[0].name)
  end

end
|
117
|
+
|
118
|
+
#Testing the integrity of the parse tree.
|
119
|
+
class FollowThatTag < SoupTest

  #Parsed once when the class is loaded. NOTE(review): no test visible
  #in this class reads @@PROXIMITY_TEST.
  @@PROXIMITY_TEST = BeautifulStoneSoup.new('<b id="1"><b id="2"><b id="3"><b id="4">')

  #Shared, read-only fixture for the two sibling tests below.
  @@SIBLING_TEST = BeautifulStoneSoup.new('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')

  #find_parents / find_parent walk up through the enclosing tags.
  def test_parents
    soup = BeautifulSoup.new('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah</b></ul></ul></ul>')
    b = soup.find('b')
    assert_equal(2, b.find_parents('ul', :attrs => {'id' => 'foo'}).length)
    assert_equal('b', b.find_parent('ul')['a'])
  end

  #find_next follows parse order; the *_sibling methods stay on the
  #same level of the tree.
  def test_next_sibling
    soup = @@SIBLING_TEST
    tag = 'blockquote'
    b = soup.find(tag, :attrs => {'id' => 2})
    assert_equal('2.1', b.find_next(tag)['id'])
    #The original asserted this lookup twice in a row; the repetition
    #is kept so that a repeated call stays covered.
    2.times { assert_equal('3', b.find_next_sibling(tag)['id']) }
    assert_equal(2, b.find_next_siblings(tag).length)
    assert_equal(1, b.find_next_siblings(tag, :attrs => {'id' => 4}).length)
  end

  #Mirror image of test_next_sibling, walking backwards.
  def test_previous_sibling
    soup = @@SIBLING_TEST
    tag = 'blockquote'
    b = soup.find(tag, :attrs => {'id' => 3})
    assert_equal('2.1', b.find_previous(tag)['id'])
    #Asserted twice in the original as well; see test_next_sibling.
    2.times { assert_equal('2', b.find_previous_sibling(tag)['id']) }
    assert_equal(2, b.find_previous_siblings(tag).length)
    assert_equal(1, b.find_previous_siblings(tag, :attrs => {'id' => 1}).length)
  end

  #The same navigation methods, starting from a text node.
  def test_text_navigation
    soup = BeautifulSoup.new('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
    baz = soup.find_text('Baz')
    assert_equal('1', baz.find_parent("i")['id'])
    assert_equal('Blee', baz.find_next(nil, :text => 'Blee'))
    assert_equal('Blee', baz.find_next_sibling(nil, :text => 'Blee'))
    #'Blargh' is in the document but is not a sibling of 'Baz'.
    assert_nil(baz.find_next_sibling(nil, :text => 'Blargh'))
    assert_equal('1', baz.find_next_sibling('hr')['id'])
  end

end
|
165
|
+
|
166
|
+
#Tests the next_sibling and previous_sibling navigation.
|
167
|
+
class SiblingRivalry < SoupTest

  #Walks next_sibling / previous_sibling across both tags and text
  #nodes inside a list.
  def test_siblings
    soup = BeautifulSoup.new("<ul><li>1<p>A</p>B</li><li>2</li><li>3</li></ul>")
    second_li = soup.find('li').next_sibling
    #Expected value first, as Test::Unit's assert_equal requires.
    assert_equal('li', second_li.name)
    assert_equal('2', second_li.string)
    assert_equal('p', soup.find_text('1').next_sibling.name)
    assert_equal('B', soup.find('p').next_sibling)
    #Stepping forward, back, and forward again lands on the same node.
    assert_equal('B',
                 soup.find('p').next_sibling.previous_sibling.next_sibling)
  end
end
|
180
|
+
|
181
|
+
#Tests the various built-in functions of Tag objects.
|
182
|
+
class TagsAreObjectsToo < SoupTest

  #Shared fixture: a <top> tag with three children (text, <b>, text).
  @@SOUP = BeautifulSoup.new('<top id="1">1<b>2</b>3</top>')

  #A tag's length is its number of children.
  def test_length
    assert_equal(3, @@SOUP.top.length)
  end

  #Indexing a tag with a string reads the attribute of that name.
  def test_hash_lookup
    assert_equal("1", @@SOUP.top['id'])
  end

  #each yields the tag's children in document order.
  def test_iterator
    bucket = []
    @@SOUP.top.each { |child| bucket << child }
    assert_equal(3, bucket.length)
    assert_equal("3", bucket[2])
  end

end
|
204
|
+
|
205
|
+
#Tests the use of 'string' as an alias for a tag's only content.
|
206
|
+
class StringEmUp < SoupTest

  #A tag whose only child is text exposes that text as .string.
  def test_string
    s = BeautifulSoup.new('<b>foo</b>')
    assert_equal('foo', s.b.string)
  end

  #A tag with more than one child has no .string.
  def test_lack_of_string
    s = BeautifulSoup.new("<b>f<i>e</i>o</b>")
    assert_nil(s.b.string)
  end
end
|
218
|
+
|
219
|
+
#Tests the limit argument.
|
220
|
+
class ThatsMyLimit < SoupTest

  #Without :limit every match is returned; with it, at most that many.
  def test_basic_limits
    s = BeautifulSoup.new('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
    assert_equal(4, s.find_all('br').length)
    assert_equal(2, s.find_all('br', :limit => 2).length)
  end
end
|
228
|
+
|
229
|
+
#Testing the modification of the tree.
|
230
|
+
class WriteOnlyCode < SoupTest

  #Replacing an entry in a tag's contents changes the rendered output.
  def test_replace_contents
    soup = BeautifulSoup.new('<a>foo</a>')
    soup.a.contents[0] = NavigableString.new('bar')
    assert_equal('<a>bar</a>', soup.render_contents)
  end

  #Attributes can be overwritten, cleared by assigning nil, added,
  #and deleted; each change shows up in the rendered output.
  def test_modify_attributes
    soup = BeautifulSoup.new('<a id="1"></a>')
    first_a = soup.find('a')

    first_a['id'] = 2
    assert_equal('<a id="2"></a>', soup.render_contents)
    first_a['id'] = nil
    assert_equal('<a></a>', soup.render_contents)

    first_a['id2'] = 'foo'
    assert_equal('<a id2="foo"></a>', soup.render_contents)
    first_a.delete('id2')
    assert_equal('<a></a>', soup.render_contents)
  end

  #Makes sure tags don't step on each others' toes.
  def test_new_tag_
    soup = BeautifulSoup.new('')
    a = Tag.new(soup, 'a')
    ol = Tag.new(soup, 'ol')
    a["href"] = "http://foo.com/"
    #An attribute set on one new tag must not appear on another.
    assert_nil(ol["href"])
  end
end
|
262
|
+
|
263
|
+
#Our operators do it all! Call now!
|
264
|
+
class OperatorOverload < SoupTest

  def test_tag_name_as_find
    # Tests that referencing a tag name as a member delegates to find.
    soup = BeautifulSoup.new('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
    assert_equal(soup.find('b').find('i'), soup.b.i)
    assert_equal('bar', soup.b.i.string)
    assert_equal('1', soup.b['id'])
    assert_equal('foo', soup.b.contents[0])
    #There is no <a> tag in the document.
    assert_nil(soup.a)

    #Test the .foo_tag variant of .foo.
    assert_equal('bar', soup.b_tag.i_tag.string)
    assert_equal('bar', soup.b.i_tag.string)
    assert_equal(soup.find('b').find('i'), soup.b_tag.i_tag)
  end
end
|
281
|
+
|
282
|
+
#Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!
|
283
|
+
class NestableEgg < SoupTest

  #A <p> inside a <blockquote> stays inside it, and the trailing
  #unclosed <p> ends up as a direct child of the document.
  def test_para_inside_blockquote
    soup = BeautifulSoup.new('<blockquote><p><b>Foo</b></p></blockquote><p>Bar')
    assert_equal('Foo', soup.blockquote.p.b.string)
    assert_equal('Foo', soup.blockquote.b.string)
    assert_equal('Bar', soup.find('p', :recursive => false).string)
  end

  #A table placed inside a cell of another table nests within it.
  def test_nested_tables
    #Two-line document; the line break falls inside the outer <td>.
    text = %{<table id="1"><tr><td>Here's another table:\n} +
           %{<table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>}
    soup = BeautifulSoup.new(text)
    assert_equal('Juicy text', soup.table.table.td.string)
    assert_equal(2, soup.find_all('table').length)
    assert_equal(1, soup.table.find_all('table').length)
    #Inner table -> td -> tr -> outer table.
    inner = soup.find('table', :attrs => {'id' => 2})
    assert_equal('table', inner.parent.parent.parent.name)
  end

  #A table placed directly inside a <tr> (no <td>) still nests.
  def test_bad_nested_tables
    soup = BeautifulSoup.new("<table><tr><table><tr id='nested'></tr></table></tr></table>")
    assert_equal('nested', soup.table.tr.table.tr['id'])
  end
end
|
308
|
+
|
309
|
+
|
310
|
+
#Here we test cleanup of text that breaks an unaltered parser or is just
|
311
|
+
#obnoxious.
|
312
|
+
class CleanupOnAisleFour < SoupTest

  #<br/> is rendered back out as <br />.
  def test_self_closing_tag
    assert_equal('<br />',
                 BeautifulStoneSoup.new("Foo<br/>Bar").find('br').to_s)
    assert_soup_equals('<p>test1<br/>test2</p>',
                       '<p>test1<br />test2</p>')
  end

  #Smoke test: a closing tag that skips an open tag must not raise.
  def test_bad_closing_tags
    assert_nothing_raised do
      BeautifulStoneSoup.new("<a>Foo<b>Bar</a>")
    end
  end

  #Smoke test: a closing tag with no matching opener must not raise.
  def test_premature_closing_tag
    assert_nothing_raised do
      BeautifulStoneSoup.new("</b><a>Foo<b>Bar</a>")
    end
  end

  def test_bad_doctype
    assert_soup_equals("<!DOCTYPE foo='bar'>")
  end

  def test_whitespace_in_declaration
    assert_soup_equals('<! DOCTYPE>', '<!DOCTYPE>')
  end

  def test_JunkInDeclaration
    assert_soup_equals('<! Foo = -8>a', '<!Foo = -8>a')
  end

  def test_incomplete_declaration
    assert_soup_equals('a<!b <p>c', 'a<!b <p>c</p>')
  end

  def test_valid_but_bogus_declaration
    assert_soup_equals('<! Foo >a', '<!Foo >a')
  end

  #This fails for a totally bogus reason! I can't figure it out.
  #def test_smart_quotes_not_so_smart_anymore_FAILS
  #  assert_soup_equals("\x91Foo\x92", '‘Foo’')
  #end

  #def test_incomplete_declaration_at_endFAILS
  #  assert_soup_equals('a<!b')
  #end

end
|
359
|
+
|
360
|
+
#Verifies that the parser treats multiple feed calls the same as one
|
361
|
+
#big feed call only if constructed with
|
362
|
+
#:initial_text_is_everything => false.
|
363
|
+
class KeepOnParsing < SoupTest

  #Builds the same document three ways: in one piece, in two pieces
  #with the default options, and in two pieces with
  #:initial_text_is_everything => false. Only the last is expected to
  #equal the one-piece parse.
  def test_multiple_parse_calls
    head = '<foo>bah<bar>'
    tail = 'blee</bar></foo>'

    whole = BeautifulSoup.new(head + tail)

    default_soup = BeautifulSoup.new(head)
    default_soup.feed(tail)

    incremental_soup = BeautifulSoup.new(head, :initial_text_is_everything => false)
    incremental_soup.feed(tail)

    assert_not_equal(whole, default_soup)
    assert_equal(whole, incremental_soup)
  end
end
|
378
|
+
|
379
|
+
#Verifies that BeautifulSOAP parser works.
|
380
|
+
class SOAPMeUp < SoupTest

  #A child tag holding only text is copied onto its parent as an
  #attribute named after the child.
  def test_basic_soap
    s = "<foo><bar>baz</bar></foo>"
    soup = BeautifulSOAP.new(s)
    assert_equal(%{<foo bar="baz"><bar>baz</bar></foo>}, soup.to_s)
  end

  #An attribute the parent already has is left untouched.
  def test_dont_overwrite_existing_attr
    s = %{<foo bar="don't kill me!"><bar>baz</bar></foo>}
    soup = BeautifulSOAP.new(s)
    assert_equal(s, soup.to_s)
  end
end
|
393
|
+
|
394
|
+
#The Unicode test suite has not yet been ported because I haven't
|
395
|
+
#figured out how Ruby does Unicode.
|
396
|
+
|
397
|
+
# class UnicodeRed < SoupTest
|
398
|
+
# "Makes sure Unicode works."
|
399
|
+
|
400
|
+
# def setUp
|
401
|
+
# text = 'foo<b>bar</b>'
|
402
|
+
# @soup = BeautifulStoneSoup
|
403
|
+
# @soup.feed(text)
|
404
|
+
|
405
|
+
# def test_BasicUnicode
|
406
|
+
# import types
|
407
|
+
# sType = types.StringType
|
408
|
+
# uType = types.UnicodeType
|
409
|
+
|
410
|
+
# u = u'\3100'
|
411
|
+
# #It starts out ASCII...
|
412
|
+
# assert_equal(type(@soup.renderContents), sType)
|
413
|
+
# assert_equal(type(@soup.prettify), sType)
|
414
|
+
# #But you can have unicode if you want.
|
415
|
+
# assert_equal(type(unicode(@soup)), uType)
|
416
|
+
|
417
|
+
# #Add a Unicode character and it's Unicode.
|
418
|
+
# @soup.feed(u)
|
419
|
+
# assert_equal(type(@soup.renderContents), uType)
|
420
|
+
# assert_equal(type(@soup.prettify), uType)
|
421
|
+
# #But you can have ASCII if you want.
|
422
|
+
# assert_equal(type(str(@soup)), sType)
|
423
|
+
|
424
|
+
# #The part without any Unicode is still ASCII.
|
425
|
+
# assert_equal(type(@soup.b.prettify), sType)
|
426
|
+
|
427
|
+
# #But if you add a Unicode character it'll become Unicode.
|
428
|
+
# @soup.b['foo'] = u'\3100'
|
429
|
+
# assert_equal(type(@soup.b.prettify), uType)
|
430
|
+
|
431
|
+
|
metadata
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.4
|
3
|
+
specification_version: 1
|
4
|
+
name: rubyful_soup
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 1.0.1
|
7
|
+
date: 2005-10-21
|
8
|
+
summary: An HTML/XML parser that handles bad markup and provides tree traversal methods.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: leonardr@segfault.org
|
12
|
+
homepage: http://www.crummy.com/software/RubyfulSoup/
|
13
|
+
rubyforge_project:
|
14
|
+
description: "Rubyful Soup is a *ML parser that makes screen-scraping easy. It won't choke on
|
15
|
+
bad markup, and it's easy to locate the part of a document you want."
|
16
|
+
autorequire:
|
17
|
+
default_executable:
|
18
|
+
bindir: bin
|
19
|
+
has_rdoc: true
|
20
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
21
|
+
requirements:
|
22
|
+
-
|
23
|
+
- ">"
|
24
|
+
- !ruby/object:Gem::Version
|
25
|
+
version: 0.0.0
|
26
|
+
version:
|
27
|
+
platform: ruby
|
28
|
+
authors:
|
29
|
+
- Leonard Richardson
|
30
|
+
files:
|
31
|
+
- lib/rubyful_soup.rb
|
32
|
+
- tests/rubyful_soup_tests.rb
|
33
|
+
- CHANGELOG
|
34
|
+
test_files:
|
35
|
+
- tests/rubyful_soup_tests.rb
|
36
|
+
rdoc_options: []
|
37
|
+
extra_rdoc_files:
|
38
|
+
- CHANGELOG
|
39
|
+
executables: []
|
40
|
+
extensions: []
|
41
|
+
requirements: []
|
42
|
+
dependencies:
|
43
|
+
- !ruby/object:Gem::Dependency
|
44
|
+
name: htmltools
|
45
|
+
version_requirement:
|
46
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
47
|
+
requirements:
|
48
|
+
-
|
49
|
+
- ">"
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: 0.0.0
|
52
|
+
version:
|