rubyful_soup_2011 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rubyful_soup.rb +950 -0
- data/tests/rubyful_soup_tests.rb +441 -0
- metadata +57 -0
data/lib/rubyful_soup.rb
ADDED
@@ -0,0 +1,950 @@
|
|
1
|
+
#Rubyful Soup
|
2
|
+
#Elixir and Tonic
|
3
|
+
#"The Screen-Scraper's Friend"
|
4
|
+
#v1.0.4
|
5
|
+
#http://www.crummy.com/software/RubyfulSoup/
|
6
|
+
#
|
7
|
+
#Rubyful Soup is a port to the Ruby language and idiom of the Python
|
8
|
+
#library Beautiful Soup.
|
9
|
+
#See http://www.crummy.com/software/BeautifulSoup/ for details on the original.
|
10
|
+
|
11
|
+
#This library requires the sgml-parser library, written by Takahiro
|
12
|
+
#Maebashi. The easiest way to get it is to install the "htmltools"
|
13
|
+
#gem.
|
14
|
+
require 'html/sgml-parser'
|
15
|
+
require 'set'
|
16
|
+
|
17
|
+
#UTF-8 voodoo--does this really work?
|
18
|
+
$KCODE = 'u'
|
19
|
+
# require 'jcode'
|
20
|
+
|
21
|
+
#This code makes SGMLParser able to parse XML with namespaces.
|
22
|
+
class HTML::SGMLParser
|
23
|
+
if const_defined? :Tagfind
|
24
|
+
remove_const(:Tagfind)
|
25
|
+
Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
module PageElement
|
30
|
+
|
31
|
+
attr_reader :parser
|
32
|
+
attr_accessor :parent, :previous_parsed, :next_parsed, :previous_sibling
|
33
|
+
attr_accessor :next_sibling
|
34
|
+
|
35
|
+
def setup(parent=nil, previous_parsed=nil)
|
36
|
+
@parent = parent
|
37
|
+
@previous_parsed = previous_parsed
|
38
|
+
@next_parsed = nil
|
39
|
+
@previous_sibling = nil
|
40
|
+
@next_sibling = nil
|
41
|
+
if @parent and not @parent.contents.empty?
|
42
|
+
@previous_sibling = @parent.contents[-1]
|
43
|
+
@previous_sibling.next_sibling = self
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
#A bunch of different iterators over a parsed document.
|
48
|
+
{
|
49
|
+
#Iterates in parse order over the rest of the items in this document.
|
50
|
+
:next_parsed_items => :next_parsed,
|
51
|
+
|
52
|
+
#Iterates in reverse parse order over all previously parsed items in
|
53
|
+
#this document.
|
54
|
+
:previous_parsed_items => :previous_parsed,
|
55
|
+
|
56
|
+
#Iterates in parse order over all subsequent siblings of this item.
|
57
|
+
:next_siblings => :next_sibling,
|
58
|
+
|
59
|
+
#Iterates in reverse parse order over all prior siblings of this item.
|
60
|
+
:previous_siblings => :previous_sibling,
|
61
|
+
|
62
|
+
#Iterates upwards through the parentage of this item.
|
63
|
+
:parents => :parent
|
64
|
+
}.each do |k,v|
|
65
|
+
class_eval %{
|
66
|
+
def #{k}
|
67
|
+
i = self
|
68
|
+
while i
|
69
|
+
i = i.#{v}
|
70
|
+
yield i if i
|
71
|
+
end
|
72
|
+
end
|
73
|
+
}
|
74
|
+
end
|
75
|
+
|
76
|
+
#Generates the directional finder methods. Each row names a singular
#finder (returns the first match, or nil), a plural finder (returns
#every match as a list), and the iterator method both of them walk.
[ #First item/all items matching the given criteria and appearing
  #after this PageElement in the document.
  [:find_next, :find_all_next, 'next_parsed_items'],

  #First item/all items matching the given criteria and appearing
  #before this PageElement in the document.
  [:find_previous, :find_all_previous, 'previous_parsed_items'],

  #Nearest sibling/all siblings of this PageElement matching the given
  #criteria and appearing before this PageElement in the document.
  [:find_previous_sibling, :find_previous_siblings, 'previous_siblings'],

  #Nearest sibling/all siblings of this PageElement matching the given
  #criteria and appearing after this PageElement in the document.
  [:find_next_sibling, :find_next_siblings, 'next_siblings'],

  #Nearest parent/all parents of this PageElement matching the given
  #criteria.
  [:find_parent, :find_parents, 'parents'],
].each do |singular, plural, method_name|
  class_eval %{
    def #{singular}(name=nil, args={}, &block)
      # fetch reads the limit under the :limit symbol key. Merging into
      # a copy also leaves the hash passed by the caller untouched.
      fetch(method('#{method_name}'), name, args.merge(:limit => 1), block)[0]
    end

    def #{plural}(name=nil, args={}, &block)
      fetch(method('#{method_name}'), name, args, block)
    end
  }
end
|
109
|
+
|
110
|
+
protected
|
111
|
+
|
112
|
+
#Returns a list of items matching the given criteria, obtained by
|
113
|
+
#iterating over the given iterator.
|
114
|
+
def fetch(iterator, name, args, block)
|
115
|
+
attrs = args[:attrs]
|
116
|
+
limit = args[:limit]
|
117
|
+
text = args[:text]
|
118
|
+
|
119
|
+
attrs ||= {}
|
120
|
+
if attrs != nil and not attrs.respond_to? :keys
|
121
|
+
attrs = {'class' => attrs}
|
122
|
+
end
|
123
|
+
bucket = []
|
124
|
+
catch(:stop_iteration) do
|
125
|
+
iterator.call do |item|
|
126
|
+
match = false
|
127
|
+
if block
|
128
|
+
match = true if block.call(item)
|
129
|
+
elsif item.is_a? Tag
|
130
|
+
#A tag matches if its name matches and its attributes line up.
|
131
|
+
if not text and (not name or PageElement.matches(item, name))
|
132
|
+
match = true
|
133
|
+
attrs.each_pair do |attr, matchAgainst|
|
134
|
+
check = item[attr]
|
135
|
+
unless PageElement.matches(check, matchAgainst)
|
136
|
+
match = false
|
137
|
+
break
|
138
|
+
end
|
139
|
+
end
|
140
|
+
end
|
141
|
+
elsif text
|
142
|
+
#A text matches if its string value matches the given text
|
143
|
+
#criterion.
|
144
|
+
match = PageElement.matches(item, text)
|
145
|
+
end
|
146
|
+
if match
|
147
|
+
bucket.push(item)
|
148
|
+
if limit and bucket.length >= limit
|
149
|
+
throw :stop_iteration
|
150
|
+
end
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
return bucket
|
155
|
+
end
|
156
|
+
|
157
|
+
#Used to tell whether a Tag or a NavigableString "matches" some data
|
158
|
+
#structure.
|
159
|
+
def PageElement.matches(chunk, how_to_match)
|
160
|
+
#puts "Seeing if #{chunk.class} #{chunk} matches #{how_to_match.class} #{how_to_match}."
|
161
|
+
#
|
162
|
+
# If given a list of items, return true if the list contains a
|
163
|
+
# text element that matches.
|
164
|
+
if chunk.is_a? Array
|
165
|
+
chunk.each do |tag|
|
166
|
+
return true if tag.is_a? NavigableString and matches(tag, how_to_match)
|
167
|
+
end
|
168
|
+
return false
|
169
|
+
elsif how_to_match.is_a? Proc
|
170
|
+
return how_to_match.call(chunk)
|
171
|
+
elsif chunk.is_a? Tag
|
172
|
+
#Custom match methods take the tag as an argument, but all other
|
173
|
+
#ways of matching match the tag name as a string
|
174
|
+
chunk = chunk.name
|
175
|
+
end
|
176
|
+
|
177
|
+
#At this point we know that chunk is a string
|
178
|
+
unless chunk.is_a? String
|
179
|
+
chunk = chunk.to_s
|
180
|
+
end
|
181
|
+
if how_to_match.is_a? Regexp
|
182
|
+
return how_to_match.match(chunk) != nil
|
183
|
+
elsif how_to_match.is_a? Array
|
184
|
+
return how_to_match.find {|x| x == chunk} != nil
|
185
|
+
elsif how_to_match.is_a? Hash
|
186
|
+
return how_to_match[chunk] != nil
|
187
|
+
else
|
188
|
+
#It's just a string
|
189
|
+
return how_to_match.to_s == chunk
|
190
|
+
end
|
191
|
+
end
|
192
|
+
|
193
|
+
end
|
194
|
+
|
195
|
+
module TagModule
|
196
|
+
|
197
|
+
include Enumerable
|
198
|
+
include PageElement
|
199
|
+
|
200
|
+
attr_accessor :name, :contents, :attrs, :string
|
201
|
+
|
202
|
+
#I tried to have Tag subclass Method, but it killed the
|
203
|
+
#whole thing. Maybe I should just leave well enough alone.
|
204
|
+
#
|
205
|
+
#def arity
|
206
|
+
# return methods('find_all').arity
|
207
|
+
#end
|
208
|
+
#
|
209
|
+
#def call(*args)
|
210
|
+
# return find_all(*args)
|
211
|
+
#end
|
212
|
+
#
|
213
|
+
#def to_proc
|
214
|
+
# return methods('find_all').to_proc
|
215
|
+
#end
|
216
|
+
|
217
|
+
def initialize(parser, name, attr_list=[], parent=nil, previous=nil)
|
218
|
+
@hidden = false
|
219
|
+
@parser = parser
|
220
|
+
@name = name
|
221
|
+
@attr_list = attr_list
|
222
|
+
@attrs = nil
|
223
|
+
@contents = []
|
224
|
+
setup(parent, previous)
|
225
|
+
end
|
226
|
+
|
227
|
+
# Turn the list of attributes into a hash on demand, so we don't have
|
228
|
+
# to do it for every tag while parsing.
|
229
|
+
|
230
|
+
def attrs
|
231
|
+
unless @attrs
|
232
|
+
@attrs = @attr_list.inject({}) do |m,v|
|
233
|
+
if v[1][0] == ?" and v[1][-1] == ?"
|
234
|
+
v[1] = v[1][1..-2]
|
235
|
+
end
|
236
|
+
m[v[0]] = v[1]
|
237
|
+
m
|
238
|
+
end
|
239
|
+
@attr_list = nil
|
240
|
+
end
|
241
|
+
return @attrs
|
242
|
+
end
|
243
|
+
|
244
|
+
#soup.title_tag, or soup.title, is the same as soup.find('title')
|
245
|
+
def method_missing(name, *args)
|
246
|
+
#puts "Missing method #{name} for #{self.class.name}"
|
247
|
+
name = name.to_s
|
248
|
+
if name[-4...name.length] == '_tag'
|
249
|
+
name = name[0...name.length-4]
|
250
|
+
end
|
251
|
+
return find(name, *args)
|
252
|
+
end
|
253
|
+
|
254
|
+
def [](k)
|
255
|
+
attrs[k]
|
256
|
+
end
|
257
|
+
|
258
|
+
def []=(k, v)
|
259
|
+
attrs[k] = v
|
260
|
+
end
|
261
|
+
|
262
|
+
def delete(k)
|
263
|
+
attrs.delete(k)
|
264
|
+
end
|
265
|
+
|
266
|
+
#Reports whether this tag carries an attribute named k.
def has_key?(k)
  attrs.has_key?(k)
end
|
269
|
+
|
270
|
+
def each
|
271
|
+
@contents.each { |x| yield x }
|
272
|
+
end
|
273
|
+
|
274
|
+
def length
|
275
|
+
return contents.length
|
276
|
+
end
|
277
|
+
alias size length
|
278
|
+
|
279
|
+
def self_closing?
|
280
|
+
return @parser.self_closing_tag?(@name)
|
281
|
+
end
|
282
|
+
|
283
|
+
#Adds the given tag to the contents of this tag
|
284
|
+
def append(tag)
|
285
|
+
@contents.push(tag)
|
286
|
+
end
|
287
|
+
|
288
|
+
def to_str
|
289
|
+
return to_s
|
290
|
+
end
|
291
|
+
|
292
|
+
#Renders this tag and its contents as a pretty-printed string.
|
293
|
+
def prettify
|
294
|
+
return to_s(true)
|
295
|
+
end
|
296
|
+
|
297
|
+
def inspect
|
298
|
+
to_s
|
299
|
+
end
|
300
|
+
|
301
|
+
#Renders this tag and its contents as a string. NOTE: since REXML
|
302
|
+
#consumes whitespace, this method is not certain to reproduce the
|
303
|
+
#whitespace present in the original string.
|
304
|
+
def to_s(show_structure_indent=nil)
|
305
|
+
attr_strings = []
|
306
|
+
attrs.each { |k,v| attr_strings << %{#{k}="#{v}"} if v }
|
307
|
+
if self_closing?
|
308
|
+
close = ' /'
|
309
|
+
closeTag = nil
|
310
|
+
else
|
311
|
+
close = nil
|
312
|
+
closeTag = "</#{name}>"
|
313
|
+
end
|
314
|
+
indent_increment = show_structure_indent==true ? 0 : show_structure_indent
|
315
|
+
if show_structure_indent
|
316
|
+
indent_increment += 1 unless @hidden
|
317
|
+
end
|
318
|
+
contents = render_contents(indent_increment)
|
319
|
+
space = "\n #{' ' * indent_increment}" if show_structure_indent
|
320
|
+
if @hidden
|
321
|
+
s = contents
|
322
|
+
else
|
323
|
+
s = []
|
324
|
+
attribute_string = ''
|
325
|
+
unless attr_strings.empty?
|
326
|
+
attribute_string = ' ' + attr_strings.join(' ')
|
327
|
+
end
|
328
|
+
s.push(space) if show_structure_indent
|
329
|
+
s.push("<#{@name}#{attribute_string}#{close}>")
|
330
|
+
s.push(contents)
|
331
|
+
s.push(space) if closeTag and show_structure_indent
|
332
|
+
s.push(closeTag)
|
333
|
+
s = s.join('')
|
334
|
+
end
|
335
|
+
return s
|
336
|
+
end
|
337
|
+
|
338
|
+
#Renders the contents of this tag as a string.
|
339
|
+
def render_contents(show_structure_indent=nil)
|
340
|
+
s=[]
|
341
|
+
@contents.each do |c|
|
342
|
+
text = nil
|
343
|
+
if c.is_a? Tag
|
344
|
+
text = c.to_s(show_structure_indent)
|
345
|
+
else
|
346
|
+
text = c.to_s
|
347
|
+
end
|
348
|
+
if text
|
349
|
+
if show_structure_indent
|
350
|
+
text.chomp!
|
351
|
+
end
|
352
|
+
s.push(text)
|
353
|
+
end
|
354
|
+
end
|
355
|
+
return s.join('')
|
356
|
+
end
|
357
|
+
|
358
|
+
def recursive_children
|
359
|
+
stack = [[self, 0]]
|
360
|
+
catch(:stop_iteration) do
|
361
|
+
until stack.empty?
|
362
|
+
tag, start = stack.pop
|
363
|
+
for i in start...tag.contents.length
|
364
|
+
a = tag.contents[i]
|
365
|
+
yield a
|
366
|
+
if a.is_a? TagModule and not tag.contents.empty? and i < tag.contents.length
|
367
|
+
stack.push([tag, i+1])
|
368
|
+
stack.push([a, 0])
|
369
|
+
break
|
370
|
+
end
|
371
|
+
end if tag.is_a? TagModule
|
372
|
+
end
|
373
|
+
end
|
374
|
+
end
|
375
|
+
|
376
|
+
#Iterates over the direct children of this Tag.
|
377
|
+
def children
|
378
|
+
catch(:stop_iteration) { @contents.each { |x| yield x } }
|
379
|
+
end
|
380
|
+
|
381
|
+
#Convenience method to retrieve the first piece of text matching the
|
382
|
+
#given criteria. 'text' can be a string, a regular expression object,
|
383
|
+
#a Proc that takes a string and returns whether or not the
|
384
|
+
#string 'matches', etc.
|
385
|
+
def find_text(text=nil, &block)
|
386
|
+
args = { :text => text, :limit => 1}
|
387
|
+
iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
|
388
|
+
fetch(iterator, nil, args, block)[0]
|
389
|
+
end
|
390
|
+
|
391
|
+
#Convenience method to retrieve all pieces of text matching the
#given criteria. 'text' can be a string, a regular expression object,
#a Proc that takes a string and returns whether or not the
#string 'matches', etc.
#Args: :limit, :recursive (pass false to look at direct children only)
def find_all_text(text=nil, args={}, &block)
  #fetch looks the criterion up under the :text symbol key. Merging
  #into a copy keeps the caller's hash unmodified.
  args = args.merge(:text => text)
  iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
  fetch(iterator, nil, args, block)
end
|
401
|
+
|
402
|
+
#Extracts a list of Tag objects that match the given criteria. You
|
403
|
+
#can specify the name of the Tag and any attributes you want the Tag
|
404
|
+
#to have.
|
405
|
+
#
|
406
|
+
#The value of a key-value pair in the 'attrs' map can be a string, a
|
407
|
+
#list of strings, a regular expression object, or a Proc object that
|
408
|
+
#takes a string and returns whether or not the string matches for
|
409
|
+
#some custom definition of 'matches'. The same is true of the tag
|
410
|
+
#name, except that a Proc object will be passed the Tag object instead
|
411
|
+
#of just a string.
|
412
|
+
#Args: :attrs :text :limit :recursive
|
413
|
+
def find_all(name=nil, args={}, &block)
|
414
|
+
iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
|
415
|
+
fetch(iterator, name, args, block)
|
416
|
+
end
|
417
|
+
|
418
|
+
#Returns the first Tag or NavigableString object that matches the
#given criteria, or nil when nothing matches. Takes much the same
#arguments as fetch.
#args: :attrs :text :recursive (:limit is always forced to 1)
def find(name=nil, args={}, &block)
  #Merge into a copy so the hash handed in by the caller is not modified.
  args = args.merge(:limit => 1)
  iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
  fetch(iterator, name, args, block)[0]
end
|
426
|
+
end
|
427
|
+
|
428
|
+
class Tag
|
429
|
+
include TagModule
|
430
|
+
end
|
431
|
+
|
432
|
+
class NavigableString < String
|
433
|
+
include PageElement
|
434
|
+
end
|
435
|
+
|
436
|
+
#This class contains the basic parser and fetch code. It defines
|
437
|
+
#a parser that knows nothing about tag behavior except for the
|
438
|
+
#following:
|
439
|
+
#
|
440
|
+
#You can't close a tag without closing all the tags it encloses.
|
441
|
+
#That is, "<foo><bar></foo>" actually means
|
442
|
+
#"<foo><bar></bar></foo>".
|
443
|
+
#
|
444
|
+
#[Another possible explanation is "<foo><bar /></foo>", but since
|
445
|
+
# this class defines no self_closing_tags, it will never use that
|
446
|
+
# explanation.]
|
447
|
+
#
|
448
|
+
#This class is useful for parsing XML or made-up markup languages,
|
449
|
+
#or when BeautifulSoup makes an assumption counter to what you were
|
450
|
+
#expecting.
|
451
|
+
class BeautifulStoneSoup < HTML::SGMLParser
|
452
|
+
include TagModule
|
453
|
+
|
454
|
+
#As a public service we will by default silently replace MS smart quotes
#and similar characters with their HTML or ASCII equivalents.
#Keys are single bytes in the 0x80-0x9f range and must be written with
#double quotes so that the \x escape is interpreted.
@@ms_chars = { "\x80" => '€',
               "\x81" => ' ',
               "\x82" => '‚',
               "\x83" => 'ƒ',
               "\x84" => '„',
               "\x85" => '…',
               "\x86" => '†',
               "\x87" => '‡',
               "\x88" => '⁁',
               "\x89" => '%',
               "\x8A" => 'Š',
               "\x8B" => '<',
               "\x8C" => 'Œ',
               "\x8D" => '?',
               "\x8E" => 'Z',
               "\x8F" => '?',
               "\x90" => '?',
               "\x91" => '‘',
               "\x92" => '’',
               "\x93" => '“',
               "\x94" => '”',
               "\x95" => '•',
               "\x96" => '–',
               "\x97" => '—',
               "\x98" => '˜',
               "\x99" => '™',
               "\x9a" => 'š',
               "\x9b" => '>',
               "\x9c" => 'œ',
               "\x9d" => '?',
               "\x9e" => 'z',
               "\x9f" => 'Ÿ'}
|
488
|
+
|
489
|
+
@@parser_massage = [[/<([^<>]*)\/>/, '<\1></\1>'],
|
490
|
+
[/<!\s+([^<>]*)>/, '<!\1>'],
|
491
|
+
[/([\x80-\x9f])/m, proc { |m| @@ms_chars[m]}]
|
492
|
+
]
|
493
|
+
|
494
|
+
@@rootTagName = '[document]'
|
495
|
+
|
496
|
+
@@nestable_tags = {}
|
497
|
+
@@reset_nesting_tags = {}
|
498
|
+
@@quoteTags = {}
|
499
|
+
@@self_closing_tags = {}
|
500
|
+
|
501
|
+
attr_accessor :hidden
|
502
|
+
|
503
|
+
def self_closing_tag?(tag)
|
504
|
+
@@self_closing_tags.has_key?(tag)
|
505
|
+
end
|
506
|
+
|
507
|
+
#Builds the root of the parse tree and, unless text is nil, feeds text
#to the parser straight away.
#
#Args: :initial_text_is_everything, :avoid_parser_problems, :parse_only_these
# :parse_only_these - a single value, or anything responding to each,
#   collected into a Set that the tag handlers consult.
# :avoid_parser_problems - false turns the pre-parse massaging off; an
#   Enumerable of [regexp, replacement] pairs is used as given; anything
#   else (including leaving it out) selects the built-in @@parser_massage.
# :initial_text_is_everything - unless this is false, done is called
#   once the text has been fed.
def initialize(text, args={})
  super(self, @@rootTagName)
  @quote_stack = []
  @hidden = 1
  if args[:parse_only_these]
    @parse_only_these = Set.new
    p = args[:parse_only_these]
    if p.respond_to? :each
      p.each { |x| @parse_only_these << x }
    else
      @parse_only_these << p
    end
  else
    @parse_only_these = nil
  end
  reset

  #Only a missing (nil) option falls back to true. "option || true"
  #would also override an explicit false, so massaging could never be
  #switched off.
  massage = args[:avoid_parser_problems]
  massage = true if massage.nil?
  @avoid_parser_problems = massage
  if @avoid_parser_problems and not @avoid_parser_problems.is_a? Enumerable
    @avoid_parser_problems = @@parser_massage
  end
  feed(text) if text != nil
  done if args[:initial_text_is_everything] != false
end
|
532
|
+
|
533
|
+
#Runs the configured massage rules over text and passes the result to
#the superclass feed. Each rule is a [regexp, fix] pair; fix is either
#a replacement String or a callable that receives the matched text.
def feed(text)
  if @avoid_parser_problems
    #Work on a copy: gsub! would otherwise rewrite the caller's string
    #in place, and would raise on a frozen one.
    text = text.dup
    @avoid_parser_problems.each do |re, fix|
      if fix.is_a? String
        text.gsub!(re, fix)
      else
        text.gsub!(re) { |x| fix.call(x) }
      end
    end
  end
  #A bare super forwards the current (possibly massaged) value of text.
  super
end
|
549
|
+
|
550
|
+
def ==(anObject)
|
551
|
+
return anObject != nil && anObject.to_s == to_s
|
552
|
+
end
|
553
|
+
|
554
|
+
def done
|
555
|
+
end_text
|
556
|
+
pop_tag while @currentTag.name != @@rootTagName
|
557
|
+
end
|
558
|
+
|
559
|
+
def reset
|
560
|
+
super
|
561
|
+
@currentText = []
|
562
|
+
@currentTag = nil
|
563
|
+
@tag_stack = []
|
564
|
+
push_tag(self)
|
565
|
+
end
|
566
|
+
|
567
|
+
def push_tag(tag)
|
568
|
+
#puts "Push #{ tag.name }"
|
569
|
+
@currentTag.append(tag) if @currentTag
|
570
|
+
@tag_stack.push(tag)
|
571
|
+
@currentTag = @tag_stack[-1]
|
572
|
+
end
|
573
|
+
|
574
|
+
def pop_tag
|
575
|
+
tag = @tag_stack.pop
|
576
|
+
#puts "Pop #{ tag.name }"
|
577
|
+
|
578
|
+
# Tags with just one string-owning child get the child as a
|
579
|
+
# 'string' property, so that soup.tag.string is shorthand for
|
580
|
+
# soup.tag.contents[0]
|
581
|
+
if @currentTag.contents.length == 1 and @currentTag.contents[0].is_a? NavigableString
|
582
|
+
@currentTag.string = @currentTag.contents[0]
|
583
|
+
end
|
584
|
+
|
585
|
+
@currentTag = @tag_stack[-1] unless @tag_stack.empty?
|
586
|
+
@currentTag
|
587
|
+
end
|
588
|
+
|
589
|
+
# StreamListener implementation
|
590
|
+
|
591
|
+
#Parser callback for a start tag. attrs is a list of [name, value]
#pairs.
#
#While a quoting tag is open (@quote_stack is not empty) the tag is not
#treated as markup: it is rebuilt as text and handed to handle_data.
#Otherwise pending text is flushed, a Tag is created, linked into the
#parse order and pushed on the tag stack.
#
#NOTE(review): this method consults @@quoteTags, while BeautifulSoup
#assigns @@quote_tags; confirm which name is intended.
def unknown_starttag(name, attrs)
  unless @quote_stack.empty?
    #This is not a real tag. Double-quoted/%{} literals are required
    #here so that the name and attributes are actually interpolated.
    attr_string = attrs.map { |k, v| %{ #{k}="#{v}"} }.join
    handle_data("<#{name}#{attr_string}>")
    return
  end

  end_text

  return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)
  self_closing = @@self_closing_tags.has_key?(name)
  smart_pop(name) unless self_closing
  tag = Tag.new(self, name, attrs, @currentTag, @previous_parsed)
  @previous_parsed.next_parsed = tag if @previous_parsed
  @previous_parsed = tag
  push_tag(tag)
  #Self-closing tags are closed again immediately.
  pop_tag if self_closing
  #Entering a quoting tag: everything up to its end tag becomes text.
  @quote_stack.push(name) if @@quoteTags.has_key?(name)
end
|
619
|
+
|
620
|
+
#Parser callback for an end tag.
def unknown_endtag(name)
  #Ignore end tags for self-closing tags; they were closed in
  #unknown_starttag.
  #TODO: still necessary?
  return if self_closing_tag?(name)

  if not @quote_stack.empty? and @quote_stack[-1] != name
    #Inside a quoting tag, and this is not the tag that opened it: not
    #a real end tag, so pass it through as text. The literal must be
    #double-quoted for the name to be interpolated.
    handle_data("</#{name}>")
    return
  end

  return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)

  end_text
  pop_to_tag(name)
  #Leaving the quoting tag that was opened most recently.
  @quote_stack.pop if not @quote_stack.empty? and @quote_stack[-1] == name
end
|
640
|
+
|
641
|
+
def handle_data(data)
|
642
|
+
return unless !@parse_only_these or @tag_stack.size > 1
|
643
|
+
@currentText.push(data)
|
644
|
+
end
|
645
|
+
|
646
|
+
#Propagate comments right through.
|
647
|
+
def handle_comment(data)
|
648
|
+
handle_data("<!--#{data}-->")
|
649
|
+
end
|
650
|
+
|
651
|
+
def handle_special(data)
|
652
|
+
handle_data("<#{data}>")
|
653
|
+
end
|
654
|
+
|
655
|
+
#Passes an unresolved numeric character reference through as text,
#e.g. ref "8217" becomes "&#8217;".
#NOTE(review): assumes ref arrives without the leading "#" - confirm
#against the SGML parser in use.
def unknown_charref(ref)
  #The first "#" is literal, the second one starts the interpolation.
  handle_data("&##{ref};")
end
|
658
|
+
|
659
|
+
#Passes an unresolved named entity reference through as text,
#e.g. ref "nbsp" becomes "&nbsp;".
def unknown_entityref(ref)
  handle_data("&#{ref};")
end
|
662
|
+
|
663
|
+
def attlistdecl(element_name, attributes, raw_content)
|
664
|
+
handle_data("<!ATTLIST #{raw_content}>")
|
665
|
+
end
|
666
|
+
|
667
|
+
#Passes a CDATA section through as text, wrapped in its
#"<![CDATA[" ... "]]>" delimiters.
def cdata(content)
  handle_data("<![CDATA[#{content}]]>")
end
|
670
|
+
|
671
|
+
###
|
672
|
+
|
673
|
+
def doctype(*args)
|
674
|
+
content = args.join(' ')
|
675
|
+
##{name} #{pub_sys}#{long_name}#{url}
|
676
|
+
#long_name = ' "#{long_name}"' if long_name
|
677
|
+
#url = ' "#{url}"' if url
|
678
|
+
handle_data("<!DOCTYPE #{content}>")
|
679
|
+
end
|
680
|
+
|
681
|
+
def elementdecl(content)
|
682
|
+
handle_data("<!ELEMENT #{content}>")
|
683
|
+
end
|
684
|
+
|
685
|
+
def entity(content)
|
686
|
+
|
687
|
+
end
|
688
|
+
|
689
|
+
def entitydecl(content)
|
690
|
+
handle_data("<!ENTITY #{content.join(' ')}>")
|
691
|
+
end
|
692
|
+
|
693
|
+
def instruction(name, instruction)
|
694
|
+
handle_data("<?#{name} #{instruction}>")
|
695
|
+
end
|
696
|
+
|
697
|
+
def notationdecl(content)
|
698
|
+
handle_data("<!NOTATION #{content}>")
|
699
|
+
end
|
700
|
+
|
701
|
+
#Passes an XML declaration through as text. encoding and standalone
#may be nil, in which case they contribute nothing to the output.
#NOTE(review): the emitted declaration ends with ">" rather than "?>",
#and standalone is appended verbatim - confirm both are intended.
def xmldecl(version, encoding, standalone)
  #%{} literals interpolate and allow the embedded double quotes.
  encoding = %{ encoding="#{encoding}"} if encoding
  handle_data(%{<?xml version="#{version}"#{encoding}#{standalone}>})
end
|
705
|
+
|
706
|
+
#Called when we're done collecting some text, declarations, etc.
|
707
|
+
def end_text
|
708
|
+
currentText = @currentText.join('')
|
709
|
+
unless currentText.empty?
|
710
|
+
if currentText.strip.empty?
|
711
|
+
if currentText =~ /\n/
|
712
|
+
currentText = "\n"
|
713
|
+
else
|
714
|
+
currentText = ' '
|
715
|
+
end
|
716
|
+
end
|
717
|
+
#puts "Setting up text #{currentText}"
|
718
|
+
currentText = NavigableString.new(currentText)
|
719
|
+
currentText.setup(@currentTag, @previous_parsed)
|
720
|
+
@previous_parsed.next_parsed = currentText if @previous_parsed
|
721
|
+
@previous_parsed = currentText
|
722
|
+
@currentTag.contents.push(currentText)
|
723
|
+
end
|
724
|
+
@currentText = []
|
725
|
+
end
|
726
|
+
|
727
|
+
# Helper methods
|
728
|
+
|
729
|
+
private
|
730
|
+
|
731
|
+
#Pops the tag stack up to and including the most recent
|
732
|
+
#instance of the given tag. If inclusivePop is false, pops the tag
|
733
|
+
#stack up to but *not* including the most recent instance of
|
734
|
+
#the given tag.
|
735
|
+
def pop_to_tag(name, inclusive_pop=true)
|
736
|
+
return if name == @@rootTagName
|
737
|
+
|
738
|
+
#puts "Pop to tag #{ name }. Inclusive? #{inclusive_pop}"
|
739
|
+
num_pops = 0
|
740
|
+
mostRecentTag = nil
|
741
|
+
(@tag_stack.length-1).downto(0) do |i|
|
742
|
+
if name == @tag_stack[i].name
|
743
|
+
#puts "Found at #{i}, #{@tag_stack.length-i}"
|
744
|
+
num_pops = @tag_stack.length-i
|
745
|
+
break
|
746
|
+
end
|
747
|
+
end
|
748
|
+
num_pops -= 1 if not inclusive_pop
|
749
|
+
|
750
|
+
#puts "Popping #{num_pops} times."
|
751
|
+
num_pops.times { mostRecentTag = pop_tag }
|
752
|
+
mostRecentTag
|
753
|
+
end
|
754
|
+
|
755
|
+
#We need to pop up to the previous tag of this type, unless
|
756
|
+
#one of this tag's nesting reset triggers comes between this
|
757
|
+
#tag and the previous tag of this type, OR unless this tag is a
|
758
|
+
#generic nesting trigger and another generic nesting trigger
|
759
|
+
#comes between this tag and the previous tag of this type.
|
760
|
+
#
|
761
|
+
#Examples:
|
762
|
+
# <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
|
763
|
+
# <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
|
764
|
+
# <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
|
765
|
+
# <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
|
766
|
+
#
|
767
|
+
# <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
|
768
|
+
# <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
|
769
|
+
# <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
|
770
|
+
def smart_pop(name)
|
771
|
+
#puts "Smart pop for #{name}"
|
772
|
+
nesting_reset_triggers = @@nestable_tags[name]
|
773
|
+
is_nestable = nesting_reset_triggers != nil
|
774
|
+
is_reset_nesting = @@reset_nesting_tags.has_key?(name)
|
775
|
+
popTo = nil
|
776
|
+
inclusive = true
|
777
|
+
@tag_stack.reverse_each do |p|
|
778
|
+
if (p == nil or p.name == name) and not is_nestable
|
779
|
+
#Non-nestable tags get popped to the top or to their
|
780
|
+
#last occurrence.
|
781
|
+
#puts "Non-nestable tag #{name} gets popped to its last occurance."
|
782
|
+
popTo = name
|
783
|
+
break
|
784
|
+
end
|
785
|
+
if (nesting_reset_triggers != nil and nesting_reset_triggers.include?(p.name)) or (nesting_reset_triggers == nil and is_reset_nesting and @@reset_nesting_tags.has_key?(p.name))
|
786
|
+
#If we encounter one of the nesting reset triggers
|
787
|
+
#peculiar to this tag, or we encounter another tag
|
788
|
+
#that causes nesting to reset, pop up to but not
|
789
|
+
#including that tag.
|
790
|
+
#puts "Nesting reset trigger encountered for #{name}: #{p.name}"
|
791
|
+
popTo = p.name
|
792
|
+
inclusive = false
|
793
|
+
break
|
794
|
+
end
|
795
|
+
p = p.parent
|
796
|
+
end
|
797
|
+
pop_to_tag(popTo, inclusive) if popTo
|
798
|
+
end
|
799
|
+
|
800
|
+
protected
|
801
|
+
|
802
|
+
#Turns a list of maps, lists, or scalars into a single map.
|
803
|
+
#Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
|
804
|
+
#of lists and partial maps.
|
805
|
+
def BeautifulStoneSoup.build_tag_map(default, *args)
|
806
|
+
built = args.inject({}) do |m, portion|
|
807
|
+
if portion.is_a? Hash
|
808
|
+
#It's a map. Merge it.
|
809
|
+
portion.each_pair { |k,v| m[k] = v }
|
810
|
+
elsif portion.is_a? Array
|
811
|
+
#It's a list. Map each item to the default.
|
812
|
+
portion.each { |k| m[k] = default }
|
813
|
+
else
|
814
|
+
#It's a scalar. Map it to the default.
|
815
|
+
m[portion] = default
|
816
|
+
end
|
817
|
+
m
|
818
|
+
end
|
819
|
+
end
|
820
|
+
end
|
821
|
+
|
822
|
+
#This parser knows the following facts about HTML:
|
823
|
+
#
|
824
|
+
#* Some tags have no closing tag and should be interpreted as being
|
825
|
+
# closed as soon as they are encountered.
|
826
|
+
#
|
827
|
+
#* The text inside some tags (ie. 'script') may contain tags which
|
828
|
+
# are not really part of the document and which should be parsed
|
829
|
+
# as text, not tags. If you want to parse the text as tags, you can
|
830
|
+
# always fetch it and parse it explicitly.
|
831
|
+
#
|
832
|
+
#* Tag nesting rules:
|
833
|
+
#
|
834
|
+
# Most tags can't be nested at all. For instance, the occurrence of
|
835
|
+
# a <p> tag should implicitly close the previous <p> tag.
|
836
|
+
#
|
837
|
+
# <p>Para1<p>Para2
|
838
|
+
# should be transformed into:
|
839
|
+
# <p>Para1</p><p>Para2
|
840
|
+
#
|
841
|
+
# Some tags can be nested arbitrarily. For instance, the occurrence
|
842
|
+
# of a <blockquote> tag should _not_ implicitly close the previous
|
843
|
+
# <blockquote> tag.
|
844
|
+
#
|
845
|
+
# Alice said: <blockquote>Bob said: <blockquote>Blah
|
846
|
+
# should NOT be transformed into:
|
847
|
+
# Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
|
848
|
+
#
|
849
|
+
# Some tags can be nested, but the nesting is reset by the
|
850
|
+
# interposition of other tags. For instance, a <tr> tag should
|
851
|
+
# implicitly close the previous <tr> tag within the same <table>,
|
852
|
+
# but not close a <tr> tag in another table.
|
853
|
+
#
|
854
|
+
# <table><tr>Blah<tr>Blah
|
855
|
+
# should be transformed into:
|
856
|
+
# <table><tr>Blah</tr><tr>Blah
|
857
|
+
# but,
|
858
|
+
# <tr>Blah<table><tr>Blah
|
859
|
+
# should NOT be transformed into
|
860
|
+
# <tr>Blah<table></tr><tr>Blah
|
861
|
+
#
|
862
|
+
#Differing assumptions about tag nesting rules are a major source
|
863
|
+
#of problems with the BeautifulSoup class. If BeautifulSoup is not
|
864
|
+
#treating as nestable a tag your page author treats as nestable,
|
865
|
+
#try writing a subclass.
|
866
|
+
class BeautifulSoup < BeautifulStoneSoup
|
867
|
+
|
868
|
+
@@self_closing_tags.replace(build_tag_map(nil, ['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame']))
|
869
|
+
|
870
|
+
@@quote_tags = {'script' => nil}
|
871
|
+
|
872
|
+
#According to the HTML standard, each of these inline tags can
|
873
|
+
#contain another tag of the same type. Furthermore, it's common
|
874
|
+
#to actually use these tags this way.
|
875
|
+
@@nestable_inline_tags = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center']
|
876
|
+
|
877
|
+
#According to the HTML standard, these block tags can contain
|
878
|
+
#another tag of the same type. Furthermore, it's common
|
879
|
+
#to actually use these tags this way.
|
880
|
+
@@nestable_block_tags = ['blockquote', 'div', 'fieldset', 'ins', 'del']
|
881
|
+
|
882
|
+
#Lists can contain other lists, but there are restrictions.
|
883
|
+
@@nestable_list_tags = { 'ol' => [],
|
884
|
+
'ul' => [],
|
885
|
+
'li' => ['ul', 'ol'],
|
886
|
+
'dl' => [],
|
887
|
+
'dd' => ['dl'],
|
888
|
+
'dt' => ['dl'] }
|
889
|
+
|
890
|
+
#Tables can contain other tables, but there are restrictions.
|
891
|
+
@@nestable_table_tags = {'table' => ['tr', 'td'],
|
892
|
+
'tr' => ['table'],
|
893
|
+
'td' => ['tr'],
|
894
|
+
'th' => ['tr'],
|
895
|
+
}
|
896
|
+
|
897
|
+
@@non_nestable_block_tags = ['address', 'form', 'p', 'pre']
|
898
|
+
|
899
|
+
#If one of these tags is encountered, all tags up to the next tag of
|
900
|
+
#this type are popped.
|
901
|
+
@@reset_nesting_tags.replace(build_tag_map(nil, @@nestable_block_tags, 'noscript', @@non_nestable_block_tags,
|
902
|
+
@@nestable_list_tags, @@nestable_table_tags))
|
903
|
+
|
904
|
+
@@nestable_tags.replace(build_tag_map([], @@nestable_inline_tags, @@nestable_block_tags, @@nestable_list_tags, @@nestable_table_tags))
|
905
|
+
|
906
|
+
end
|
907
|
+
|
908
|
+
# This class will push a tag with only a single string child into
|
909
|
+
# the tag's parent as an attribute. The attribute's name is the tag
|
910
|
+
# name, and the value is the string child. An example should give
|
911
|
+
# the flavor of the change:
|
912
|
+
#
|
913
|
+
# <foo><bar>baz</bar></foo>
|
914
|
+
# =>
|
915
|
+
# <foo bar="baz"><bar>baz</bar></foo>
|
916
|
+
#
|
917
|
+
# You can then access fooTag['bar'] instead of fooTag.barTag.string.
|
918
|
+
#
|
919
|
+
# This is, of course, useful for scraping structures that tend to
|
920
|
+
# use subelements instead of attributes, such as SOAP messages. Note
|
921
|
+
# that it modifies its input, so don't print the modified version
|
922
|
+
# out.
|
923
|
+
class BeautifulSOAP < BeautifulStoneSoup
|
924
|
+
def pop_tag
|
925
|
+
if @tag_stack.size > 1
|
926
|
+
tag = @tag_stack[-1]
|
927
|
+
parent = @tag_stack[-2]
|
928
|
+
if (tag.is_a?(Tag) && tag.contents.size == 1 && \
|
929
|
+
tag.contents[0].is_a?(NavigableString) && !parent[tag.name])
|
930
|
+
parent[tag.name] = tag.contents[0]
|
931
|
+
end
|
932
|
+
super
|
933
|
+
end
|
934
|
+
end
|
935
|
+
end
|
936
|
+
|
937
|
+
#Enterprise class names! It has come to our attention that some people
|
938
|
+
#think the names of the Rubyful Soup parser classes are too silly
|
939
|
+
#and "unprofessional" for use in enterprise screen-scraping. We feel
|
940
|
+
#your pain! For such-minded folk, the Rubyful Soup Consortium And
|
941
|
+
#Rootin' Tootin' Texas Delicatessen recommends renaming this file to
|
942
|
+
#"RobustParser.rb" (or, in cases of extreme enterprisitude,
|
943
|
+
#"RobustParserBeanInterface.class") and using the following
|
944
|
+
#enterprise-friendly class aliases:
|
945
|
+
class RobustXMLParser < BeautifulStoneSoup; end
|
946
|
+
class RobustHTMLParser < BeautifulSoup; end
|
947
|
+
class SimplifyingSOAPParser < BeautifulSOAP; end
|
948
|
+
|
949
|
+
print BeautifulSoup.new(ARGF.read).prettify if $0 == __FILE__
|
950
|
+
|