rubyful_soup_2011 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rubyful_soup.rb +950 -0
- data/tests/rubyful_soup_tests.rb +441 -0
- metadata +57 -0
data/lib/rubyful_soup.rb
ADDED
@@ -0,0 +1,950 @@
|
|
1
|
+
#Rubyful Soup
|
2
|
+
#Elixir and Tonic
|
3
|
+
#"The Screen-Scraper's Friend"
|
4
|
+
#v1.0.4
|
5
|
+
#http://www.crummy.com/software/RubyfulSoup/
|
6
|
+
#
|
7
|
+
#Rubyful Soup is a port to the Ruby language and idiom of the Python
|
8
|
+
#library Beautiful Soup.
|
9
|
+
#See http://www.crummy.com/software/BeautifulSoup/ for details on the original.
|
10
|
+
|
11
|
+
#This library requires the sgml-parser library, written by Takahiro
|
12
|
+
#Maebashi. The easiest way to get it is to install the "htmltools"
|
13
|
+
#gem.
|
14
|
+
require 'html/sgml-parser'
|
15
|
+
require 'set'
|
16
|
+
|
17
|
+
#UTF-8 voodoo--does this really work?
|
18
|
+
$KCODE = 'u'
|
19
|
+
# require 'jcode'
|
20
|
+
|
21
|
+
#This code makes SGMLParser able to parse XML with namespaces.
|
22
|
+
class HTML::SGMLParser
|
23
|
+
if const_defined? :Tagfind
|
24
|
+
remove_const(:Tagfind)
|
25
|
+
Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
module PageElement
|
30
|
+
|
31
|
+
attr_reader :parser
|
32
|
+
attr_accessor :parent, :previous_parsed, :next_parsed, :previous_sibling
|
33
|
+
attr_accessor :next_sibling
|
34
|
+
|
35
|
+
#Initialises the navigation links of this element.
#
#parent::          the element that will contain this one (or nil).
#previous_parsed:: the element parsed immediately before this one (or nil).
#
#All forward links start out nil. If the parent already has children,
#the last of them becomes this element's previous sibling and is told
#that this element is its next sibling.
def setup(parent=nil, previous_parsed=nil)
  @parent, @previous_parsed = parent, previous_parsed
  @next_parsed = @previous_sibling = @next_sibling = nil

  return nil unless @parent
  siblings = @parent.contents
  return nil if siblings.empty?

  #Link up with the parent's current last child.
  @previous_sibling = siblings.last
  @previous_sibling.next_sibling = self
end
|
46
|
+
|
47
|
+
#A bunch of different iterators over a parsed document.
|
48
|
+
{
|
49
|
+
#Iterates in parse order over the rest of the items in this document.
|
50
|
+
:next_parsed_items => :next_parsed,
|
51
|
+
|
52
|
+
#Iterates in reverse parse order over all previously parsed items in
|
53
|
+
#this document.
|
54
|
+
:previous_parsed_items => :previous_parsed,
|
55
|
+
|
56
|
+
#Iterates in parse order over all subsequent siblings of this item.
|
57
|
+
:next_siblings => :next_sibling,
|
58
|
+
|
59
|
+
#Iterates in reverse parse order over all prior siblings of this item.
|
60
|
+
:previous_siblings => :previous_sibling,
|
61
|
+
|
62
|
+
#Iterates upwards through the parentage of this item.
|
63
|
+
:parents => :parent
|
64
|
+
}.each do |k,v|
|
65
|
+
class_eval %{
|
66
|
+
def #{k}
|
67
|
+
i = self
|
68
|
+
while i
|
69
|
+
i = i.#{v}
|
70
|
+
yield i if i
|
71
|
+
end
|
72
|
+
end
|
73
|
+
}
|
74
|
+
end
|
75
|
+
|
76
|
+
#Generates a pair of finder methods for each document iterator. The
#singular form returns the first matching item (or nil); the plural
#form returns every matching item as a list. Both accept a name
#criterion, an options hash and/or a block, and hand them to fetch.
#
#fetch reads its options by symbol key, so the singular form passes
#:limit => 1 (a string key is never seen by fetch and the whole document
#would be scanned). The options hash is merged rather than modified so
#the caller's hash is left untouched.
[ #Returns first item/all items matching the given criteria and
  #appearing after this PageElement in the document.
  [:find_next, :find_all_next, 'next_parsed_items'],

  #Returns first item/all items matching the given criteria and
  #appearing before this PageElement in the document.
  [:find_previous, :find_all_previous, 'previous_parsed_items'],

  #Returns the nearest sibling/all siblings of this PageElement matching
  #the given criteria and appearing before this PageElement in
  #the document.
  [:find_previous_sibling, :find_previous_siblings, 'previous_siblings'],

  #Returns the nearest sibling/all siblings of this PageElement matching
  #the given criteria and appearing after this PageElement in
  #the document
  [:find_next_sibling, :find_next_siblings, 'next_siblings'],

  #Returns the nearest parent/all parents of this PageElement matching
  #the given criteria.
  [:find_parent, :find_parents, 'parents'],
].each do |singular, plural, method_name|
  class_eval %{
    def #{singular}(name=nil, args={}, &block)
      fetch(method('#{method_name}'), name, args.merge(:limit => 1), block)[0]
    end

    def #{plural}(name=nil, args={}, &block)
      fetch(method('#{method_name}'), name, args, block)
    end
  }
end
|
109
|
+
|
110
|
+
protected
|
111
|
+
|
112
|
+
#Returns a list of items matching the given criteria, obtained by
|
113
|
+
#iterating over the given iterator.
|
114
|
+
def fetch(iterator, name, args, block)
|
115
|
+
attrs = args[:attrs]
|
116
|
+
limit = args[:limit]
|
117
|
+
text = args[:text]
|
118
|
+
|
119
|
+
attrs ||= {}
|
120
|
+
if attrs != nil and not attrs.respond_to? :keys
|
121
|
+
attrs = {'class' => attrs}
|
122
|
+
end
|
123
|
+
bucket = []
|
124
|
+
catch(:stop_iteration) do
|
125
|
+
iterator.call do |item|
|
126
|
+
match = false
|
127
|
+
if block
|
128
|
+
match = true if block.call(item)
|
129
|
+
elsif item.is_a? Tag
|
130
|
+
#A tag matches if its name matches and its attributes line up.
|
131
|
+
if not text and (not name or PageElement.matches(item, name))
|
132
|
+
match = true
|
133
|
+
attrs.each_pair do |attr, matchAgainst|
|
134
|
+
check = item[attr]
|
135
|
+
unless PageElement.matches(check, matchAgainst)
|
136
|
+
match = false
|
137
|
+
break
|
138
|
+
end
|
139
|
+
end
|
140
|
+
end
|
141
|
+
elsif text
|
142
|
+
#A text matches if its string value matches the given text
|
143
|
+
#criterion.
|
144
|
+
match = PageElement.matches(item, text)
|
145
|
+
end
|
146
|
+
if match
|
147
|
+
bucket.push(item)
|
148
|
+
if limit and bucket.length >= limit
|
149
|
+
throw :stop_iteration
|
150
|
+
end
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
return bucket
|
155
|
+
end
|
156
|
+
|
157
|
+
#Used to tell whether a Tag or a NavigableString "matches" some data
|
158
|
+
#structure.
|
159
|
+
#Used to tell whether a Tag or a NavigableString "matches" some data
#structure.
#
#chunk::        a Tag, a string, a list of page elements, or any other
#               object (converted with to_s).
#how_to_match:: the criterion:
#               * Proc   - called with chunk itself (a Tag is passed as a
#                 Tag, not as its name); its result is returned as-is.
#               * Regexp - true if it matches the string form of chunk.
#               * Array  - true if any element equals the string form.
#               * Hash   - true if the string form is one of its keys.
#               * other  - true if its to_s equals the string form.
#
#When chunk is a list, the result is true if any NavigableString in the
#list matches. For every criterion except a Proc, a Tag is matched by
#its name.
def PageElement.matches(chunk, how_to_match)
  if chunk.is_a? Array
    chunk.each do |tag|
      return true if tag.is_a? NavigableString and matches(tag, how_to_match)
    end
    return false
  elsif how_to_match.is_a? Proc
    return how_to_match.call(chunk)
  elsif chunk.is_a? Tag
    #Custom match methods take the tag as an argument, but all other
    #ways of matching match the tag name as a string
    chunk = chunk.name
  end

  #From here on we compare against a string.
  chunk = chunk.to_s unless chunk.is_a? String

  case how_to_match
  when Regexp
    how_to_match.match(chunk) != nil
  when Array
    how_to_match.include?(chunk)
  when Hash
    #Test for the key itself: maps in this file commonly carry nil
    #values, which a value lookup would report as "no match".
    how_to_match.has_key?(chunk)
  else
    #It's just a string
    how_to_match.to_s == chunk
  end
end
|
192
|
+
|
193
|
+
end
|
194
|
+
|
195
|
+
module TagModule
|
196
|
+
|
197
|
+
include Enumerable
|
198
|
+
include PageElement
|
199
|
+
|
200
|
+
attr_accessor :name, :contents, :attrs, :string
|
201
|
+
|
202
|
+
#I tried to have Tag subclass Method, but it killed the
|
203
|
+
#whole thing. Maybe I should just leave well enough alone.
|
204
|
+
#
|
205
|
+
#def arity
|
206
|
+
# return methods('find_all').arity
|
207
|
+
#end
|
208
|
+
#
|
209
|
+
#def call(*args)
|
210
|
+
# return find_all(*args)
|
211
|
+
#end
|
212
|
+
#
|
213
|
+
#def to_proc
|
214
|
+
# return methods('find_all').to_proc
|
215
|
+
#end
|
216
|
+
|
217
|
+
def initialize(parser, name, attr_list=[], parent=nil, previous=nil)
|
218
|
+
@hidden = false
|
219
|
+
@parser = parser
|
220
|
+
@name = name
|
221
|
+
@attr_list = attr_list
|
222
|
+
@attrs = nil
|
223
|
+
@contents = []
|
224
|
+
setup(parent, previous)
|
225
|
+
end
|
226
|
+
|
227
|
+
# Turn the list of attributes into a hash on demand, so we don't have
|
228
|
+
# to do it for every tag while parsing.
|
229
|
+
|
230
|
+
#Returns this tag's attributes as a hash of name => value, building it
#from the raw attribute list on first use and caching it afterwards.
#A value wrapped in double quotes has the surrounding quotes removed
#(the pair in the raw list is updated as well). Once the hash exists
#the raw list is discarded.
def attrs
  return @attrs if @attrs

  table = {}
  @attr_list.each do |pair|
    value = pair[1]
    pair[1] = value[1..-2] if value[0] == ?" and value[-1] == ?"
    table[pair[0]] = pair[1]
  end
  @attr_list = nil
  @attrs = table
end
|
243
|
+
|
244
|
+
#soup.title_tag, or soup.title, is the same as soup.find('title')
|
245
|
+
def method_missing(name, *args)
|
246
|
+
#puts "Missing method #{name} for #{self.class.name}"
|
247
|
+
name = name.to_s
|
248
|
+
if name[-4...name.length] == '_tag'
|
249
|
+
name = name[0...name.length-4]
|
250
|
+
end
|
251
|
+
return find(name, *args)
|
252
|
+
end
|
253
|
+
|
254
|
+
def [](k)
|
255
|
+
attrs[k]
|
256
|
+
end
|
257
|
+
|
258
|
+
def []=(k, v)
|
259
|
+
attrs[k] = v
|
260
|
+
end
|
261
|
+
|
262
|
+
def delete(k)
|
263
|
+
attrs.delete(k)
|
264
|
+
end
|
265
|
+
|
266
|
+
#Returns true if this tag has an attribute named k, false otherwise.
def has_key?(k)
  attrs.has_key?(k)
end
|
269
|
+
|
270
|
+
def each
|
271
|
+
@contents.each { |x| yield x }
|
272
|
+
end
|
273
|
+
|
274
|
+
def length
|
275
|
+
return contents.length
|
276
|
+
end
|
277
|
+
alias size length
|
278
|
+
|
279
|
+
def self_closing?
|
280
|
+
return @parser.self_closing_tag?(@name)
|
281
|
+
end
|
282
|
+
|
283
|
+
#Adds the given tag to the contents of this tag
|
284
|
+
def append(tag)
|
285
|
+
@contents.push(tag)
|
286
|
+
end
|
287
|
+
|
288
|
+
def to_str
|
289
|
+
return to_s
|
290
|
+
end
|
291
|
+
|
292
|
+
#Renders this tag and its contents as a pretty-printed string.
|
293
|
+
def prettify
|
294
|
+
return to_s(true)
|
295
|
+
end
|
296
|
+
|
297
|
+
def inspect
|
298
|
+
to_s
|
299
|
+
end
|
300
|
+
|
301
|
+
#Renders this tag and its contents as a string. NOTE: since REXML
|
302
|
+
#consumes whitespace, this method is not certain to reproduce the
|
303
|
+
#whitespace present in the original string.
|
304
|
+
def to_s(show_structure_indent=nil)
|
305
|
+
attr_strings = []
|
306
|
+
attrs.each { |k,v| attr_strings << %{#{k}="#{v}"} if v }
|
307
|
+
if self_closing?
|
308
|
+
close = ' /'
|
309
|
+
closeTag = nil
|
310
|
+
else
|
311
|
+
close = nil
|
312
|
+
closeTag = "</#{name}>"
|
313
|
+
end
|
314
|
+
indent_increment = show_structure_indent==true ? 0 : show_structure_indent
|
315
|
+
if show_structure_indent
|
316
|
+
indent_increment += 1 unless @hidden
|
317
|
+
end
|
318
|
+
contents = render_contents(indent_increment)
|
319
|
+
space = "\n #{' ' * indent_increment}" if show_structure_indent
|
320
|
+
if @hidden
|
321
|
+
s = contents
|
322
|
+
else
|
323
|
+
s = []
|
324
|
+
attribute_string = ''
|
325
|
+
unless attr_strings.empty?
|
326
|
+
attribute_string = ' ' + attr_strings.join(' ')
|
327
|
+
end
|
328
|
+
s.push(space) if show_structure_indent
|
329
|
+
s.push("<#{@name}#{attribute_string}#{close}>")
|
330
|
+
s.push(contents)
|
331
|
+
s.push(space) if closeTag and show_structure_indent
|
332
|
+
s.push(closeTag)
|
333
|
+
s = s.join('')
|
334
|
+
end
|
335
|
+
return s
|
336
|
+
end
|
337
|
+
|
338
|
+
#Renders the contents of this tag as a string.
|
339
|
+
#Renders the children of this tag, in order, as one string.
#
#show_structure_indent:: passed through to each child Tag's to_s; when
#                        set (any non-nil, non-false value) a trailing
#                        line break is also chomped from every rendered
#                        child.
def render_contents(show_structure_indent=nil)
  pieces = @contents.inject([]) do |acc, child|
    rendered = child.is_a?(Tag) ? child.to_s(show_structure_indent) : child.to_s
    if rendered
      rendered.chomp! if show_structure_indent
      acc << rendered
    end
    acc
  end
  pieces.join('')
end
|
357
|
+
|
358
|
+
def recursive_children
|
359
|
+
stack = [[self, 0]]
|
360
|
+
catch(:stop_iteration) do
|
361
|
+
until stack.empty?
|
362
|
+
tag, start = stack.pop
|
363
|
+
for i in start...tag.contents.length
|
364
|
+
a = tag.contents[i]
|
365
|
+
yield a
|
366
|
+
if a.is_a? TagModule and not tag.contents.empty? and i < tag.contents.length
|
367
|
+
stack.push([tag, i+1])
|
368
|
+
stack.push([a, 0])
|
369
|
+
break
|
370
|
+
end
|
371
|
+
end if tag.is_a? TagModule
|
372
|
+
end
|
373
|
+
end
|
374
|
+
end
|
375
|
+
|
376
|
+
#Iterates over the direct children of this Tag.
|
377
|
+
def children
|
378
|
+
catch(:stop_iteration) { @contents.each { |x| yield x } }
|
379
|
+
end
|
380
|
+
|
381
|
+
#Convenience method to retrieve the first piece of text matching the
|
382
|
+
#given criteria. 'text' can be a string, a regular expression object,
|
383
|
+
#a Proc that takes a string and returns whether or not the
|
384
|
+
#string 'matches', etc.
|
385
|
+
#Convenience method: returns the first piece of text below this tag
#that matches, or nil. 'text' may be anything fetch accepts as a text
#criterion (string, regular expression, Proc, ...); a block may be given
#instead. The options are built here and never carry :recursive, so the
#search always walks the whole subtree.
def find_text(text=nil, &block)
  criteria = { :text => text, :limit => 1 }
  found = fetch(method('recursive_children'), nil, criteria, block)
  found[0]
end
|
390
|
+
|
391
|
+
#Convenience method to retrieve all pieces of text matching the
|
392
|
+
#given criteria. 'text' can be a string, a regular expression object,
|
393
|
+
#a callable that takes a string and returns whether or not the
|
394
|
+
#string 'matches', etc.
|
395
|
+
#Args: :limit
|
396
|
+
#Convenience method to retrieve all pieces of text matching the given
#criteria. 'text' can be a string, a regular expression object, a Proc
#that takes a string and returns whether or not the string 'matches',
#etc. A block may be supplied instead.
#
#Args: :limit, :recursive (pass false to look at direct children only)
#
#The criterion is stored under the :text symbol because that is the key
#fetch reads. The caller's options hash is not modified.
def find_all_text(text=nil, args={}, &block)
  args = args.merge(:text => text)
  iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
  fetch(iterator, nil, args, block)
end
|
401
|
+
|
402
|
+
#Extracts a list of Tag objects that match the given criteria. You
|
403
|
+
#can specify the name of the Tag and any attributes you want the Tag
|
404
|
+
#to have.
|
405
|
+
#
|
406
|
+
#The value of a key-value pair in the 'attrs' map can be a string, a
|
407
|
+
#list of strings, a regular expression object, or a Proc object that
|
408
|
+
#takes a string and returns whether or not the string matches for
|
409
|
+
#some custom definition of 'matches'. The same is true of the tag
|
410
|
+
#name, except that a Proc object will be passed the Tag object instead
|
411
|
+
#of just a string.
|
412
|
+
#Args: :attrs :text :limit :recursive
|
413
|
+
#Returns every Tag or NavigableString below this tag that satisfies the
#given name, options and/or block (see fetch for how they are applied).
#Args: :attrs :text :limit :recursive
#With :recursive => false only direct children are examined; otherwise
#the whole subtree is walked.
def find_all(name=nil, args={}, &block)
  walker = if args[:recursive] == false
    'children'
  else
    'recursive_children'
  end
  fetch(method(walker), name, args, block)
end
|
417
|
+
|
418
|
+
#Returns the first Tag or NavigableString object that matches the
|
419
|
+
#given criteria. Takes much the same arguments as fetch.
|
420
|
+
#args: :attrs :text :limit :recursive
|
421
|
+
#Returns the first Tag or NavigableString object that matches the
#given criteria, or nil if nothing matches. Takes the same arguments as
#find_all.
#args: :attrs :text :recursive (:limit is always forced to 1)
#
#The limit is applied to a copy of the options so that a hash the
#caller reuses for a later find_all is not silently limited to one
#result.
def find(name=nil, args={}, &block)
  args = args.merge(:limit => 1)
  iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
  fetch(iterator, name, args, block)[0]
end
|
426
|
+
end
|
427
|
+
|
428
|
+
class Tag
|
429
|
+
include TagModule
|
430
|
+
end
|
431
|
+
|
432
|
+
class NavigableString < String
|
433
|
+
include PageElement
|
434
|
+
end
|
435
|
+
|
436
|
+
#This class contains the basic parser and fetch code. It defines
|
437
|
+
#a parser that knows nothing about tag behavior except for the
|
438
|
+
#following:
|
439
|
+
#
|
440
|
+
#You can't close a tag without closing all the tags it encloses.
|
441
|
+
#That is, "<foo><bar></foo>" actually means
|
442
|
+
#"<foo><bar></bar></foo>".
|
443
|
+
#
|
444
|
+
#[Another possible explanation is "<foo><bar /></foo>", but since
|
445
|
+
# this class defines no self_closing_tags, it will never use that
|
446
|
+
# explanation.]
|
447
|
+
#
|
448
|
+
#This class is useful for parsing XML or made-up markup languages,
|
449
|
+
#or when BeautifulSoup makes an assumption counter to what you were
|
450
|
+
#expecting.
|
451
|
+
class BeautifulStoneSoup < HTML::SGMLParser
|
452
|
+
include TagModule
|
453
|
+
|
454
|
+
#As a public service we will by default silently replace MS smart quotes
|
455
|
+
#and similar characters with their HTML or ASCII equivalents.
|
456
|
+
#Replacement table used by the parser massage step: each key is a
#single byte in the range 0x80-0x9f and each value is the text
#substituted for it. Every key must be a double-quoted literal so that
#the \x escape is interpreted as a byte; a single-quoted '\x80' would be
#the four characters backslash, x, 8, 0 and would never be looked up.
@@ms_chars = { "\x80" => '€',
  "\x81" => ' ',
  "\x82" => '‚',
  "\x83" => 'ƒ',
  "\x84" => '„',
  "\x85" => '…',
  "\x86" => '†',
  "\x87" => '‡',
  "\x88" => '⁁',
  "\x89" => '%',
  "\x8A" => 'Š',
  "\x8B" => '<',
  "\x8C" => 'Œ',
  "\x8D" => '?',
  "\x8E" => 'Z',
  "\x8F" => '?',
  "\x90" => '?',
  "\x91" => '‘',
  "\x92" => '’',
  "\x93" => '“',
  "\x94" => '”',
  "\x95" => '•',
  "\x96" => '–',
  "\x97" => '—',
  "\x98" => '˜',
  "\x99" => '™',
  "\x9a" => 'š',
  "\x9b" => '>',
  "\x9c" => 'œ',
  "\x9d" => '?',
  "\x9e" => 'z',
  "\x9f" => 'Ÿ'}
|
488
|
+
|
489
|
+
@@parser_massage = [[/<([^<>]*)\/>/, '<\1></\1>'],
|
490
|
+
[/<!\s+([^<>]*)>/, '<!\1>'],
|
491
|
+
[/([\x80-\x9f])/m, proc { |m| @@ms_chars[m]}]
|
492
|
+
]
|
493
|
+
|
494
|
+
@@rootTagName = '[document]'
|
495
|
+
|
496
|
+
@@nestable_tags = {}
|
497
|
+
@@reset_nesting_tags = {}
|
498
|
+
@@quoteTags = {}
|
499
|
+
@@self_closing_tags = {}
|
500
|
+
|
501
|
+
attr_accessor :hidden
|
502
|
+
|
503
|
+
def self_closing_tag?(tag)
|
504
|
+
@@self_closing_tags.has_key?(tag)
|
505
|
+
end
|
506
|
+
|
507
|
+
#Args: :initial_text_is_everything, :avoid_parser_problems, :parse_only_these
|
508
|
+
#Creates the root of a parse tree and, by default, parses the given
#text straight away.
#
#text:: the markup to parse; nil feeds nothing.
#args:: options:
#  :parse_only_these::           a tag name, or a collection of tag
#                                names; stored as a Set that the tag
#                                handlers consult.
#  :avoid_parser_problems::      true or absent uses the built-in
#                                @@parser_massage fixes; an Enumerable
#                                of [regexp, replacement] pairs is used
#                                in their place; false switches the
#                                massage step off.
#  :initial_text_is_everything:: unless explicitly false, done is called
#                                after the text has been fed.
def initialize(text, args={})
  super(self, @@rootTagName)
  @quote_stack = []
  @hidden = 1

  wanted = args[:parse_only_these]
  if wanted
    @parse_only_these = Set.new
    if wanted.respond_to? :each
      wanted.each { |x| @parse_only_these << x }
    else
      @parse_only_these << wanted
    end
  else
    @parse_only_these = nil
  end
  reset

  #Only a missing (nil) option falls back to the default; an explicit
  #false must survive so that callers can disable the massage step.
  massage = args[:avoid_parser_problems]
  massage = true if massage.nil?
  massage = @@parser_massage if massage and not massage.is_a? Enumerable
  @avoid_parser_problems = massage

  feed(text) if text != nil
  done if args[:initial_text_is_everything] != false
end
|
532
|
+
|
533
|
+
def feed(text)
|
534
|
+
if @avoid_parser_problems
|
535
|
+
#before = text.clone
|
536
|
+
@avoid_parser_problems.each do |re, fix|
|
537
|
+
if fix.is_a? String
|
538
|
+
text.gsub!(re, fix)
|
539
|
+
else
|
540
|
+
text.gsub!(re) { |x| fix.call(x) }
|
541
|
+
end
|
542
|
+
end
|
543
|
+
#if before != text
|
544
|
+
# puts "Changed from #{before} to #{text}"
|
545
|
+
#end
|
546
|
+
end
|
547
|
+
super
|
548
|
+
end
|
549
|
+
|
550
|
+
def ==(anObject)
|
551
|
+
return anObject != nil && anObject.to_s == to_s
|
552
|
+
end
|
553
|
+
|
554
|
+
def done
|
555
|
+
end_text
|
556
|
+
pop_tag while @currentTag.name != @@rootTagName
|
557
|
+
end
|
558
|
+
|
559
|
+
def reset
|
560
|
+
super
|
561
|
+
@currentText = []
|
562
|
+
@currentTag = nil
|
563
|
+
@tag_stack = []
|
564
|
+
push_tag(self)
|
565
|
+
end
|
566
|
+
|
567
|
+
def push_tag(tag)
|
568
|
+
#puts "Push #{ tag.name }"
|
569
|
+
@currentTag.append(tag) if @currentTag
|
570
|
+
@tag_stack.push(tag)
|
571
|
+
@currentTag = @tag_stack[-1]
|
572
|
+
end
|
573
|
+
|
574
|
+
def pop_tag
|
575
|
+
tag = @tag_stack.pop
|
576
|
+
#puts "Pop #{ tag.name }"
|
577
|
+
|
578
|
+
# Tags with just one string-owning child get the child as a
|
579
|
+
# 'string' property, so that soup.tag.string is shorthand for
|
580
|
+
# soup.tag.contents[0]
|
581
|
+
if @currentTag.contents.length == 1 and @currentTag.contents[0].is_a? NavigableString
|
582
|
+
@currentTag.string = @currentTag.contents[0]
|
583
|
+
end
|
584
|
+
|
585
|
+
@currentTag = @tag_stack[-1] unless @tag_stack.empty?
|
586
|
+
@currentTag
|
587
|
+
end
|
588
|
+
|
589
|
+
# StreamListener implementation
|
590
|
+
|
591
|
+
#Parser callback for an opening tag.
#
#name::  the tag name.
#attrs:: the tag's attributes as a list of [name, value] pairs.
#
#While a quote tag is open (the quote stack is not empty) the tag is not
#real markup: it is rebuilt as text and passed to handle_data. Otherwise
#any pending text is flushed, the tag stack is adjusted with smart_pop
#(unless the tag is self-closing), and a new Tag is created, linked into
#the parse order and pushed. Self-closing tags are popped again at once.
#A tag listed in @@quoteTags opens a new quoted section.
def unknown_starttag(name, attrs)
  unless @quote_stack.empty?
    #This is not a real tag. Double-quoted/%{} literals are required
    #here so that the name and attributes are actually interpolated.
    attr_strings = attrs.map { |k,v| %{#{k}="#{v}"} }
    attr_text = attr_strings.empty? ? '' : ' ' + attr_strings.join(' ')
    handle_data("<#{name}#{attr_text}>")
    return
  end

  end_text

  return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)
  self_closing = self_closing_tag?(name)
  smart_pop(name) unless self_closing
  tag = Tag.new(self, name, attrs, @currentTag, @previous_parsed)
  @previous_parsed.next_parsed = tag if @previous_parsed
  @previous_parsed = tag
  push_tag(tag)
  pop_tag if self_closing
  @quote_stack.push(name) if @@quoteTags.has_key?(name)
end
|
619
|
+
|
620
|
+
#Parser callback for a closing tag.
#
#End tags of self-closing tags are ignored; those tags were closed when
#they were opened. While a quote tag is open, any end tag other than the
#one that closes the quoted section is not real markup and is passed to
#handle_data as text. Otherwise pending text is flushed, the tag stack
#is popped back to (and including) the named tag, and a matching entry
#is removed from the quote stack.
def unknown_endtag(name)
  return if self_closing_tag?(name)

  if not @quote_stack.empty? and @quote_stack[-1] != name
    #This is not a real end tag. The literal must be double-quoted so
    #that the tag name is interpolated.
    handle_data("</#{name}>")
    return
  end

  return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)

  end_text
  pop_to_tag(name)
  @quote_stack.pop if not @quote_stack.empty? and @quote_stack[-1] == name
end
|
640
|
+
|
641
|
+
def handle_data(data)
|
642
|
+
return unless !@parse_only_these or @tag_stack.size > 1
|
643
|
+
@currentText.push(data)
|
644
|
+
end
|
645
|
+
|
646
|
+
#Propagate comments right through.
|
647
|
+
def handle_comment(data)
|
648
|
+
handle_data("<!--#{data}-->")
|
649
|
+
end
|
650
|
+
|
651
|
+
def handle_special(data)
|
652
|
+
handle_data("<#{data}>")
|
653
|
+
end
|
654
|
+
|
655
|
+
#Propagates a numeric character reference the parser did not resolve.
#ref is the reference without its delimiters (e.g. "8212"); it is
#re-emitted as text in the form "&#8212;".
def unknown_charref(ref)
  handle_data("&##{ref};")
end
|
658
|
+
|
659
|
+
#Propagates a named entity reference the parser did not resolve.
#ref is the entity name without its delimiters (e.g. "nbsp"); it is
#re-emitted as text in the form "&nbsp;".
def unknown_entityref(ref)
  handle_data("&#{ref};")
end
|
662
|
+
|
663
|
+
def attlistdecl(element_name, attributes, raw_content)
|
664
|
+
handle_data("<!ATTLIST #{raw_content}>")
|
665
|
+
end
|
666
|
+
|
667
|
+
#Propagates a CDATA section as text, wrapped in its original
#"<![CDATA[" ... "]]>" delimiters.
def cdata(content)
  handle_data("<![CDATA[#{content}]]>")
end
|
670
|
+
|
671
|
+
###
|
672
|
+
|
673
|
+
def doctype(*args)
|
674
|
+
content = args.join(' ')
|
675
|
+
##{name} #{pub_sys}#{long_name}#{url}
|
676
|
+
#long_name = ' "#{long_name}"' if long_name
|
677
|
+
#url = ' "#{url}"' if url
|
678
|
+
handle_data("<!DOCTYPE #{content}>")
|
679
|
+
end
|
680
|
+
|
681
|
+
def elementdecl(content)
|
682
|
+
handle_data("<!ELEMENT #{content}>")
|
683
|
+
end
|
684
|
+
|
685
|
+
def entity(content)
|
686
|
+
|
687
|
+
end
|
688
|
+
|
689
|
+
def entitydecl(content)
|
690
|
+
handle_data("<!ENTITY #{content.join(' ')}>")
|
691
|
+
end
|
692
|
+
|
693
|
+
def instruction(name, instruction)
|
694
|
+
handle_data("<?#{name} #{instruction}>")
|
695
|
+
end
|
696
|
+
|
697
|
+
def notationdecl(content)
|
698
|
+
handle_data("<!NOTATION #{content}>")
|
699
|
+
end
|
700
|
+
|
701
|
+
#Propagates an XML declaration as text.
#
#version::    inserted as the version attribute.
#encoding::   when given, inserted as an encoding attribute.
#standalone:: appended verbatim after the encoding.
#
#The literals use %{} so that the values are interpolated (single-quoted
#strings would emit the template text itself).
#NOTE(review): the emitted declaration ends in ">" rather than "?>", as
#in the original format string - confirm whether that is intended.
def xmldecl(version, encoding, standalone)
  encoding = %{ encoding="#{encoding}"} if encoding
  handle_data(%{<?xml version="#{version}"#{encoding}#{standalone}>})
end
|
705
|
+
|
706
|
+
#Called when we're done collecting some text, declarations, etc.
|
707
|
+
def end_text
|
708
|
+
currentText = @currentText.join('')
|
709
|
+
unless currentText.empty?
|
710
|
+
if currentText.strip.empty?
|
711
|
+
if currentText =~ /\n/
|
712
|
+
currentText = "\n"
|
713
|
+
else
|
714
|
+
currentText = ' '
|
715
|
+
end
|
716
|
+
end
|
717
|
+
#puts "Setting up text #{currentText}"
|
718
|
+
currentText = NavigableString.new(currentText)
|
719
|
+
currentText.setup(@currentTag, @previous_parsed)
|
720
|
+
@previous_parsed.next_parsed = currentText if @previous_parsed
|
721
|
+
@previous_parsed = currentText
|
722
|
+
@currentTag.contents.push(currentText)
|
723
|
+
end
|
724
|
+
@currentText = []
|
725
|
+
end
|
726
|
+
|
727
|
+
# Helper methods
|
728
|
+
|
729
|
+
private
|
730
|
+
|
731
|
+
#Pops the tag stack up to and including the most recent
|
732
|
+
#instance of the given tag. If inclusivePop is false, pops the tag
|
733
|
+
#stack up to but *not* including the most recent instance of
|
734
|
+
#the given tag.
|
735
|
+
def pop_to_tag(name, inclusive_pop=true)
|
736
|
+
return if name == @@rootTagName
|
737
|
+
|
738
|
+
#puts "Pop to tag #{ name }. Inclusive? #{inclusive_pop}"
|
739
|
+
num_pops = 0
|
740
|
+
mostRecentTag = nil
|
741
|
+
(@tag_stack.length-1).downto(0) do |i|
|
742
|
+
if name == @tag_stack[i].name
|
743
|
+
#puts "Found at #{i}, #{@tag_stack.length-i}"
|
744
|
+
num_pops = @tag_stack.length-i
|
745
|
+
break
|
746
|
+
end
|
747
|
+
end
|
748
|
+
num_pops -= 1 if not inclusive_pop
|
749
|
+
|
750
|
+
#puts "Popping #{num_pops} times."
|
751
|
+
num_pops.times { mostRecentTag = pop_tag }
|
752
|
+
mostRecentTag
|
753
|
+
end
|
754
|
+
|
755
|
+
#We need to pop up to the previous tag of this type, unless
|
756
|
+
#one of this tag's nesting reset triggers comes between this
|
757
|
+
#tag and the previous tag of this type, OR unless this tag is a
|
758
|
+
#generic nesting trigger and another generic nesting trigger
|
759
|
+
#comes between this tag and the previous tag of this type.
|
760
|
+
#
|
761
|
+
#Examples:
|
762
|
+
# <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
|
763
|
+
# <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
|
764
|
+
# <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
|
765
|
+
# <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
|
766
|
+
#
|
767
|
+
# <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
|
768
|
+
# <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
|
769
|
+
# <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
|
770
|
+
def smart_pop(name)
|
771
|
+
#puts "Smart pop for #{name}"
|
772
|
+
nesting_reset_triggers = @@nestable_tags[name]
|
773
|
+
is_nestable = nesting_reset_triggers != nil
|
774
|
+
is_reset_nesting = @@reset_nesting_tags.has_key?(name)
|
775
|
+
popTo = nil
|
776
|
+
inclusive = true
|
777
|
+
@tag_stack.reverse_each do |p|
|
778
|
+
if (p == nil or p.name == name) and not is_nestable
|
779
|
+
#Non-nestable tags get popped to the top or to their
|
780
|
+
#last occurance.
|
781
|
+
#puts "Non-nestable tag #{name} gets popped to its last occurance."
|
782
|
+
popTo = name
|
783
|
+
break
|
784
|
+
end
|
785
|
+
if (nesting_reset_triggers != nil and nesting_reset_triggers.include?(p.name)) or (nesting_reset_triggers == nil and is_reset_nesting and @@reset_nesting_tags.has_key?(p.name))
|
786
|
+
#If we encounter one of the nesting reset triggers
|
787
|
+
#peculiar to this tag, or we encounter another tag
|
788
|
+
#that causes nesting to reset, pop up to but not
|
789
|
+
#including that tag.
|
790
|
+
#puts "Nesting reset trigger encountered for #{name}: #{p.name}"
|
791
|
+
popTo = p.name
|
792
|
+
inclusive = false
|
793
|
+
break
|
794
|
+
end
|
795
|
+
p = p.parent
|
796
|
+
end
|
797
|
+
pop_to_tag(popTo, inclusive) if popTo
|
798
|
+
end
|
799
|
+
|
800
|
+
protected
|
801
|
+
|
802
|
+
#Turns a list of maps, lists, or scalars into a single map.
|
803
|
+
#Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
|
804
|
+
#of lists and partial maps.
|
805
|
+
def BeautifulStoneSoup.build_tag_map(default, *args)
|
806
|
+
built = args.inject({}) do |m, portion|
|
807
|
+
if portion.is_a? Hash
|
808
|
+
#It's a map. Merge it.
|
809
|
+
portion.each_pair { |k,v| m[k] = v }
|
810
|
+
elsif portion.is_a? Array
|
811
|
+
#It's a list. Map each item to the default.
|
812
|
+
portion.each { |k| m[k] = default }
|
813
|
+
else
|
814
|
+
#It's a scalar. Map it to the default.
|
815
|
+
m[portion] = default
|
816
|
+
end
|
817
|
+
m
|
818
|
+
end
|
819
|
+
end
|
820
|
+
end
|
821
|
+
|
822
|
+
#This parser knows the following facts about HTML:
|
823
|
+
#
|
824
|
+
#* Some tags have no closing tag and should be interpreted as being
|
825
|
+
# closed as soon as they are encountered.
|
826
|
+
#
|
827
|
+
#* The text inside some tags (ie. 'script') may contain tags which
|
828
|
+
# are not really part of the document and which should be parsed
|
829
|
+
# as text, not tags. If you want to parse the text as tags, you can
|
830
|
+
# always fetch it and parse it explicitly.
|
831
|
+
#
|
832
|
+
#* Tag nesting rules:
|
833
|
+
#
|
834
|
+
# Most tags can't be nested at all. For instance, the occurrence of
|
835
|
+
# a <p> tag should implicitly close the previous <p> tag.
|
836
|
+
#
|
837
|
+
# <p>Para1<p>Para2
|
838
|
+
# should be transformed into:
|
839
|
+
# <p>Para1</p><p>Para2
|
840
|
+
#
|
841
|
+
# Some tags can be nested arbitrarily. For instance, the occurrence
|
842
|
+
# of a <blockquote> tag should _not_ implicitly close the previous
|
843
|
+
# <blockquote> tag.
|
844
|
+
#
|
845
|
+
# Alice said: <blockquote>Bob said: <blockquote>Blah
|
846
|
+
# should NOT be transformed into:
|
847
|
+
# Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
|
848
|
+
#
|
849
|
+
# Some tags can be nested, but the nesting is reset by the
|
850
|
+
# interposition of other tags. For instance, a <tr> tag should
|
851
|
+
# implicitly close the previous <tr> tag within the same <table>,
|
852
|
+
# but not close a <tr> tag in another table.
|
853
|
+
#
|
854
|
+
# <table><tr>Blah<tr>Blah
|
855
|
+
# should be transformed into:
|
856
|
+
# <table><tr>Blah</tr><tr>Blah
|
857
|
+
# but,
|
858
|
+
# <tr>Blah<table><tr>Blah
|
859
|
+
# should NOT be transformed into
|
860
|
+
# <tr>Blah<table></tr><tr>Blah
|
861
|
+
#
|
862
|
+
#Differing assumptions about tag nesting rules are a major source
|
863
|
+
#of problems with the BeautifulSoup class. If BeautifulSoup is not
|
864
|
+
#treating as nestable a tag your page author treats as nestable,
|
865
|
+
#try writing a subclass.
|
866
|
+
#HTML flavour of the parser: fills the tag maps declared by
#BeautifulStoneSoup with HTML's self-closing tags and nesting rules.
class BeautifulSoup < BeautifulStoneSoup

  #Tags that are closed as soon as they are encountered.
  @@self_closing_tags.replace(build_tag_map(nil, %w(br hr input img meta spacer link frame)))

  #NOTE(review): nothing visible in this file reads @@quote_tags; the tag
  #handlers in BeautifulStoneSoup consult @@quoteTags instead. Confirm
  #whether this was meant to populate that map.
  @@quote_tags = {'script' => nil}

  #According to the HTML standard, each of these inline tags can
  #contain another tag of the same type, and pages commonly do so.
  @@nestable_inline_tags = %w(span font q object bdo sub sup center)

  #According to the HTML standard, these block tags can contain
  #another tag of the same type, and pages commonly do so.
  @@nestable_block_tags = %w(blockquote div fieldset ins del)

  #Lists can contain other lists, but there are restrictions: each
  #entry maps a tag to the tags that reset its nesting.
  @@nestable_list_tags = {
    'ol' => [],
    'ul' => [],
    'li' => %w(ul ol),
    'dl' => [],
    'dd' => %w(dl),
    'dt' => %w(dl)
  }

  #Tables can contain other tables, but there are restrictions.
  @@nestable_table_tags = {
    'table' => %w(tr td),
    'tr' => %w(table),
    'td' => %w(tr),
    'th' => %w(tr)
  }

  @@non_nestable_block_tags = %w(address form p pre)

  #If one of these tags is encountered, all tags up to the next tag of
  #this type are popped.
  @@reset_nesting_tags.replace(
    build_tag_map(nil,
                  @@nestable_block_tags,
                  'noscript',
                  @@non_nestable_block_tags,
                  @@nestable_list_tags,
                  @@nestable_table_tags))

  @@nestable_tags.replace(
    build_tag_map([],
                  @@nestable_inline_tags,
                  @@nestable_block_tags,
                  @@nestable_list_tags,
                  @@nestable_table_tags))

end
|
907
|
+
|
908
|
+
# This class will push a tag with only a single string child into
|
909
|
+
# the tag's parent as an attribute. The attribute's name is the tag
|
910
|
+
# name, and the value is the string child. An example should give
|
911
|
+
# the flavor of the change:
|
912
|
+
#
|
913
|
+
# <foo><bar>baz</bar></foo>
|
914
|
+
# =>
|
915
|
+
# <foo bar="baz"><bar>baz</bar></foo>
|
916
|
+
#
|
917
|
+
# You can then access fooTag['bar'] instead of fooTag.barTag.string.
|
918
|
+
#
|
919
|
+
# This is, of course, useful for scraping structures that tend to
|
920
|
+
# use subelements instead of attributes, such as SOAP messages. Note
|
921
|
+
# that it modifies its input, so don't print the modified version
|
922
|
+
# out.
|
923
|
+
class BeautifulSOAP < BeautifulStoneSoup
|
924
|
+
def pop_tag
|
925
|
+
if @tag_stack.size > 1
|
926
|
+
tag = @tag_stack[-1]
|
927
|
+
parent = @tag_stack[-2]
|
928
|
+
if (tag.is_a?(Tag) && tag.contents.size == 1 && \
|
929
|
+
tag.contents[0].is_a?(NavigableString) && !parent[tag.name])
|
930
|
+
parent[tag.name] = tag.contents[0]
|
931
|
+
end
|
932
|
+
super
|
933
|
+
end
|
934
|
+
end
|
935
|
+
end
|
936
|
+
|
937
|
+
#Enterprise class names! It has come to our attention that some people
|
938
|
+
#think the names of the Rubyful Soup parser classes are too silly
|
939
|
+
#and "unprofessional" for use in enterprise screen-scraping. We feel
|
940
|
+
#your pain! For such-minded folk, the Rubyful Soup Consortium And
|
941
|
+
#Rootin' Tootin' Texas Delicatessen recommends renaming this file to
|
942
|
+
#"RobustParser.rb" (or, in cases of extreme enterprisitude,
|
943
|
+
#"RobustParserBeanInterface.class") and using the following
|
944
|
+
#enterprise-friendly class aliases:
|
945
|
+
class RobustXMLParser < BeautifulStoneSoup; end
|
946
|
+
class RobustHTMLParser < BeautifulSoup; end
|
947
|
+
class SimplifyingSOAPParser < BeautifulSOAP; end
|
948
|
+
|
949
|
+
print BeautifulSoup.new(ARGF.read).prettify if $0 == __FILE__
|
950
|
+
|