rubyful_soup_2011 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,950 @@
1
+ #Rubyful Soup
2
+ #Elixir and Tonic
3
+ #"The Screen-Scraper's Friend"
4
+ #v1.0.4
5
+ #http://www.crummy.com/software/RubyfulSoup/
6
+ #
7
+ #Rubyful Soup is a port to the Ruby language and idiom of the Python
8
+ #library Beautiful Soup.
9
+ #See http://www.crummy.com/software/BeautifulSoup/ for details on the original.
10
+
11
+ #This library requires the sgml-parser library, written by Takahiro
12
+ #Maebashi. The easiest way to get it is to install the "htmltools"
13
+ #gem.
14
+ require 'html/sgml-parser'
15
+ require 'set'
16
+
# Ruby 1.8 needs $KCODE = 'u' for UTF-8-aware string and regexp handling.
# Interpreters whose strings carry their own encoding ignore $KCODE (some
# versions print a warning on assignment), so the assignment is skipped there.
$KCODE = 'u' unless ''.respond_to?(:encoding)
# require 'jcode'
20
+
# Replaces SGMLParser's Tagfind pattern with one that also accepts ':' in
# tag names, which is what lets namespaced XML tags be parsed. Nothing is
# changed when the loaded parser does not define Tagfind.
class HTML::SGMLParser
  if const_defined?(:Tagfind)
    send(:remove_const, :Tagfind)
    const_set(:Tagfind, /[a-zA-Z][-_.:a-zA-Z0-9]*/)
  end
end
28
+
# Navigation and search behaviour shared by every node of a parsed document.
#
# A node is linked to its neighbours in two ways: +parent+,
# +previous_sibling+ and +next_sibling+ describe the tree, while
# +previous_parsed+ and +next_parsed+ chain nodes in parse order.
module PageElement

  attr_reader :parser
  attr_accessor :parent, :previous_parsed, :next_parsed, :previous_sibling
  attr_accessor :next_sibling

  # Records this node's parent and parse-order predecessor and, when the
  # parent already has children, links this node as the next sibling of the
  # parent's current last child. The caller is responsible for adding the
  # node to parent.contents afterwards.
  def setup(parent=nil, previous_parsed=nil)
    @parent = parent
    @previous_parsed = previous_parsed
    @next_parsed = nil
    @previous_sibling = nil
    @next_sibling = nil
    if @parent and not @parent.contents.empty?
      @previous_sibling = @parent.contents[-1]
      @previous_sibling.next_sibling = self
    end
  end

  # A bunch of different iterators over a parsed document. Each generated
  # method follows one link repeatedly, yielding every node it reaches
  # (the starting node itself is never yielded).
  {
    # Iterates in parse order over the rest of the items in this document.
    :next_parsed_items => :next_parsed,

    # Iterates in reverse parse order over all previously parsed items in
    # this document.
    :previous_parsed_items => :previous_parsed,

    # Iterates in parse order over all subsequent siblings of this item.
    :next_siblings => :next_sibling,

    # Iterates in reverse parse order over all prior siblings of this item.
    :previous_siblings => :previous_sibling,

    # Iterates upwards through the parentage of this item.
    :parents => :parent
  }.each do |iterator_name, link|
    class_eval %{
      def #{iterator_name}
        node = #{link}
        while node
          yield node
          node = node.#{link}
        end
      end
    }
  end

  # Finder pairs built on the iterators above. The singular form returns the
  # first match (or nil); the plural form returns every match as an Array.
  # Both take (name=nil, args={}, &block) with the same meaning as fetch.
  [ # First item/all items matching the given criteria and appearing after
    # this PageElement in the document.
    [:find_next, :find_all_next, 'next_parsed_items'],

    # First item/all items matching the given criteria and appearing before
    # this PageElement in the document.
    [:find_previous, :find_all_previous, 'previous_parsed_items'],

    # Nearest sibling/all siblings matching the given criteria and appearing
    # before this PageElement in the document.
    [:find_previous_sibling, :find_previous_siblings, 'previous_siblings'],

    # Nearest sibling/all siblings matching the given criteria and appearing
    # after this PageElement in the document.
    [:find_next_sibling, :find_next_siblings, 'next_siblings'],

    # Nearest parent/all parents matching the given criteria.
    [:find_parent, :find_parents, 'parents'],
  ].each do |singular, plural, method_name|
    class_eval %{
      def #{singular}(name=nil, args={}, &block)
        # fetch reads the limit under the :limit symbol; merging into a copy
        # also keeps the caller's hash untouched.
        fetch(method('#{method_name}'), name, args.merge(:limit => 1), block)[0]
      end

      def #{plural}(name=nil, args={}, &block)
        fetch(method('#{method_name}'), name, args, block)
      end
    }
  end

  protected

  # Returns a list of items matching the given criteria, obtained by
  # iterating over the given iterator (a Method object that yields nodes).
  #
  # name  - tag-name criterion (see PageElement.matches), or nil for any tag.
  # args  - :attrs (Hash of attribute criteria, or a bare value meaning the
  #         'class' attribute), :text (criterion for text nodes) and :limit
  #         (stop after this many matches).
  # block - when given it alone decides whether an item matches.
  def fetch(iterator, name, args, block)
    attrs = args[:attrs] || {}
    limit = args[:limit]
    text = args[:text]
    attrs = {'class' => attrs} unless attrs.respond_to? :keys

    bucket = []
    catch(:stop_iteration) do
      iterator.call do |item|
        match = if block
                  block.call(item) ? true : false
                elsif item.is_a? Tag
                  # A tag matches if its name matches and its attributes
                  # line up; tags never match once :text is requested.
                  !text && (!name || PageElement.matches(item, name)) &&
                    attrs.all? { |attr, wanted| PageElement.matches(item[attr], wanted) }
                elsif text
                  # A text matches if its string value matches the criterion.
                  PageElement.matches(item, text)
                else
                  false
                end
        if match
          bucket.push(item)
          throw :stop_iteration if limit and bucket.length >= limit
        end
      end
    end
    bucket
  end

  # Used to tell whether a Tag or a NavigableString "matches" some data
  # structure.
  #
  # chunk        - a Tag, a string-like value, or an Array of nodes (true
  #                when any NavigableString in it matches).
  # how_to_match - a Proc (called with the chunk; a Tag is passed as-is), a
  #                Regexp, an Array of accepted strings, a Hash (matches when
  #                the string is a key with a non-nil value) or anything
  #                else, compared through to_s.
  def PageElement.matches(chunk, how_to_match)
    if chunk.is_a? Array
      return chunk.any? { |tag| tag.is_a?(NavigableString) && matches(tag, how_to_match) }
    elsif how_to_match.is_a? Proc
      return how_to_match.call(chunk)
    elsif chunk.is_a? Tag
      # Custom match methods take the tag as an argument, but all other
      # ways of matching match the tag name as a string.
      chunk = chunk.name
    end

    chunk = chunk.to_s unless chunk.is_a? String
    case how_to_match
    when Regexp then !how_to_match.match(chunk).nil?
    when Array  then how_to_match.include?(chunk)
    when Hash   then !how_to_match[chunk].nil?
    else how_to_match.to_s == chunk
    end
  end

end
194
+
195
+ module TagModule
196
+
197
+ include Enumerable
198
+ include PageElement
199
+
200
+ attr_accessor :name, :contents, :attrs, :string
201
+
202
+ #I tried to have Tag subclass Method, but it killed the
203
+ #whole thing. Maybe I should just leave well enough alone.
204
+ #
205
+ #def arity
206
+ # return methods('find_all').arity
207
+ #end
208
+ #
209
+ #def call(*args)
210
+ # return find_all(*args)
211
+ #end
212
+ #
213
+ #def to_proc
214
+ # return methods('find_all').to_proc
215
+ #end
216
+
# Builds an empty tag.
#
# parser    - the parser that created the tag (asked later whether the tag
#             is self-closing).
# name      - the tag name.
# attr_list - list of [name, value] pairs; turned into a hash lazily by attrs.
# parent, previous - passed straight to setup to link the tag into the tree.
def initialize(parser, name, attr_list=[], parent=nil, previous=nil)
  @parser, @name = parser, name
  @attr_list, @attrs = attr_list, nil
  @contents = []
  @hidden = false
  setup(parent, previous)
end
226
+
# Turn the list of attributes into a hash on demand, so we don't have
# to do it for every tag while parsing. The hash is built once and cached;
# the pair list is dropped afterwards.
#
# Values wrapped in double quotes have the quotes stripped. Values that are
# not strings (for example nil) are stored unchanged, and the [name, value]
# pairs received from the parser are left unmodified.
def attrs
  unless @attrs
    @attrs = {}
    (@attr_list || []).each do |key, value|
      if value.is_a?(String) and value[0, 1] == '"' and value[-1, 1] == '"'
        value = value[1..-2]
      end
      @attrs[key] = value
    end
    @attr_list = nil
  end
  @attrs
end
243
+
# soup.title_tag, or soup.title, is the same as soup.find('title'):
# any unknown method is treated as a tag name (with a trailing "_tag"
# removed) and forwarded to find together with the remaining arguments.
def method_missing(name, *args)
  tag_name = name.to_s.sub(/_tag\z/, '')
  find(tag_name, *args)
end
253
+
# Hash-style access to this tag's attributes; every method below simply
# delegates to the hash returned by attrs.

# Returns the value of attribute k, or nil when it is not set.
def [](k)
  attrs[k]
end

# Sets attribute k to v.
def []=(k, v)
  attrs[k] = v
end

# Removes attribute k and returns its former value.
def delete(k)
  attrs.delete(k)
end

# True when attribute k is present, even with a nil value.
def has_key?(k)
  attrs.has_key?(k)
end
269
+
# Yields each direct child of this tag (the hook Enumerable builds on).
def each
  @contents.each do |child|
    yield child
  end
end

# Number of direct children.
def length
  contents.length
end
alias size length

# Asks the owning parser whether tags with this name are self-closing.
def self_closing?
  @parser.self_closing_tag?(@name)
end

# Adds the given tag to the contents of this tag.
def append(tag)
  @contents << tag
end
287
+
# Implicit string conversion: same text as to_s.
def to_str
  to_s
end

# Renders this tag and its contents as a pretty-printed string.
def prettify
  to_s(true)
end

# Shows the rendered markup instead of the default object dump.
def inspect
  to_s
end
300
+
# Renders this tag and its contents as a string. Whitespace present in the
# original document is not guaranteed to be reproduced.
#
# show_structure_indent - nil for compact output, true to start
#   pretty-printing at depth 0, or an Integer depth (used by the recursive
#   calls made through render_contents).
#
# A hidden tag renders only its contents. Attributes whose value is nil or
# false are left out; attribute values are written without escaping.
# NOTE(review): the literal spacing in "\n " / ' ' is reproduced as it
# appears in this copy of the file -- confirm against a pristine source.
def to_s(show_structure_indent=nil)
  attr_strings = attrs.map { |key, value| %{#{key}="#{value}"} if value }.compact
  if self_closing?
    open_suffix, closing = ' /', nil
  else
    open_suffix, closing = nil, "</#{name}>"
  end

  depth = (show_structure_indent == true) ? 0 : show_structure_indent
  depth += 1 if show_structure_indent and not @hidden
  inner = render_contents(depth)
  return inner if @hidden

  padding = show_structure_indent ? "\n #{' ' * depth}" : nil
  attribute_string = attr_strings.empty? ? '' : ' ' + attr_strings.join(' ')

  pieces = []
  pieces << padding if show_structure_indent
  pieces << "<#{@name}#{attribute_string}#{open_suffix}>"
  pieces << inner
  pieces << padding if closing and show_structure_indent
  pieces << closing
  pieces.join('')
end
337
+
# Renders the contents of this tag as a string.
#
# Child tags are rendered with the given indent setting, everything else
# through a plain to_s. When an indent setting is given, one trailing line
# break is removed from each rendered piece.
def render_contents(show_structure_indent=nil)
  pieces = @contents.map do |child|
    text = child.is_a?(Tag) ? child.to_s(show_structure_indent) : child.to_s
    text.chomp! if text and show_structure_indent
    text
  end
  pieces.compact.join('')
end
357
+
# Yields every descendant of this tag in document order: each child is
# yielded first and, when it is itself a TagModule, its own descendants
# follow before the next sibling. A block may end the walk early with
# `throw :stop_iteration`.
def recursive_children
  # Each entry is [node, index of the next child still to visit].
  pending = [[self, 0]]
  catch(:stop_iteration) do
    until pending.empty?
      node, index = pending.pop
      next unless node.is_a? TagModule
      kids = node.contents
      while index < kids.length
        child = kids[index]
        index += 1
        yield child
        if child.is_a? TagModule
          # Come back to the remaining siblings after the subtree.
          pending.push([node, index], [child, 0])
          break
        end
      end
    end
  end
end
375
+
# Iterates over the direct children of this Tag. A block may end the
# iteration early with `throw :stop_iteration`.
def children
  catch(:stop_iteration) do
    @contents.each { |child| yield child }
  end
end
380
+
# Convenience method to retrieve the first piece of text matching the
# given criteria. 'text' can be a string, a regular expression object,
# a Proc that takes a string and returns whether or not the
# string 'matches', etc.
#
# args is optional and mirrors find_all_text: pass :recursive => false to
# look at direct children only. The caller's hash is not modified; :text
# and :limit are always set by this method.
def find_text(text=nil, args={}, &block)
  args = args.merge(:text => text, :limit => 1)
  iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
  fetch(iterator, nil, args, block)[0]
end
390
+
# Convenience method to retrieve all pieces of text matching the
# given criteria. 'text' can be a string, a regular expression object,
# a callable that takes a string and returns whether or not the
# string 'matches', etc.
# Args: :limit, :recursive
#
# The criterion is stored under the :text symbol, the key fetch reads; it is
# merged into a copy so the caller's hash stays untouched.
def find_all_text(text=nil, args={}, &block)
  args = args.merge(:text => text)
  iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
  fetch(iterator, nil, args, block)
end
401
+
# Extracts a list of Tag objects that match the given criteria. You
# can specify the name of the Tag and any attributes you want the Tag
# to have.
#
# The value of a key-value pair in the 'attrs' map can be a string, a
# list of strings, a regular expression object, or a Proc object that
# takes a string and returns whether or not the string matches for
# some custom definition of 'matches'. The same is true of the tag
# name, except that a Proc object will be passed the Tag object instead
# of just a string.
# Args: :attrs :text :limit :recursive
# (:recursive => false restricts the search to direct children.)
def find_all(name=nil, args={}, &block)
  walker = (args[:recursive] == false) ? 'children' : 'recursive_children'
  fetch(method(walker), name, args, block)
end
417
+
# Returns the first Tag or NavigableString object that matches the
# given criteria, or nil. Takes much the same arguments as fetch.
# args: :attrs :text :recursive (:limit is always forced to 1)
#
# The limit is merged into a copy of args, so a hash the caller reuses for
# a later find_all is not silently capped at one result.
def find(name=nil, args={}, &block)
  args = args.merge(:limit => 1)
  iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
  fetch(iterator, name, args, block)[0]
end
426
+ end
427
+
# A markup element of the parsed document; all of its behaviour comes from
# TagModule.
class Tag
  include TagModule
end

# A piece of text in the parsed document: an ordinary String that also
# carries PageElement's navigation links and finder methods.
class NavigableString < String
  include PageElement
end
435
+
436
+ #This class contains the basic parser and fetch code. It defines
437
+ #a parser that knows nothing about tag behavior except for the
438
+ #following:
439
+ #
440
+ #You can't close a tag without closing all the tags it encloses.
441
+ #That is, "<foo><bar></foo>" actually means
442
+ #"<foo><bar></bar></foo>".
443
+ #
444
+ #[Another possible explanation is "<foo><bar /></foo>", but since
445
+ # this class defines no self_closing_tags, it will never use that
446
+ # explanation.]
447
+ #
448
+ #This class is useful for parsing XML or made-up markup languages,
449
+ #or when BeautifulSoup makes an assumption counter to what you were
450
+ #expecting.
451
+ class BeautifulStoneSoup < HTML::SGMLParser
452
+ include TagModule
453
+
# As a public service we will by default silently replace MS smart quotes
# and similar characters with their HTML or ASCII equivalents.
# Every key is a single byte in the 0x80-0x9f range, so each one has to be
# a double-quoted literal: in single quotes \x80 would be the four
# characters backslash, x, 8, 0 and the byte would never be found.
@@ms_chars = { "\x80" => '&euro;',
               "\x81" => ' ',
               "\x82" => '&sbquo;',
               "\x83" => '&fnof;',
               "\x84" => '&bdquo;',
               "\x85" => '&hellip;',
               "\x86" => '&dagger;',
               "\x87" => '&Dagger;',
               "\x88" => '&caret;',
               "\x89" => '%',
               "\x8A" => '&Scaron;',
               "\x8B" => '&lt;',
               "\x8C" => '&OElig;',
               "\x8D" => '?',
               "\x8E" => 'Z',
               "\x8F" => '?',
               "\x90" => '?',
               "\x91" => '&lsquo;',
               "\x92" => '&rsquo;',
               "\x93" => '&ldquo;',
               "\x94" => '&rdquo;',
               "\x95" => '&bull;',
               "\x96" => '&ndash;',
               "\x97" => '&mdash;',
               "\x98" => '&tilde;',
               "\x99" => '&trade;',
               "\x9a" => '&scaron;',
               "\x9b" => '&gt;',
               "\x9c" => '&oelig;',
               "\x9d" => '?',
               "\x9e" => 'z',
               "\x9f" => '&Yuml;'}

# [pattern, replacement] pairs applied to the text by feed before parsing.
# A String replacement is used as a gsub template; anything else is called
# with the matched text. In order: expand <foo/> into <foo></foo>, remove
# whitespace after "<!", and map the bytes listed in @@ms_chars.
# NOTE(review): the byte-range pattern was written for Ruby 1.8 with
# $KCODE = 'u'; confirm it is accepted before loading this on Ruby 1.9+.
@@parser_massage = [[/<([^<>]*)\/>/, '<\1></\1>'],
                    [/<!\s+([^<>]*)>/, '<!\1>'],
                    [/([\x80-\x9f])/m, proc { |m| @@ms_chars[m]}]
                   ]

# Name given to the hidden tag that represents the whole document.
@@rootTagName = '[document]'

# Tag-behaviour tables; empty here, which means this parser makes no
# assumptions about any particular tag.
@@nestable_tags = {}
@@reset_nesting_tags = {}
@@quoteTags = {}
@@self_closing_tags = {}
500
+
501
+ attr_accessor :hidden
502
+
# True when tags named +tag+ are listed in the self-closing table.
def self_closing_tag?(tag)
  @@self_closing_tags.key?(tag)
end
506
+
# Creates the parser and, unless text is nil, parses it straight away.
#
# Args:
#   :parse_only_these - a tag name, or a collection of names, to restrict
#       parsing to (kept as a Set).
#   :avoid_parser_problems - true (the default, also used when the value is
#       nil) applies the built-in clean-up rules before parsing; false
#       disables them; an Enumerable of [pattern, replacement] pairs is used
#       in their place.
#   :initial_text_is_everything - unless explicitly false, the document is
#       closed (see done) once the initial text has been fed.
def initialize(text, args={})
  super(self, @@rootTagName)
  @quote_stack = []
  @hidden = 1

  only = args[:parse_only_these]
  if only
    @parse_only_these = Set.new
    if only.respond_to? :each
      only.each { |tag_name| @parse_only_these << tag_name }
    else
      @parse_only_these << only
    end
  else
    @parse_only_these = nil
  end
  reset

  # `value || true` could never be switched off; only a missing (nil)
  # setting falls back to the default here.
  massage = args[:avoid_parser_problems]
  massage = true if massage.nil?
  massage = @@parser_massage if massage and not massage.is_a? Enumerable
  @avoid_parser_problems = massage

  feed(text) if text != nil
  done if args[:initial_text_is_everything] != false
end
532
+
# Feeds more text to the parser, first applying the configured clean-up
# rules (see @avoid_parser_problems): a String fix is a gsub replacement
# template, any other fix is called with each matched piece of text.
#
# The rules are applied to a copy, so the caller's string is not modified
# and frozen strings are accepted.
def feed(text)
  if @avoid_parser_problems
    text = text.dup
    @avoid_parser_problems.each do |re, fix|
      if fix.is_a? String
        text.gsub!(re, fix)
      else
        text.gsub!(re) { |x| fix.call(x) }
      end
    end
  end
  # Bare super forwards the (possibly rewritten) text.
  super
end
549
+
# Two documents are equal when the other object is not nil and renders
# (via to_s) to the same string as this one.
def ==(anObject)
  !anObject.nil? && anObject.to_s == to_s
end
553
+
# Finishes the document: flushes pending text and closes every tag that is
# still open, leaving only the root tag current.
def done
  end_text
  pop_tag until @currentTag.name == @@rootTagName
end

# Resets the underlying parser and starts a fresh document whose only open
# tag is this object itself.
def reset
  super
  @currentText = []
  @currentTag = nil
  @tag_stack = []
  push_tag(self)
end

# Appends +tag+ to the current tag (if there is one) and makes it the new
# current tag.
def push_tag(tag)
  @currentTag.append(tag) if @currentTag
  @tag_stack << tag
  @currentTag = tag
end

# Closes the current tag and returns the tag that becomes current.
#
# Tags with just one string-owning child get the child as a
# 'string' property, so that soup.tag.string is shorthand for
# soup.tag.contents[0].
def pop_tag
  @tag_stack.pop
  only_child = @currentTag.contents.length == 1 ? @currentTag.contents[0] : nil
  @currentTag.string = only_child if only_child.is_a? NavigableString
  @currentTag = @tag_stack[-1] unless @tag_stack.empty?
  @currentTag
end
588
+
589
+ # StreamListener implementation
590
+
# Called by the parser for every start tag.
#
# name  - the tag name.
# attrs - list of [name, value] attribute pairs.
#
# While a quote tag (see @@quoteTags) is open, the tag is not real: its
# source form is rebuilt and recorded as text instead. Otherwise pending
# text is flushed, implicitly closed tags are popped (smart_pop), and a new
# Tag is created, chained into parse order and pushed. Self-closing tags are
# popped again immediately.
def unknown_starttag(name, attrs)
  unless @quote_stack.empty?
    # This is not a real tag. Double-quoted strings are required here so
    # that the name and attributes are actually interpolated.
    attr_string = attrs.map { |k, v| %{ #{k}="#{v}"} }.join('')
    handle_data("<#{name}#{attr_string}>")
    return
  end

  end_text

  # With :parse_only_these, top-level tags not on the list are skipped.
  return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)
  self_closing = self_closing_tag?(name)
  smart_pop(name) unless self_closing
  tag = Tag.new(self, name, attrs, @currentTag, @previous_parsed)
  @previous_parsed.next_parsed = tag if @previous_parsed
  @previous_parsed = tag
  push_tag(tag)
  pop_tag if self_closing
  @quote_stack.push(name) if @@quoteTags.has_key?(name)
end
619
+
# Called by the parser for every end tag.
#
# End tags of self-closing tags are ignored; they were closed in
# unknown_starttag. While a quote tag is open, any end tag other than the
# one closing that quote tag is recorded as text. Otherwise pending text is
# flushed and the tag stack is popped down to the matching open tag.
def unknown_endtag(name)
  return if self_closing_tag?(name)

  if not @quote_stack.empty? and @quote_stack[-1] != name
    # This is not a real end tag; the string must be double-quoted for
    # the tag name to be interpolated.
    handle_data("</#{name}>")
    return
  end

  return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)

  end_text
  pop_to_tag(name)
  @quote_stack.pop if not @quote_stack.empty? and @quote_stack[-1] == name
end
640
+
# Buffers a piece of text until end_text turns the buffer into a node.
# When parsing is restricted (:parse_only_these), text met while only the
# root tag is open is dropped.
def handle_data(data)
  return if @parse_only_these and @tag_stack.size <= 1
  @currentText.push(data)
end
645
+
# The handlers below put markup the parser does not model back into the
# text stream in its source form.

# Propagate comments right through.
def handle_comment(data)
  handle_data("<!--#{data}-->")
end

# Passes a declaration through as "<...>".
def handle_special(data)
  handle_data("<#{data}>")
end

# Re-emits a numeric character reference as "&#NNN;". The '#' is escaped so
# it is kept as a literal instead of starting an interpolation.
def unknown_charref(ref)
  handle_data("&\##{ref};")
end

# Re-emits a named entity reference as "&name;".
def unknown_entityref(ref)
  handle_data("&#{ref};")
end

# Passes an attribute-list declaration through; only raw_content is used.
def attlistdecl(element_name, attributes, raw_content)
  handle_data("<!ATTLIST #{raw_content}>")
end

# Passes a CDATA section through, closed with the full "]]>" terminator.
def cdata(content)
  handle_data("<![CDATA[#{content}]]>")
end
670
+
671
+ ###
672
+
# Declaration handlers: each one writes the declaration back into the text
# stream in source form.

# Joins all arguments with single spaces inside "<!DOCTYPE ...>".
def doctype(*args)
  handle_data("<!DOCTYPE #{args.join(' ')}>")
end

def elementdecl(content)
  handle_data("<!ELEMENT #{content}>")
end

# Entities are deliberately ignored.
def entity(content)
end

# content is a list of parts, joined with single spaces.
def entitydecl(content)
  handle_data("<!ENTITY #{content.join(' ')}>")
end

# NOTE(review): emitted without a '?' before '>' -- confirm whether the
# caller's instruction text already carries it.
def instruction(name, instruction)
  handle_data("<?#{name} #{instruction}>")
end

def notationdecl(content)
  handle_data("<!NOTATION #{content}>")
end
700
+
# Writes an XML declaration back into the text stream.
#
# version  - value for the version attribute.
# encoding - value for the encoding attribute, or nil to omit it.
# standalone - appended verbatim after the encoding.
#   NOTE(review): presumably already formatted by the caller -- confirm.
#
# The strings use %{...} so the values are interpolated (single-quoted
# literals are not), and the declaration is closed with "?>".
def xmldecl(version, encoding, standalone)
  encoding = %{ encoding="#{encoding}"} if encoding
  handle_data(%{<?xml version="#{version}"#{encoding}#{standalone}?>})
end
705
+
# Called when we're done collecting some text, declarations, etc.
#
# Joins the buffered pieces; if anything was collected it becomes a
# NavigableString that is linked after the previously parsed node and
# appended to the current tag. Text made only of whitespace is reduced to
# a single "\n" (when it contained a line break) or a single space. The
# buffer is emptied in every case.
def end_text
  text = @currentText.join('')
  unless text.empty?
    text = (text =~ /\n/ ? "\n" : ' ') if text.strip.empty?
    node = NavigableString.new(text)
    node.setup(@currentTag, @previous_parsed)
    @previous_parsed.next_parsed = node if @previous_parsed
    @previous_parsed = node
    @currentTag.contents.push(node)
  end
  @currentText = []
end
726
+
727
+ # Helper methods
728
+
729
+ private
730
+
# Pops the tag stack up to and including the most recent
# instance of the given tag. If inclusive_pop is false, pops the tag
# stack up to but *not* including the most recent instance of
# the given tag. Nothing is popped when the name is the root tag's name or
# when no open tag has that name.
#
# Returns the value of the last pop_tag call, or nil if nothing was popped.
def pop_to_tag(name, inclusive_pop=true)
  return if name == @@rootTagName

  # How many tags sit at or above the most recent tag with this name.
  depth = 0
  @tag_stack.reverse.each_with_index do |tag, offset|
    if tag.name == name
      depth = offset + 1
      break
    end
  end
  depth -= 1 unless inclusive_pop

  last_popped = nil
  depth.times { last_popped = pop_tag }
  last_popped
end
754
+
# We need to pop up to the previous tag of this type, unless
# one of this tag's nesting reset triggers comes between this
# tag and the previous tag of this type, OR unless this tag is a
# generic nesting trigger and another generic nesting trigger
# comes between this tag and the previous tag of this type.
#
# Examples:
#  <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
#  <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
#  <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
#
#  <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
#  <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
#  <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
def smart_pop(name)
  nesting_reset_triggers = @@nestable_tags[name]
  nestable = !nesting_reset_triggers.nil?
  resets_nesting = @@reset_nesting_tags.has_key?(name)
  target = nil
  inclusive = true

  # Walk the open tags from the innermost outwards.
  @tag_stack.reverse_each do |open_tag|
    if !nestable and (open_tag.nil? or open_tag.name == name)
      # Non-nestable tags get popped to the top or to their
      # last occurrence.
      target = name
      break
    end

    triggered = if nestable
                  nesting_reset_triggers.include?(open_tag.name)
                else
                  resets_nesting && @@reset_nesting_tags.has_key?(open_tag.name)
                end
    if triggered
      # If we encounter one of the nesting reset triggers
      # peculiar to this tag, or we encounter another tag
      # that causes nesting to reset, pop up to but not
      # including that tag.
      target = open_tag.name
      inclusive = false
      break
    end
  end

  pop_to_tag(target, inclusive) if target
end
799
+
800
+ protected
801
+
# Turns a list of maps, lists, or scalars into a single map.
# Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
# of lists and partial maps.
#
# default - value given to every key that comes from a list or a scalar.
# args    - Hashes are merged as they are, Arrays contribute each element as
#           a key, anything else is used as a single key. Later arguments
#           overwrite earlier ones on key clashes.
def BeautifulStoneSoup.build_tag_map(default, *args)
  built = {}
  args.each do |portion|
    case portion
    when Hash  then built.update(portion)
    when Array then portion.each { |key| built[key] = default }
    else built[portion] = default
    end
  end
  built
end
820
+ end
821
+
822
+ #This parser knows the following facts about HTML:
823
+ #
824
+ #* Some tags have no closing tag and should be interpreted as being
825
+ # closed as soon as they are encountered.
826
+ #
827
+ #* The text inside some tags (ie. 'script') may contain tags which
828
+ # are not really part of the document and which should be parsed
829
+ # as text, not tags. If you want to parse the text as tags, you can
830
+ # always fetch it and parse it explicitly.
831
+ #
832
+ #* Tag nesting rules:
833
+ #
834
+ # Most tags can't be nested at all. For instance, the occurrence of
835
+ # a <p> tag should implicitly close the previous <p> tag.
836
+ #
837
+ # <p>Para1<p>Para2
838
+ # should be transformed into:
839
+ # <p>Para1</p><p>Para2
840
+ #
841
+ # Some tags can be nested arbitrarily. For instance, the occurrence
842
+ # of a <blockquote> tag should _not_ implicitly close the previous
843
+ # <blockquote> tag.
844
+ #
845
+ # Alice said: <blockquote>Bob said: <blockquote>Blah
846
+ # should NOT be transformed into:
847
+ # Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
848
+ #
849
+ # Some tags can be nested, but the nesting is reset by the
850
+ # interposition of other tags. For instance, a <tr> tag should
851
+ # implicitly close the previous <tr> tag within the same <table>,
852
+ # but not close a <tr> tag in another table.
853
+ #
854
+ # <table><tr>Blah<tr>Blah
855
+ # should be transformed into:
856
+ # <table><tr>Blah</tr><tr>Blah
857
+ # but,
858
+ # <tr>Blah<table><tr>Blah
859
+ # should NOT be transformed into
860
+ # <tr>Blah<table></tr><tr>Blah
861
+ #
862
+ #Differing assumptions about tag nesting rules are a major source
863
+ #of problems with the BeautifulSoup class. If BeautifulSoup is not
864
+ #treating as nestable a tag your page author treats as nestable,
865
+ #try writing a subclass.
# HTML-aware parser: fills the tag-behaviour tables declared by
# BeautifulStoneSoup with what is known about HTML tags.
#
# NOTE(review): the tables are class variables that this class fills with
# replace, so the same hash objects are seen by BeautifulStoneSoup as well;
# once this class is loaded the parent parser uses these HTML rules too.
class BeautifulSoup < BeautifulStoneSoup

  @@self_closing_tags.replace(build_tag_map(nil, ['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame']))

  # Tags whose content is kept as text rather than parsed as markup. This
  # has to fill @@quoteTags, the table the parser consults; assigning a
  # differently named variable would leave quoting switched off.
  @@quoteTags.replace({'script' => nil})

  # According to the HTML standard, each of these inline tags can
  # contain another tag of the same type. Furthermore, it's common
  # to actually use these tags this way.
  @@nestable_inline_tags = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center']

  # According to the HTML standard, these block tags can contain
  # another tag of the same type. Furthermore, it's common
  # to actually use these tags this way.
  @@nestable_block_tags = ['blockquote', 'div', 'fieldset', 'ins', 'del']

  # Lists can contain other lists, but there are restrictions.
  @@nestable_list_tags = { 'ol' => [],
                           'ul' => [],
                           'li' => ['ul', 'ol'],
                           'dl' => [],
                           'dd' => ['dl'],
                           'dt' => ['dl'] }

  # Tables can contain other tables, but there are restrictions.
  @@nestable_table_tags = {'table' => ['tr', 'td'],
                           'tr' => ['table'],
                           'td' => ['tr'],
                           'th' => ['tr'],
                          }

  @@non_nestable_block_tags = ['address', 'form', 'p', 'pre']

  # If one of these tags is encountered, all tags up to the next tag of
  # this type are popped.
  @@reset_nesting_tags.replace(build_tag_map(nil, @@nestable_block_tags, 'noscript', @@non_nestable_block_tags,
                                             @@nestable_list_tags, @@nestable_table_tags))

  @@nestable_tags.replace(build_tag_map([], @@nestable_inline_tags, @@nestable_block_tags, @@nestable_list_tags, @@nestable_table_tags))

end
907
+
908
+ # This class will push a tag with only a single string child into
909
+ # the tag's parent as an attribute. The attribute's name is the tag
910
+ # name, and the value is the string child. An example should give
911
+ # the flavor of the change:
912
+ #
913
+ # <foo><bar>baz</bar></foo>
914
+ # =>
915
+ # <foo bar="baz"><bar>baz</bar></foo>
916
+ #
917
+ # You can then access fooTag['bar'] instead of fooTag.barTag.string.
918
+ #
919
+ # This is, of course, useful for scraping structures that tend to
920
+ # use subelements instead of attributes, such as SOAP messages. Note
921
+ # that it modifies its input, so don't print the modified version
922
+ # out.
# Parser that copies simple child elements into attributes of their parent.
class BeautifulSOAP < BeautifulStoneSoup
  # Before closing the innermost tag: when it is a Tag whose only content
  # is a single string and its parent has no truthy attribute of that name
  # yet, store the string on the parent under the tag's name. The regular
  # pop then takes place. Nothing happens while only one tag is open.
  def pop_tag
    return unless @tag_stack.size > 1

    tag, parent = @tag_stack[-1], @tag_stack[-2]
    promotable = tag.is_a?(Tag) && tag.contents.size == 1 &&
                 tag.contents[0].is_a?(NavigableString) && !parent[tag.name]
    parent[tag.name] = tag.contents[0] if promotable
    super
  end
end
936
+
937
+ #Enterprise class names! It has come to our attention that some people
938
+ #think the names of the Rubyful Soup parser classes are too silly
939
+ #and "unprofessional" for use in enterprise screen-scraping. We feel
940
+ #your pain! For such-minded folk, the Rubyful Soup Consortium And
941
+ #Rootin' Tootin' Texas Delicatessen recommends renaming this file to
942
+ #"RobustParser.rb" (or, in cases of extreme enterprisitude,
943
+ #"RobustParserBeanInterface.class") and using the following
944
+ #enterprise-friendly class aliases:
# Enterprise-friendly name for BeautifulStoneSoup.
class RobustXMLParser < BeautifulStoneSoup
end

# Enterprise-friendly name for BeautifulSoup.
class RobustHTMLParser < BeautifulSoup
end

# Enterprise-friendly name for BeautifulSOAP.
class SimplifyingSOAPParser < BeautifulSOAP
end
948
+
# When run as a script: read the files named on the command line (or
# standard input) and print the document pretty-printed.
if $0 == __FILE__
  print BeautifulSoup.new(ARGF.read).prettify
end
950
+