rubyful_soup_2011 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,950 @@
1
+ #Rubyful Soup
2
+ #Elixir and Tonic
3
+ #"The Screen-Scraper's Friend"
4
+ #v1.0.4
5
+ #http://www.crummy.com/software/RubyfulSoup/
6
+ #
7
+ #Rubyful Soup is a port to the Ruby language and idiom of the Python
8
+ #library Beautiful Soup.
9
+ #See http://www.crummy.com/software/BeautifulSoup/ for details on the original.
10
+
11
+ #This library requires the sgml-parser library, written by Takahiro
12
+ #Maebashi. The easiest way to get it is to install the "htmltools"
13
+ #gem.
14
+ require 'html/sgml-parser'
15
+ require 'set'
16
+
17
#UTF-8 handling for Ruby 1.8. $KCODE is not effective on Ruby 1.9 and
#later (assigning it there only produces a warning or a stray global), so
#it is set only on interpreters that still honour it.
$KCODE = 'u' if RUBY_VERSION < '1.9'
# require 'jcode'
20
+
21
#Lets SGMLParser parse XML with namespaces: the replacement tag-name
#pattern also accepts ':' inside a name.
class HTML::SGMLParser
  if const_defined?(:Tagfind)
    #Remove the stock pattern first, then install the wider one.
    send(:remove_const, :Tagfind)
    const_set(:Tagfind, /[a-zA-Z][-_.:a-zA-Z0-9]*/)
  end
end
28
+
29
#Navigation and search behaviour shared by every node of a parsed
#document. The search code distinguishes Tag objects (matched by name and
#attributes) from everything else (matched as text).
module PageElement

  attr_reader :parser
  attr_accessor :parent, :previous_parsed, :next_parsed, :previous_sibling
  attr_accessor :next_sibling

  #Resets this element's links and attaches it below +parent+.
  #If the parent already has children, its current last child becomes this
  #element's previous sibling and is pointed at this element in turn.
  def setup(parent=nil, previous_parsed=nil)
    @parent = parent
    @previous_parsed = previous_parsed
    @next_parsed = nil
    @previous_sibling = nil
    @next_sibling = nil
    if @parent and not @parent.contents.empty?
      @previous_sibling = @parent.contents[-1]
      @previous_sibling.next_sibling = self
    end
  end

  #A bunch of different iterators over a parsed document. Each generated
  #method starts at this element, follows one link repeatedly and yields
  #every element it reaches until the link is nil.
  {
    #Iterates in parse order over the rest of the items in this document.
    :next_parsed_items => :next_parsed,

    #Iterates in reverse parse order over all previously parsed items in
    #this document.
    :previous_parsed_items => :previous_parsed,

    #Iterates in parse order over all subsequent siblings of this item.
    :next_siblings => :next_sibling,

    #Iterates in reverse parse order over all prior siblings of this item.
    :previous_siblings => :previous_sibling,

    #Iterates upwards through the parentage of this item.
    :parents => :parent
  }.each do |k,v|
    class_eval %{
      def #{k}
        i = self
        while i
          i = i.#{v}
          yield i if i
        end
      end
    }
  end

  #Generates a singular and a plural finder for each iterator. The singular
  #form returns the first match (or nil), the plural form returns all
  #matches. Both accept a name, an options hash (:attrs, :text, :limit) and
  #an optional block used as the match test.
  [ #Returns first item/all items matching the given criteria and
    #appearing after this PageElement in the document.
    [:find_next, :find_all_next, 'next_parsed_items'],

    #Returns first item/all items matching the given criteria and
    #appearing before this PageElement in the document.
    [:find_previous, :find_all_previous, 'previous_parsed_items'],

    #Returns the nearest sibling/all siblings of this PageElement matching
    #the given criteria and appearing before this PageElement in
    #the document.
    [:find_previous_sibling, :find_previous_siblings, 'previous_siblings'],

    #Returns the nearest sibling/all siblings of this PageElement matching
    #the given criteria and appearing after this PageElement in
    #the document
    [:find_next_sibling, :find_next_siblings, 'next_siblings'],

    #Returns the nearest parent/all parents of this PageElement matching
    #the given criteria.
    [:find_parent, :find_parents, 'parents'],
  ].each do |singular, plural, method_name|
    #The limit is passed under the symbol key that fetch reads, on a copy
    #of the options so the caller's hash is never modified.
    class_eval %{
      def #{singular}(name=nil, args={}, &block)
        fetch(method('#{method_name}'), name, args.merge(:limit => 1), block)[0]
      end

      def #{plural}(name=nil, args={}, &block)
        fetch(method('#{method_name}'), name, args, block)
      end
    }
  end

  protected

  #Returns a list of items matching the given criteria, obtained by
  #iterating over the given iterator.
  #iterator:: a Method that yields candidate items
  #name:: tag-name criterion (see PageElement.matches), or nil for any tag
  #args:: options read here: :attrs, :limit, :text
  #block:: optional Proc; when given it alone decides whether an item matches
  def fetch(iterator, name, args, block)
    attrs = args[:attrs] || {}
    limit = args[:limit]
    text = args[:text]

    #A bare (non-hash) :attrs value is shorthand for a 'class' criterion.
    attrs = {'class' => attrs} unless attrs.respond_to? :keys

    bucket = []
    catch(:stop_iteration) do
      iterator.call do |item|
        match = false
        if block
          match = true if block.call(item)
        elsif item.is_a? Tag
          #A tag matches if its name matches and its attributes line up.
          if not text and (not name or PageElement.matches(item, name))
            match = true
            attrs.each_pair do |attr, matchAgainst|
              unless PageElement.matches(item[attr], matchAgainst)
                match = false
                break
              end
            end
          end
        elsif text
          #A text matches if its string value matches the given text
          #criterion.
          match = PageElement.matches(item, text)
        end
        if match
          bucket.push(item)
          throw :stop_iteration if limit and bucket.length >= limit
        end
      end
    end
    return bucket
  end

  #Used to tell whether a Tag or a NavigableString "matches" some data
  #structure. +how_to_match+ may be a Proc (called with the chunk), a
  #Regexp, an Array of accepted values, a Hash whose keys are the accepted
  #values, or anything else, which is compared through its to_s.
  def PageElement.matches(chunk, how_to_match)
    # If given a list of items, return true if the list contains a
    # text element that matches.
    if chunk.is_a? Array
      chunk.each do |tag|
        return true if tag.is_a? NavigableString and matches(tag, how_to_match)
      end
      return false
    elsif how_to_match.is_a? Proc
      return how_to_match.call(chunk)
    elsif chunk.is_a? Tag
      #Custom match methods take the tag as an argument, but all other
      #ways of matching match the tag name as a string
      chunk = chunk.name
    end

    #From here on the chunk is compared as a string.
    chunk = chunk.to_s unless chunk.is_a? String

    if how_to_match.is_a? Regexp
      return how_to_match.match(chunk) != nil
    elsif how_to_match.is_a? Array
      return how_to_match.include?(chunk)
    elsif how_to_match.is_a? Hash
      #Membership is decided by the key alone, so maps whose values are nil
      #still match.
      return how_to_match.has_key?(chunk)
    else
      #It's just a string
      return how_to_match.to_s == chunk
    end
  end

end
194
+
195
+ module TagModule
196
+
197
+ include Enumerable
198
+ include PageElement
199
+
200
+ attr_accessor :name, :contents, :attrs, :string
201
+
202
+ #I tried to have Tag subclass Method, but it killed the
203
+ #whole thing. Maybe I should just leave well enough alone.
204
+ #
205
+ #def arity
206
+ # return methods('find_all').arity
207
+ #end
208
+ #
209
+ #def call(*args)
210
+ # return find_all(*args)
211
+ #end
212
+ #
213
+ #def to_proc
214
+ # return methods('find_all').to_proc
215
+ #end
216
+
217
#Creates a tag called +name+ that belongs to +parser+.
#attr_list:: attribute [name, value] pairs; turned into a hash on demand
#parent, previous:: handed to setup to link the tag into the tree
def initialize(parser, name, attr_list=[], parent=nil, previous=nil)
  @parser, @name = parser, name
  @attr_list = attr_list
  @attrs = nil #built lazily by the attrs reader
  @contents = []
  @hidden = false
  setup(parent, previous)
end
226
+
227
#Returns the attribute hash, building it from the raw attribute list on
#first use so the conversion is not paid for every tag while parsing.
#Values wrapped in double quotes have the quotes stripped. The raw list is
#dropped once the hash exists.
def attrs
  return @attrs if @attrs
  table = {}
  @attr_list.each do |pair|
    value = pair[1]
    pair[1] = value[1..-2] if value[0] == ?" and value[-1] == ?"
    table[pair[0]] = pair[1]
  end
  @attrs = table
  @attr_list = nil
  @attrs
end
243
+
244
+ #soup.title_tag, or soup.title, is the same as soup.find('title')
245
+ def method_missing(name, *args)
246
+ #puts "Missing method #{name} for #{self.class.name}"
247
+ name = name.to_s
248
+ if name[-4...name.length] == '_tag'
249
+ name = name[0...name.length-4]
250
+ end
251
+ return find(name, *args)
252
+ end
253
+
254
#Returns the value of attribute +k+, or nil.
def [](k)
  attrs[k]
end

#Sets attribute +k+ to +v+.
def []=(k, v)
  attrs[k] = v
end

#Removes attribute +k+ and returns its former value.
def delete(k)
  attrs.delete(k)
end

#True when the tag carries an attribute named +k+.
def has_key?(k)
  attrs.has_key?(k)
end
269
+
270
#Yields each direct child in order (this is what Enumerable builds on).
def each
  @contents.each do |child|
    yield child
  end
end

#Number of direct children.
def length
  contents.length
end
alias size length
278
+
279
#Asks the owning parser whether tags with this name are self-closing.
def self_closing?
  @parser.self_closing_tag?(@name)
end

#Adds the given tag to the contents of this tag
def append(tag)
  @contents.push(tag)
end
287
+
288
#Implicit string conversion: same text as to_s.
def to_str
  to_s
end

#Renders this tag and its contents as a pretty-printed string.
def prettify
  to_s(true)
end

#Inspecting a tag shows its rendered markup.
def inspect
  to_s
end
300
+
301
#Renders this tag and its contents as a string. NOTE: since REXML
#consumes whitespace, this method is not certain to reproduce the
#whitespace present in the original string.
#show_structure_indent:: nil for flat output, true to start pretty-printing
#                        at depth 0, or an integer depth
#A hidden tag renders only its contents. Attributes whose value is nil or
#false are left out. Self-closing tags get ' /' and no closing tag.
def to_s(show_structure_indent=nil)
  rendered_attrs = attrs.map { |k, v| %{#{k}="#{v}"} if v }.compact

  close = self_closing? ? ' /' : nil
  close_tag = self_closing? ? nil : "</#{name}>"

  depth = show_structure_indent == true ? 0 : show_structure_indent
  depth += 1 if show_structure_indent and not @hidden

  inner = render_contents(depth)
  pad = show_structure_indent ? "\n #{' ' * depth}" : nil
  return inner if @hidden

  attribute_string = rendered_attrs.empty? ? '' : ' ' + rendered_attrs.join(' ')
  pieces = []
  pieces << pad if show_structure_indent
  pieces << "<#{@name}#{attribute_string}#{close}>"
  pieces << inner
  pieces << pad if close_tag and show_structure_indent
  pieces << close_tag
  pieces.join('')
end
337
+
338
#Renders the contents of this tag as a string.
#Child tags are rendered with the given indent setting, everything else
#through a plain to_s. When an indent is given, a trailing newline is
#removed from every piece before they are joined.
def render_contents(show_structure_indent=nil)
  pieces = @contents.map do |child|
    piece = child.is_a?(Tag) ? child.to_s(show_structure_indent) : child.to_s
    piece.chomp! if piece and show_structure_indent
    piece
  end
  pieces.select { |piece| piece }.join('')
end
357
+
358
#Yields every descendant of this tag, each tag before its own children.
#Uses an explicit stack of [tag, next index] pairs instead of recursion.
#Throwing :stop_iteration from the block ends the walk early.
def recursive_children
  pending = [[self, 0]]
  catch(:stop_iteration) do
    until pending.empty?
      node, index = pending.pop
      next unless node.is_a? TagModule
      stop_at = node.contents.length
      while index < stop_at
        child = node.contents[index]
        yield child
        if child.is_a?(TagModule) and not node.contents.empty? and index < node.contents.length
          #Remember where to resume in this tag, then descend into the child.
          pending.push([node, index + 1])
          pending.push([child, 0])
          break
        end
        index += 1
      end
    end
  end
end

#Iterates over the direct children of this Tag.
def children
  catch(:stop_iteration) do
    @contents.each { |child| yield child }
  end
end
380
+
381
#Convenience method to retrieve the first piece of text matching the
#given criteria. 'text' can be a string, a regular expression object,
#a Proc that takes a string and returns whether or not the
#string 'matches', etc.
#The options are built here and never contain :recursive, so the search
#always covers all descendants.
def find_text(text=nil, &block)
  criteria = { :text => text, :limit => 1 }
  walker = method('recursive_children')
  fetch(walker, nil, criteria, block).first
end
390
+
391
#Convenience method to retrieve all pieces of text matching the
#given criteria. 'text' can be a string, a regular expression object,
#a callable that takes a string and returns whether or not the
#string 'matches', etc.
#Args: :limit, :recursive (false restricts the search to direct children)
#The text criterion is stored under the :text symbol key that fetch reads,
#on a copy of +args+ so the caller's hash is not modified.
def find_all_text(text=nil, args={}, &block)
  criteria = args.merge(:text => text)
  iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
  fetch(iterator, nil, criteria, block)
end
401
+
402
#Extracts a list of Tag objects that match the given criteria. You
#can specify the name of the Tag and any attributes you want the Tag
#to have.
#
#The value of a key-value pair in the 'attrs' map can be a string, a
#list of strings, a regular expression object, or a Proc object that
#takes a string and returns whether or not the string matches for
#some custom definition of 'matches'. The same is true of the tag
#name, except that a Proc object will be passed the Tag object instead
#of just a string.
#Args: :attrs :text :limit :recursive
#Only an explicit :recursive => false limits the search to direct children.
def find_all(name=nil, args={}, &block)
  walker = if args[:recursive] == false
    method('children')
  else
    method('recursive_children')
  end
  fetch(walker, name, args, block)
end
417
+
418
#Returns the first Tag or NavigableString object that matches the
#given criteria, or nil. Takes much the same arguments as fetch.
#args: :attrs :text :recursive (any :limit given is replaced by 1)
#The limit is applied to a copy of +args+, so a hash passed in by the
#caller is left unchanged and can be reused for other searches.
def find(name=nil, args={}, &block)
  iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
  fetch(iterator, name, args.merge(:limit => 1), block)[0]
end
426
+ end
427
+
428
#A markup element. All of its behaviour comes from TagModule.
class Tag
  include TagModule
end

#A piece of document text: a String that also carries the PageElement
#navigation links.
class NavigableString < String
  include PageElement
end
435
+
436
+ #This class contains the basic parser and fetch code. It defines
437
+ #a parser that knows nothing about tag behavior except for the
438
+ #following:
439
+ #
440
+ #You can't close a tag without closing all the tags it encloses.
441
+ #That is, "<foo><bar></foo>" actually means
442
+ #"<foo><bar></bar></foo>".
443
+ #
444
+ #[Another possible explanation is "<foo><bar /></foo>", but since
445
+ # this class defines no self_closing_tags, it will never use that
446
+ # explanation.]
447
+ #
448
+ #This class is useful for parsing XML or made-up markup languages,
449
+ #or when BeautifulSoup makes an assumption counter to what you were
450
+ #expecting.
451
+ class BeautifulStoneSoup < HTML::SGMLParser
452
+ include TagModule
453
+
454
#As a public service we will by default silently replace MS smart quotes
#and similar characters with their HTML or ASCII equivalents.
#Every key is a single byte in the range 0x80-0x9f and must be written in
#double quotes: in single quotes the \x escape is not interpreted.
@@ms_chars = { "\x80" => '&euro;',
               "\x81" => ' ',
               "\x82" => '&sbquo;',
               "\x83" => '&fnof;',
               "\x84" => '&bdquo;',
               "\x85" => '&hellip;',
               "\x86" => '&dagger;',
               "\x87" => '&Dagger;',
               "\x88" => '&caret;',
               "\x89" => '%',
               "\x8A" => '&Scaron;',
               "\x8B" => '&lt;',
               "\x8C" => '&OElig;',
               "\x8D" => '?',
               "\x8E" => 'Z',
               "\x8F" => '?',
               "\x90" => '?',
               "\x91" => '&lsquo;',
               "\x92" => '&rsquo;',
               "\x93" => '&ldquo;',
               "\x94" => '&rdquo;',
               "\x95" => '&bull;',
               "\x96" => '&ndash;',
               "\x97" => '&mdash;',
               "\x98" => '&tilde;',
               "\x99" => '&trade;',
               "\x9a" => '&scaron;',
               "\x9b" => '&gt;',
               "\x9c" => '&oelig;',
               "\x9d" => '?',
               "\x9e" => 'z',
               "\x9f" => '&Yuml;'}

#Default clean-up rules applied by feed, as [pattern, replacement] pairs:
#expand <x/> into <x></x>, drop whitespace after "<!", and replace the
#bytes listed in @@ms_chars.
@@parser_massage = [[/<([^<>]*)\/>/, '<\1></\1>'],
                    [/<!\s+([^<>]*)>/, '<!\1>'],
                    [/([\x80-\x9f])/m, proc { |m| @@ms_chars[m]}]
                   ]

#Name of the invisible tag that stands for the document itself.
@@rootTagName = '[document]'

#Tag behaviour tables. They are empty here; BeautifulSoup fills them in.
@@nestable_tags = {}
@@reset_nesting_tags = {}
@@quoteTags = {}
@@self_closing_tags = {}

attr_accessor :hidden

#True when +tag+ is listed as a self-closing tag name.
def self_closing_tag?(tag)
  @@self_closing_tags.has_key?(tag)
end
506
+
507
#Creates the parser and feeds it +text+ (unless text is nil).
#Args:
# :parse_only_these - a tag name, or a collection of tag names, to
#   restrict parsing to; stored as a Set
# :avoid_parser_problems - false turns the pre-parse clean-up off; an
#   Enumerable of [pattern, replacement] pairs replaces the built-in
#   rules; anything else (including leaving it out) uses the built-in rules
# :initial_text_is_everything - unless this is false, done is called
#   right after the text has been fed
def initialize(text, args={})
  super(self, @@rootTagName)
  @quote_stack = []
  @hidden = 1
  only = args[:parse_only_these]
  if only
    @parse_only_these = Set.new
    if only.respond_to? :each
      only.each { |x| @parse_only_these << x }
    else
      @parse_only_these << only
    end
  else
    @parse_only_these = nil
  end
  reset

  #Only a missing (nil) option falls back to the default; an explicit
  #false must survive so that the clean-up can actually be disabled.
  massage = args[:avoid_parser_problems]
  massage = true if massage.nil?
  massage = @@parser_massage if massage and not massage.is_a? Enumerable
  @avoid_parser_problems = massage

  feed(text) if text != nil
  done if args[:initial_text_is_everything] != false
end
532
+
533
#Applies the configured clean-up rules to +text+ and passes the result on
#to the underlying parser's feed.
#The rules run on a copy, so the caller's string is never modified and
#frozen strings are accepted. Each rule is a [pattern, fix] pair where fix
#is either a replacement String or a callable given the matched text.
def feed(text)
  if @avoid_parser_problems
    text = text.dup
    @avoid_parser_problems.each do |re, fix|
      if fix.is_a? String
        text.gsub!(re, fix)
      else
        text.gsub!(re) { |x| fix.call(x) }
      end
    end
  end
  #Bare super forwards the current value of text, i.e. the cleaned copy.
  super
end
549
+
550
#A document equals any non-nil object whose string form is the same as
#the document's own rendering.
def ==(anObject)
  return false if anObject == nil
  anObject.to_s == to_s
end

#Finishes parsing: flushes buffered text and closes every tag that is
#still open, leaving only the root on the stack.
def done
  end_text
  until @currentTag.name == @@rootTagName
    pop_tag
  end
end

#Resets the underlying parser, clears the text buffer and tag stack, and
#pushes this document onto the stack as the root tag.
def reset
  super
  @tag_stack = []
  @currentTag = nil
  @currentText = []
  push_tag(self)
end
566
+
567
#Makes +tag+ the current tag: it is appended to the previously current
#tag (if there is one) and pushed onto the tag stack.
def push_tag(tag)
  @currentTag.append(tag) if @currentTag
  @tag_stack << tag
  @currentTag = tag
end

#Closes the current tag and returns the tag that is current afterwards.
def pop_tag
  @tag_stack.pop

  # Tags with just one string-owning child get the child as a
  # 'string' property, so that soup.tag.string is shorthand for
  # soup.tag.contents[0]
  kids = @currentTag.contents
  @currentTag.string = kids[0] if kids.length == 1 and kids[0].is_a? NavigableString

  #When the stack is now empty the current tag stays as it is.
  @currentTag = @tag_stack[-1] unless @tag_stack.empty?
  @currentTag
end
588
+
589
+ # StreamListener implementation
590
+
591
#Parser callback for an opening tag.
#name:: the tag name
#attrs:: the attribute [name, value] pairs
#While a quoting tag is open the tag is not real markup: its source form
#is rebuilt and recorded as text. Otherwise pending text is flushed,
#implied closing tags are popped and a new Tag is pushed (and popped again
#at once if it is self-closing).
def unknown_starttag(name, attrs)
  unless @quote_stack.empty?
    #This is not a real tag.
    parts = [name] + attrs.map { |k, v| %{#{k}="#{v}"} }
    handle_data("<#{parts.join(' ')}>")
    return
  end

  end_text

  return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)
  self_closing = @@self_closing_tags.has_key?(name)
  smart_pop(name) unless self_closing
  tag = Tag.new(self, name, attrs, @currentTag, @previous_parsed)
  @previous_parsed.next_parsed = tag if @previous_parsed
  @previous_parsed = tag
  push_tag(tag)
  pop_tag if self_closing
  #From here until the matching end tag everything is treated as text.
  @quote_stack.push(name) if @@quoteTags.has_key?(name)
end
619
+
620
#Parser callback for a closing tag.
#End tags of self-closing tags are ignored; they were closed in the
#start-tag callback. While a quoting tag is open, any other end tag is
#recorded as text in its source form.
def unknown_endtag(name)
  #TODO: still necessary?
  return if @@self_closing_tags.has_key?(name)

  if not @quote_stack.empty? and @quote_stack[-1] != name
    #This is not a real end tag.
    handle_data("</#{name}>")
    return
  end

  return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)

  end_text
  pop_to_tag(name)
  @quote_stack.pop if not @quote_stack.empty? and @quote_stack[-1] == name
end
640
+
641
#Buffers a piece of text for the current tag. When parsing is restricted
#to certain tags, text outside any open tag is dropped.
def handle_data(data)
  return if @parse_only_these and @tag_stack.size <= 1
  @currentText.push(data)
end
645
+
646
#The handlers below rebuild the source form of non-tag markup and pass it
#through as text.

#Propagate comments right through.
def handle_comment(data)
  handle_data("<!--#{data}-->")
end

#Markup declarations pass through as <...>.
def handle_special(data)
  handle_data("<#{data}>")
end

#Unresolved numeric character reference: emitted again as &#ref;
def unknown_charref(ref)
  handle_data("&##{ref};")
end

#Unresolved named entity reference: emitted again as &ref;
def unknown_entityref(ref)
  handle_data("&#{ref};")
end

def attlistdecl(element_name, attributes, raw_content)
  handle_data("<!ATTLIST #{raw_content}>")
end

#CDATA sections are closed with ]]>.
def cdata(content)
  handle_data("<![CDATA[#{content}]]>")
end

###

#The doctype parts are joined with single spaces.
def doctype(*args)
  content = args.join(' ')
  handle_data("<!DOCTYPE #{content}>")
end

def elementdecl(content)
  handle_data("<!ELEMENT #{content}>")
end

#Intentionally does nothing.
def entity(content)

end

def entitydecl(content)
  handle_data("<!ENTITY #{content.join(' ')}>")
end

#Processing instructions are closed with ?>.
def instruction(name, instruction)
  handle_data("<?#{name} #{instruction}?>")
end

def notationdecl(content)
  handle_data("<!NOTATION #{content}>")
end

#Rebuilds the XML declaration; encoding and standalone are written only
#when they are given.
def xmldecl(version, encoding, standalone)
  decl = %{<?xml version="#{version}"}
  decl << %{ encoding="#{encoding}"} if encoding
  decl << %{ standalone="#{standalone}"} if standalone
  handle_data(decl + '?>')
end
705
+
706
#Called when we're done collecting some text, declarations, etc.
#The buffered pieces are joined into one NavigableString that is linked
#after the previously parsed element and added to the current tag. Text
#made only of whitespace collapses to "\n" if it contained a newline and
#to a single space otherwise. The buffer is always emptied.
def end_text
  buffered = @currentText.join('')
  unless buffered.empty?
    if buffered.strip.empty?
      buffered = buffered =~ /\n/ ? "\n" : ' '
    end
    node = NavigableString.new(buffered)
    node.setup(@currentTag, @previous_parsed)
    @previous_parsed.next_parsed = node if @previous_parsed
    @previous_parsed = node
    @currentTag.contents.push(node)
  end
  @currentText = []
end
726
+
727
+ # Helper methods
728
+
729
+ private
730
+
731
#Pops the tag stack up to and including the most recent
#instance of the given tag. If inclusive_pop is false, pops the tag
#stack up to but *not* including the most recent instance of
#the given tag.
#Returns the value of the last pop_tag call, or nil when nothing was
#popped (the name is the root name or is not on the stack).
def pop_to_tag(name, inclusive_pop=true)
  return if name == @@rootTagName

  num_pops = 0
  index = @tag_stack.length
  #Search from the top of the stack downwards.
  while index > 0
    index -= 1
    if name == @tag_stack[index].name
      num_pops = @tag_stack.length - index
      break
    end
  end
  num_pops -= 1 unless inclusive_pop

  last_popped = nil
  num_pops.times { last_popped = pop_tag }
  last_popped
end
754
+
755
#We need to pop up to the previous tag of this type, unless
#one of this tag's nesting reset triggers comes between this
#tag and the previous tag of this type, OR unless this tag is a
#generic nesting trigger and another generic nesting trigger
#comes between this tag and the previous tag of this type.
#
#Examples:
# <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
# <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
# <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
# <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
#
# <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
# <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
# <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
def smart_pop(name)
  nesting_reset_triggers = @@nestable_tags[name]
  is_nestable = !nesting_reset_triggers.nil?
  is_reset_nesting = @@reset_nesting_tags.has_key?(name)
  pop_to = nil
  inclusive = true
  #Walk the open tags from the innermost outwards.
  @tag_stack.reverse_each do |open_tag|
    if open_tag.name == name and not is_nestable
      #Non-nestable tags get popped to the top or to their
      #last occurrence.
      pop_to = name
      break
    end
    resets = if is_nestable
      nesting_reset_triggers.include?(open_tag.name)
    else
      is_reset_nesting and @@reset_nesting_tags.has_key?(open_tag.name)
    end
    if resets
      #If we encounter one of the nesting reset triggers
      #peculiar to this tag, or we encounter another tag
      #that causes nesting to reset, pop up to but not
      #including that tag.
      pop_to = open_tag.name
      inclusive = false
      break
    end
  end
  pop_to_tag(pop_to, inclusive) if pop_to
end
799
+
800
+ protected
801
+
802
#Turns a list of maps, lists, or scalars into a single map.
#Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
#of lists and partial maps.
#Maps are merged as they are; list items and scalars become keys whose
#value is +default+. Later arguments overwrite earlier ones.
def BeautifulStoneSoup.build_tag_map(default, *args)
  built = {}
  args.each do |portion|
    case portion
    when Hash
      built.update(portion)
    when Array
      portion.each { |tag| built[tag] = default }
    else
      built[portion] = default
    end
  end
  built
end
820
+ end
821
+
822
+ #This parser knows the following facts about HTML:
823
+ #
824
+ #* Some tags have no closing tag and should be interpreted as being
825
+ # closed as soon as they are encountered.
826
+ #
827
+ #* The text inside some tags (ie. 'script') may contain tags which
828
+ # are not really part of the document and which should be parsed
829
+ # as text, not tags. If you want to parse the text as tags, you can
830
+ # always fetch it and parse it explicitly.
831
+ #
832
+ #* Tag nesting rules:
833
+ #
834
+ # Most tags can't be nested at all. For instance, the occurrence of
835
+ # a <p> tag should implicitly close the previous <p> tag.
836
+ #
837
+ # <p>Para1<p>Para2
838
+ # should be transformed into:
839
+ # <p>Para1</p><p>Para2
840
+ #
841
+ # Some tags can be nested arbitrarily. For instance, the occurrence
842
+ # of a <blockquote> tag should _not_ implicitly close the previous
843
+ # <blockquote> tag.
844
+ #
845
+ # Alice said: <blockquote>Bob said: <blockquote>Blah
846
+ # should NOT be transformed into:
847
+ # Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
848
+ #
849
+ # Some tags can be nested, but the nesting is reset by the
850
+ # interposition of other tags. For instance, a <tr> tag should
851
+ # implicitly close the previous <tr> tag within the same <table>,
852
+ # but not close a <tr> tag in another table.
853
+ #
854
+ # <table><tr>Blah<tr>Blah
855
+ # should be transformed into:
856
+ # <table><tr>Blah</tr><tr>Blah
857
+ # but,
858
+ # <tr>Blah<table><tr>Blah
859
+ # should NOT be transformed into
860
+ # <tr>Blah<table></tr><tr>Blah
861
+ #
862
+ #Differing assumptions about tag nesting rules are a major source
863
+ #of problems with the BeautifulSoup class. If BeautifulSoup is not
864
+ #treating as nestable a tag your page author treats as nestable,
865
+ #try writing a subclass.
866
class BeautifulSoup < BeautifulStoneSoup

  #The tables below are filled in place (replace) because the parser code
  #in the superclass reads these same class variables.
  @@self_closing_tags.replace(build_tag_map(nil, ['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame']))

  #Tags whose content is kept as text rather than parsed as markup. This
  #must go into @@quoteTags, the table the start/end tag callbacks check;
  #a separately named variable would never be consulted.
  @@quoteTags.replace({'script' => nil})

  #According to the HTML standard, each of these inline tags can
  #contain another tag of the same type. Furthermore, it's common
  #to actually use these tags this way.
  @@nestable_inline_tags = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center']

  #According to the HTML standard, these block tags can contain
  #another tag of the same type. Furthermore, it's common
  #to actually use these tags this way.
  @@nestable_block_tags = ['blockquote', 'div', 'fieldset', 'ins', 'del']

  #Lists can contain other lists, but there are restrictions.
  @@nestable_list_tags = { 'ol' => [],
                           'ul' => [],
                           'li' => ['ul', 'ol'],
                           'dl' => [],
                           'dd' => ['dl'],
                           'dt' => ['dl'] }

  #Tables can contain other tables, but there are restrictions.
  @@nestable_table_tags = {'table' => ['tr', 'td'],
                           'tr' => ['table'],
                           'td' => ['tr'],
                           'th' => ['tr'],
                          }

  @@non_nestable_block_tags = ['address', 'form', 'p', 'pre']

  #If one of these tags is encountered, all tags up to the next tag of
  #this type are popped.
  @@reset_nesting_tags.replace(build_tag_map(nil, @@nestable_block_tags, 'noscript', @@non_nestable_block_tags,
                                             @@nestable_list_tags, @@nestable_table_tags))

  @@nestable_tags.replace(build_tag_map([], @@nestable_inline_tags, @@nestable_block_tags, @@nestable_list_tags, @@nestable_table_tags))

end
907
+
908
+ # This class will push a tag with only a single string child into
909
+ # the tag's parent as an attribute. The attribute's name is the tag
910
+ # name, and the value is the string child. An example should give
911
+ # the flavor of the change:
912
+ #
913
+ # <foo><bar>baz</bar></foo>
914
+ # =>
915
+ # <foo bar="baz"><bar>baz</bar></foo>
916
+ #
917
+ # You can then access fooTag['bar'] instead of fooTag.barTag.string.
918
+ #
919
+ # This is, of course, useful for scraping structures that tend to
920
+ # use subelements instead of attributes, such as SOAP messages. Note
921
+ # that it modifies its input, so don't print the modified version
922
+ # out.
923
class BeautifulSOAP < BeautifulStoneSoup
  #Before the tag on top of the stack is closed, copies its single string
  #child into its parent as an attribute named after the tag, unless the
  #parent already has a value under that name. The regular pop then always
  #runs, whatever the stack depth, and its result is returned.
  def pop_tag
    if @tag_stack.size > 1
      tag = @tag_stack[-1]
      parent = @tag_stack[-2]
      if tag.is_a?(Tag) && tag.contents.size == 1 &&
         tag.contents[0].is_a?(NavigableString) && !parent[tag.name]
        parent[tag.name] = tag.contents[0]
      end
    end
    super
  end
end
936
+
937
+ #Enterprise class names! It has come to our attention that some people
938
+ #think the names of the Rubyful Soup parser classes are too silly
939
+ #and "unprofessional" for use in enterprise screen-scraping. We feel
940
+ #your pain! For such-minded folk, the Rubyful Soup Consortium And
941
+ #Rootin' Tootin' Texas Delicatessen recommends renaming this file to
942
+ #"RobustParser.rb" (or, in cases of extreme enterprisitude,
943
+ #"RobustParserBeanInterface.class") and using the following
944
+ #enterprise-friendly class aliases:
945
class RobustXMLParser < BeautifulStoneSoup
end

class RobustHTMLParser < BeautifulSoup
end

class SimplifyingSOAPParser < BeautifulSOAP
end

#When run as a script: parse the input read through ARGF as HTML and print
#the pretty-printed result.
if $0 == __FILE__
  print BeautifulSoup.new(ARGF.read).prettify
end
950
+