rubyful_soup 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG ADDED
@@ -0,0 +1,12 @@
1
+ Rubyful Soup Changelog
2
+
3
+ 1.0.1
4
+
5
+ Changes from James Edward Gray (james at grayproductions dot net) to
6
+ quiet warnings.
7
+
8
+ Packaged as a gem for the first time.
9
+
10
+ 1.0.0
11
+
12
+ First full release
@@ -0,0 +1,925 @@
1
+ #Rubyful Soup
2
+ #Elixir and Tonic
3
+ #"The Screen-Scraper's Friend"
4
+ #v1.0.1
5
+ #http://www.crummy.com/software/RubyfulSoup/
6
+ #
7
+ #Rubyful Soup is a port to the Ruby language and idiom of the Python
8
+ #library Beautiful Soup.
9
+ #See http://www.crummy.com/software/BeautifulSoup/ for details on the original.
10
+
11
+ #This library requires the sgml-parser library, written by Takahiro
12
+ #Maebashi. The easiest way to get it is to install the "htmltools"
13
+ #gem.
14
+ require 'rubygems'
15
+ require 'sgml-parser'
16
+
17
+ #UTF-8 voodoo--does this really work?
18
+ $KCODE = 'u'
19
+ require 'jcode'
20
+
21
+ #This code makes SGMLParser able to parse XML with namespaces.
22
# Re-opens SGMLParser so that the tag-name pattern also accepts ':' (needed
# for namespaced XML names). The pattern is swapped only when the parser
# library already defines Tagfind; otherwise nothing is changed.
class SGMLParser
  if const_defined?(:Tagfind)
    send(:remove_const, :Tagfind)
    const_set(:Tagfind, /[a-zA-Z][-_.:a-zA-Z0-9]*/)
  end
end
28
+
29
+ module PageElement
30
+
31
+ attr_reader :parser
32
+ attr_accessor :parent, :previous_parsed, :next_parsed, :previous_sibling
33
+ attr_accessor :next_sibling
34
+
35
# Wires this element into the tree: records its parent and the element
# parsed just before it, clears the forward links, and, when the parent
# already has children, links this element with the parent's last child.
def setup(parent=nil, previous_parsed=nil)
  @parent, @previous_parsed = parent, previous_parsed
  @next_parsed = @next_sibling = nil
  @previous_sibling = nil
  siblings = @parent ? @parent.contents : []
  unless siblings.empty?
    @previous_sibling = siblings[-1]
    @previous_sibling.next_sibling = self
  end
end
46
+
47
+ #A bunch of different iterators over a parsed document.
48
# Generates the traversal iterators. Each generated method repeatedly follows
# one link accessor, starting from (but not yielding) this element, and
# yields every element reached until the link comes back nil/false.
#
#   next_parsed_items     - the rest of the document, in parse order
#   previous_parsed_items - everything parsed earlier, in reverse parse order
#   next_siblings         - following siblings
#   previous_siblings     - preceding siblings, nearest first
#   parents               - ancestors, nearest first
#
# The methods are built from source text because they need to yield to the
# caller's block.
[
  [:next_parsed_items,     :next_parsed],
  [:previous_parsed_items, :previous_parsed],
  [:next_siblings,         :next_sibling],
  [:previous_siblings,     :previous_sibling],
  [:parents,               :parent]
].each do |iterator_name, link|
  class_eval <<-RUBY
    def #{iterator_name}
      node = #{link}
      while node
        yield node
        node = node.#{link}
      end
    end
  RUBY
end
75
+
76
# Generates the directional finders. For every row a singular method (first
# match or nil) and a plural method (all matches) are defined; both delegate
# to fetch, driving it with the named iterator.
#
#   find_next / find_all_next                       - later in the document
#   find_previous / find_all_previous               - earlier in the document
#   find_previous_sibling / find_previous_siblings  - preceding siblings
#   find_next_sibling / find_next_siblings          - following siblings
#   find_parent / find_parents                      - ancestors
#
# The singular form passes the limit under the :limit symbol key, which is
# the key fetch reads, so iteration stops at the first match. The options are
# merged into a copy so the caller's hash is left untouched.
[
  [:find_next, :find_all_next, 'next_parsed_items'],
  [:find_previous, :find_all_previous, 'previous_parsed_items'],
  [:find_previous_sibling, :find_previous_siblings, 'previous_siblings'],
  [:find_next_sibling, :find_next_siblings, 'next_siblings'],
  [:find_parent, :find_parents, 'parents']
].each do |singular, plural, method_name|
  class_eval %{
    def #{singular}(name=nil, args={}, &block)
      fetch(method('#{method_name}'), name, args.merge(:limit => 1), block)[0]
    end

    def #{plural}(name=nil, args={}, &block)
      fetch(method('#{method_name}'), name, args, block)
    end
  }
end
109
+
110
+ protected
111
+
112
+ #Returns a list of items matching the given criteria, obtained by
113
+ #iterating over the given iterator.
114
# Collects the items produced by +iterator+ (a callable that yields items to
# the block it is given) that satisfy the criteria.
#
# iterator - callable driving the traversal
# name     - tag-name criterion, or nil for any tag
# args     - options read here: :attrs, :limit, :text. An :attrs value that
#            does not respond to keys is treated as a 'class' criterion.
# block    - optional predicate; when present it alone decides every match
#
# Returns an Array of matches, stopping once :limit items were collected.
def fetch(iterator, name, args, block)
  limit = args[:limit]
  text = args[:text]
  attrs = args[:attrs] || {}
  attrs = { 'class' => attrs } unless attrs.respond_to?(:keys)

  found = []
  catch(:stop_iteration) do
    iterator.call do |item|
      matched =
        if block
          block.call(item) ? true : false
        elsif item.is_a?(Tag)
          # A tag matches when no text criterion was given, its name matches
          # and every requested attribute lines up.
          !text && (!name || PageElement.matches(item, name)) &&
            attrs.all? { |attr, wanted| PageElement.matches(item[attr], wanted) }
        elsif text
          # Anything else is matched against the text criterion.
          PageElement.matches(item, text)
        else
          false
        end
      next unless matched
      found.push(item)
      throw :stop_iteration if limit && found.length >= limit
    end
  end
  found
end
156
+
157
+ #Used to tell whether a Tag or a NavigableString "matches" some data
158
+ #structure.
159
# Decides whether +chunk+ satisfies the criterion +how_to_match+.
#
# chunk        - an Array (true when any NavigableString in it matches), a
#                Tag (compared by its name unless the criterion is a Proc),
#                or any other value (compared through to_s)
# how_to_match - a Proc (called with the chunk), a Regexp, an Array of
#                acceptable strings, a Hash whose keys are the acceptable
#                strings, or anything else (compared through to_s)
#
# Hash criteria are tested by key presence, so keys mapped to nil (as in the
# tag maps built by build_tag_map) still count as matches.
def PageElement.matches(chunk, how_to_match)
  if chunk.is_a?(Array)
    return chunk.any? { |item| item.is_a?(NavigableString) && matches(item, how_to_match) }
  end
  return how_to_match.call(chunk) if how_to_match.is_a?(Proc)

  # Custom match procs receive the tag itself; every other criterion is
  # compared against the tag name.
  chunk = chunk.name if chunk.is_a?(Tag)
  chunk = chunk.to_s unless chunk.is_a?(String)

  case how_to_match
  when Regexp then !how_to_match.match(chunk).nil?
  when Array  then how_to_match.include?(chunk)
  when Hash   then how_to_match.has_key?(chunk)
  else how_to_match.to_s == chunk
  end
end
192
+
193
+ end
194
+
195
+ module TagModule
196
+
197
+ include Enumerable
198
+ include PageElement
199
+
200
+ attr_accessor :name, :contents, :attrs, :string
201
+
202
+ #I tried to have Tag subclass Method, but it killed the
203
+ #whole thing. Maybe I should just leave well enough alone.
204
+ #
205
+ #def arity
206
+ # return methods('find_all').arity
207
+ #end
208
+ #
209
+ #def call(*args)
210
+ # return find_all(*args)
211
+ #end
212
+ #
213
+ #def to_proc
214
+ # return methods('find_all').to_proc
215
+ #end
216
+
217
# Builds an empty, visible tag.
#
# parser   - the parser that owns this tag
# name     - tag name
# attrs    - attribute map; nil becomes an empty Hash
# parent   - enclosing tag, passed on to setup
# previous - element parsed just before this one, passed on to setup
def initialize(parser, name, attrs=nil, parent=nil, previous=nil)
  @parser, @name = parser, name
  @attrs = attrs || {}
  @contents = []
  @hidden = false
  setup(parent, previous)
end
226
+
227
+ #soup.title_tag or soup.title is the same as soup.find('title')
228
# Treats any unknown method as a tag lookup: a trailing "_tag" is dropped
# from the method name and the rest is handed to find together with the
# remaining arguments.
def method_missing(name, *args)
  tag_name = name.to_s.sub(/_tag\z/, '')
  find(tag_name, *args)
end
236
+
237
+ #TODO: is there a mixin for Hash?
238
# Hash-style access to the tag's attributes; every method simply delegates
# to the @attrs map.

# Returns the value of attribute +k+, or nil when it is absent.
def [](k)
  @attrs[k]
end

# Sets attribute +k+ to +v+.
def []=(k, v)
  @attrs[k] = v
end

# Removes attribute +k+ and returns its former value.
def delete(k)
  @attrs.delete(k)
end

# Returns true when attribute +k+ is present. (The delegate is
# Hash#has_key?; there is no Hash#has_key.)
def has_key?(k)
  @attrs.has_key?(k)
end
253
+
254
+ #End things that would go away if there was a mixin for Hash.
255
+
256
# Yields each direct child in order (this is what Enumerable builds on).
def each
  @contents.each { |child| yield child }
end

# Number of direct children.
def length
  contents.size
end
alias size length

# Asks the owning parser whether tags with this name close themselves.
def self_closing?
  @parser.self_closing_tag?(@name)
end

# Adds the given tag to the end of this tag's contents.
def append(tag)
  @contents << tag
end
273
+
274
# Implicit string conversion; same text as to_s.
def to_str
  to_s
end

# Renders this tag and its contents with structural indentation by calling
# to_s(true).
def prettify
  to_s(true)
end

# Shows the rendered markup instead of the default object dump.
def inspect
  to_s
end
286
+
287
+ #Renders this tag and its contents as a string. NOTE: since REXML
288
+ #consumes whitespace, this method is not certain to reproduce the
289
+ #whitespace present in the original string.
290
# Renders this tag, its attributes and its contents as markup.
#
# show_structure_indent - nil/false for plain output; true to start
#                         indented output at depth 0; an Integer to continue
#                         indented output at that depth
#
# Attributes with a nil/false value are left out. A hidden tag renders only
# its contents. Self-closing tags are written with ' /' and no end tag.
def to_s(show_structure_indent=nil)
  rendered_attrs = []
  @attrs.each { |key, value| rendered_attrs << "#{key}=\"#{value}\"" if value }

  closes_itself = self_closing?
  slash = closes_itself ? ' /' : nil
  end_tag = closes_itself ? nil : "</#{name}>"

  depth = (show_structure_indent == true) ? 0 : show_structure_indent
  depth += 1 if show_structure_indent && !@hidden

  inner = render_contents(depth)
  pad = show_structure_indent ? "\n #{' ' * depth}" : nil
  return inner if @hidden

  attr_text = rendered_attrs.empty? ? '' : ' ' + rendered_attrs.join(' ')
  # nil entries (no padding, no end tag) vanish when the parts are joined.
  [pad, "<#{@name}#{attr_text}#{slash}>", inner, (end_tag ? pad : nil), end_tag].join('')
end
323
+
324
+ #Renders the contents of this tag as a string.
325
# Renders the children of this tag and concatenates the results.
#
# show_structure_indent - passed on to child tags; when truthy, a trailing
#                         line break is chomped (in place) from each piece
def render_contents(show_structure_indent=nil)
  pieces = []
  @contents.each do |child|
    piece = child.is_a?(Tag) ? child.to_s(show_structure_indent) : child.to_s
    next unless piece
    piece.chomp! if show_structure_indent
    pieces << piece
  end
  pieces.join('')
end
343
+
344
# Yields every descendant of this tag depth-first, in document order, using
# an explicit stack instead of recursion. Throwing :stop_iteration from the
# block ends the walk early.
def recursive_children
  pending = [[self, 0]]
  catch(:stop_iteration) do
    until pending.empty?
      node, start = pending.pop
      next unless node.is_a?(TagModule)
      (start...node.contents.length).each do |index|
        child = node.contents[index]
        yield child
        next unless child.is_a?(TagModule) && index < node.contents.length
        # Remember where to resume in this node, then descend into the child.
        pending.push([node, index + 1])
        pending.push([child, 0])
        break
      end
    end
  end
end
361
+
362
+ #Iterates over the direct children of this Tag.
363
# Yields each direct child; throwing :stop_iteration from the block ends the
# iteration early.
def children
  catch(:stop_iteration) do
    @contents.each { |child| yield child }
  end
end
366
+
367
+ #Convenience method to retrieve the first piece of text matching the
368
+ #given criteria. 'text' can be a string, a regular expression object,
369
+ #a Proc that takes a string and returns whether or not the
370
+ #string 'matches', etc.
371
# Returns the first item matching the text criterion (or the block), or nil.
# The criteria hash built here never carries :recursive, so the recursive
# iterator is always the one selected.
def find_text(text=nil, &block)
  criteria = { :text => text, :limit => 1 }
  walker = (criteria[:recursive] == false) ? 'children' : 'recursive_children'
  fetch(method(walker), nil, criteria, block).first
end
376
+
377
+ #Convenience method to retrieve all pieces of text matching the
378
+ #given criteria. 'text' can be a string, a regular expression object,
379
+ #a callable that takes a string and returns whether or not the
380
+ #string 'matches', etc.
381
+ #Args: :limit
382
# Returns every item matching the text criterion (or the block).
#
# text - String, Regexp, Proc, etc.; stored under the :text symbol key,
#        which is the key fetch reads
# args - further options (:limit, :recursive); merged into a copy so the
#        caller's hash is not modified
def find_all_text(text=nil, args={}, &block)
  args = args.merge(:text => text)
  iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
  fetch(iterator, nil, args, block)
end
387
+
388
+ #Extracts a list of Tag objects that match the given criteria. You
389
+ #can specify the name of the Tag and any attributes you want the Tag
390
+ #to have.
391
+ #
392
+ #The value of a key-value pair in the 'attrs' map can be a string, a
393
+ #list of strings, a regular expression object, or a Proc object that
394
+ #takes a string and returns whether or not the string matches for
395
+ #some custom definition of 'matches'. The same is true of the tag
396
+ #name, except that a Proc object will be passed the Tag object instead
397
+ #of just a string.
398
+ #Args: :attrs :text :limit :recursive
399
# Returns all items under this tag that match the criteria. Only direct
# children are searched when :recursive is false; otherwise every
# descendant is.
# Args: :attrs :text :limit :recursive
def find_all(name=nil, args={}, &block)
  walker = (args[:recursive] == false) ? 'children' : 'recursive_children'
  fetch(method(walker), name, args, block)
end
403
+
404
+ #Returns the first Tag or NavigableString object that matches the
405
+ #given criteria. Takes much the same arguments as fetch.
406
+ #args: :attrs :text :limit :recursive
407
# Returns the first item under this tag that matches the criteria, or nil.
# The limit of one is merged into a copy of the options, so a hash the
# caller reuses for a later find_all is not silently limited.
# Args: :attrs :text :recursive
def find(name=nil, args={}, &block)
  args = args.merge(:limit => 1)
  iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
  fetch(iterator, name, args, block)[0]
end
412
+ end
413
+
414
# A markup element; all of its behaviour comes from TagModule.
class Tag
  include TagModule
end

# A run of text in the document: a String that also carries the tree
# navigation methods of PageElement.
class NavigableString < String
  include PageElement
end
421
+
422
+ #This class contains the basic parser and fetch code. It defines
423
+ #a parser that knows nothing about tag behavior except for the
424
+ #following:
425
+ #
426
+ #You can't close a tag without closing all the tags it encloses.
427
+ #That is, "<foo><bar></foo>" actually means
428
+ #"<foo><bar></bar></foo>".
429
+ #
430
+ #[Another possible explanation is "<foo><bar /></foo>", but since
431
+ # this class defines no self_closing_tags, it will never use that
432
+ # explanation.]
433
+ #
434
+ #This class is useful for parsing XML or made-up markup languages,
435
+ #or when BeautifulSoup makes an assumption counter to what you were
436
+ #expecting.
437
+ class BeautifulStoneSoup < SGMLParser
438
+ include TagModule
439
+
440
+ #As a public service we will by default silently replace MS smart quotes
441
+ #and similar characters with their HTML or ASCII equivalents.
442
# Replacement text for the single bytes 0x80-0x9F. Every key is written
# with double quotes so the \x escape yields the byte itself (a
# single-quoted '\x80' would be four literal characters).
@@ms_chars = { "\x80" => '&euro;',
               "\x81" => ' ',
               "\x82" => '&sbquo;',
               "\x83" => '&fnof;',
               "\x84" => '&bdquo;',
               "\x85" => '&hellip;',
               "\x86" => '&dagger;',
               "\x87" => '&Dagger;',
               "\x88" => '&caret;',
               "\x89" => '%',
               "\x8A" => '&Scaron;',
               "\x8B" => '&lt;',
               "\x8C" => '&OElig;',
               "\x8D" => '?',
               "\x8E" => 'Z',
               "\x8F" => '?',
               "\x90" => '?',
               "\x91" => '&lsquo;',
               "\x92" => '&rsquo;',
               "\x93" => '&ldquo;',
               "\x94" => '&rdquo;',
               "\x95" => '&bull;',
               "\x96" => '&ndash;',
               "\x97" => '&mdash;',
               "\x98" => '&tilde;',
               "\x99" => '&trade;',
               "\x9a" => '&scaron;',
               "\x9b" => '&gt;',
               "\x9c" => '&oelig;',
               "\x9d" => '?',
               "\x9e" => 'z',
               "\x9f" => '&Yuml;'}

# [pattern, fix] pairs that feed applies to incoming text. A String fix is a
# substitution template; any other fix is called with the matched text.
#   1. "<x/>" is expanded to "<x></x>"
#   2. whitespace right after "<!" is removed
#   3. bytes 0x80-0x9F are replaced through @@ms_chars
@@parser_massage = [[/<([^<>]*)\/>/, '<\1></\1>'],
                    [/<!\s+([^<>]*)>/, '<!\1>'],
                    [/([\x80-\x9f])/m, proc { |m| @@ms_chars[m]}]
                   ]

# Name given to the root pseudo-tag, which is the parser object itself.
@@rootTagName = '[document]'

# Tag-behaviour tables; all empty here, so this parser assumes nothing about
# individual tags.
@@nestable_tags = {}
@@reset_nesting_tags = {}
@@quoteTags = {}
@@self_closing_tags = {}
486
+
487
+ attr_accessor :hidden
488
+
489
# True when +tag+ (a tag name) is listed in the self-closing tag table.
def self_closing_tag?(tag)
  @@self_closing_tags.key?(tag)
end
492
+
493
+ #Args: :initial_text_is_everything, :avoid_parser_problems
494
# Creates the parser, which doubles as the hidden root tag of the tree.
#
# text - markup to parse; nil to feed text later
# args - :avoid_parser_problems      - false switches the text clean-up off,
#                                      an Enumerable of [pattern, fix] pairs
#                                      replaces the default list, anything
#                                      else (or omitting it) selects the
#                                      default list
#        :initial_text_is_everything - pass false to keep the document open
#                                      for further feed calls
def initialize(text, args={})
  super(self, @@rootTagName)
  @quote_stack = []
  @hidden = 1
  reset

  # Default to true only when the option was not supplied; `|| true` would
  # also swallow an explicit false.
  requested = args[:avoid_parser_problems]
  @avoid_parser_problems = requested.nil? ? true : requested
  if @avoid_parser_problems and not @avoid_parser_problems.is_a? Enumerable
    @avoid_parser_problems = @@parser_massage
  end
  feed(text) if text != nil
  done if args[:initial_text_is_everything] != false
end
507
+
508
# Applies the configured [pattern, fix] clean-ups to +text+ and passes the
# result to the underlying parser's feed. The clean-ups run on a copy, so
# the caller's string is never modified (and may be frozen).
def feed(text)
  if @avoid_parser_problems
    text = text.dup
    @avoid_parser_problems.each do |re, fix|
      if fix.is_a? String
        text.gsub!(re, fix)
      else
        text.gsub!(re) { |x| fix.call(x) }
      end
    end
  end
  super(text)
end
524
+
525
# Two documents are equal when they render to the same string.
def ==(anObject)
  to_s == anObject.to_s
end
528
+
529
# Finishes the document: flushes pending text, then closes every tag that
# is still open until the root is current again.
def done
  end_text
  pop_tag until @currentTag.name == @@rootTagName
end

# Returns the parser to its initial state: superclass state first, then an
# empty text buffer and a tag stack holding only the root (this object).
def reset
  super
  @currentText = []
  @tag_stack = []
  @currentTag = nil
  push_tag(self)
end
541
+
542
# Opens +tag+: appends it to the current tag (when there is one), pushes it
# on the stack of open tags and makes it the current tag.
def push_tag(tag)
  @currentTag.append(tag) if @currentTag
  @tag_stack << tag
  @currentTag = tag
end
548
+
549
# Closes the current tag and returns the tag that is current afterwards.
#
# A tag whose only child is a NavigableString gets that child as its
# 'string' property, so tag.string is shorthand for tag.contents[0]. The
# current tag is left unchanged when the stack becomes empty.
def pop_tag
  @tag_stack.pop
  only_child = @currentTag.contents.length == 1 ? @currentTag.contents[0] : nil
  @currentTag.string = only_child if only_child.is_a?(NavigableString)
  @currentTag = @tag_stack[-1] unless @tag_stack.empty?
  @currentTag
end
563
+
564
+ # StreamListener implementation
565
+
566
# Parser callback for an opening tag.
#
# name  - tag name
# attrs - list of [attribute, value] pairs; one pair of surrounding double
#         quotes is stripped from each String value
#
# While a quote tag is open the tag is not real: its markup is rebuilt
# (with interpolated name and attributes) and recorded as text. Otherwise
# pending text is flushed, tags implicitly closed by this one are popped,
# and a new Tag is created, linked into parse order and opened.
# Self-closing tags are closed again immediately.
def unknown_starttag(name, attrs)
  attrs = attrs.inject({}) do |map, pair|
    key, value = pair
    # Values that are not Strings (e.g. nil) are stored untouched.
    if value.is_a?(String) && value.length >= 2 &&
       value[0, 1] == '"' && value[-1, 1] == '"'
      value = value[1..-2]
    end
    map[key] = value
    map
  end

  unless @quote_stack.empty?
    # This is not a real tag; pass it through as text.
    rendered = attrs.map { |k, v| "#{k}=\"#{v}\"" }
    handle_data(rendered.empty? ? "<#{name}>" : "<#{name} #{rendered.join(' ')}>")
    return
  end

  end_text
  self_closing = @@self_closing_tags.has_key?(name)
  smart_pop(name) unless self_closing
  tag = Tag.new(self, name, attrs, @currentTag, @previous_parsed)
  @previous_parsed.next_parsed = tag if @previous_parsed
  @previous_parsed = tag
  push_tag(tag)
  pop_tag if self_closing
  @quote_stack.push(name) if @@quoteTags.has_key?(name)
end
598
+
599
# Parser callback for a closing tag.
#
# End tags of self-closing tags are ignored (those were closed when they
# were opened). While a quote tag is open, any end tag other than the one
# closing the quote is not real and is recorded as the text "</name>".
# Otherwise pending text is flushed, the stack is popped up to and including
# the most recent tag of this name, and a quote opened by that tag ends.
def unknown_endtag(name)
  return if @@self_closing_tags.has_key?(name)

  if !@quote_stack.empty? && @quote_stack[-1] != name
    # This is not a real end tag.
    handle_data("</#{name}>")
    return
  end
  end_text
  pop_to_tag(name)
  @quote_stack.pop if !@quote_stack.empty? && @quote_stack[-1] == name
end
616
+
617
# Buffers a piece of text until end_text turns the buffer into a node.
def handle_data(data)
  @currentText.push(data)
end

# Propagates comments right through as text.
def handle_comment(data)
  handle_data("<!--#{data}-->")
end

# Passes a special construct through as "<...>" text.
def handle_special(data)
  handle_data("<#{data}>")
end

# Re-emits a character reference as "&#ref;".
# NOTE(review): assumes +ref+ arrives without the leading '#', as in the
# Python library this is a port of - confirm against sgml-parser.
def unknown_charref(ref)
  handle_data("&##{ref};")
end

# Re-emits an unrecognised entity reference as "&ref;".
def unknown_entityref(ref)
  handle_data("&#{ref};")
end
637
+
638
# Declaration-style callbacks. Each one re-creates the markup it was called
# for and records it as text through handle_data.

# <!ATTLIST ...>; only the raw content is used.
def attlistdecl(element_name, attributes, raw_content)
  handle_data("<!ATTLIST #{raw_content}>")
end

# CDATA section, closed with the full "]]>" terminator.
def cdata(content)
  handle_data("<![CDATA[#{content}]]>")
end

# <!DOCTYPE ...>; the arguments are joined with single spaces.
def doctype(*args)
  handle_data("<!DOCTYPE #{args.join(' ')}>")
end

# <!ELEMENT ...>
def elementdecl(content)
  handle_data("<!ELEMENT #{content}>")
end

# Intentionally ignored.
def entity(content)
end

# <!ENTITY ...>; +content+ is joined with single spaces.
def entitydecl(content)
  handle_data("<!ENTITY #{content.join(' ')}>")
end

# Processing instruction, closed with "?>".
def instruction(name, instruction)
  handle_data("<?#{name} #{instruction}?>")
end

# <!NOTATION ...>
def notationdecl(content)
  handle_data("<!NOTATION #{content}>")
end

# XML declaration. +encoding+ and +standalone+ are written as attributes
# only when given.
def xmldecl(version, encoding, standalone)
  encoding = " encoding=\"#{encoding}\"" if encoding
  standalone = " standalone=\"#{standalone}\"" if standalone
  handle_data("<?xml version=\"#{version}\"#{encoding}#{standalone}?>")
end
680
+
681
+ #Called when we're done collecting some text, declarations, etc.
682
# Turns the buffered text pieces into one NavigableString child of the
# current tag and links it into parse order; the buffer is emptied either
# way. Text consisting only of whitespace is collapsed to a single "\n"
# when it contains a line break, otherwise to a single space.
def end_text
  text = @currentText.join('')
  unless text.empty?
    text = (text =~ /\n/ ? "\n" : ' ') if text.strip.empty?
    node = NavigableString.new(text)
    node.setup(@currentTag, @previous_parsed)
    @previous_parsed.next_parsed = node if @previous_parsed
    @previous_parsed = node
    @currentTag.contents << node
  end
  @currentText = []
end
701
+
702
+ # Helper methods
703
+
704
+ private
705
+
706
+ #Pops the tag stack up to and including the most recent
707
+ #instance of the given tag. If inclusive_pop is false, pops the tag
708
+ #stack up to but *not* including the most recent instance of
709
+ #the given tag.
710
# Pops open tags up to the most recent tag called +name+.
#
# name          - tag name to pop to; the root tag name is ignored
# inclusive_pop - true to pop the matching tag as well, false to stop just
#                 above it
#
# Returns the value of the last pop_tag call, or nil when nothing was
# popped (including when no open tag has that name).
def pop_to_tag(name, inclusive_pop=true)
  return if name == @@rootTagName

  depth = 0
  (@tag_stack.length - 1).downto(0) do |i|
    next unless name == @tag_stack[i].name
    depth = @tag_stack.length - i
    break
  end
  depth -= 1 unless inclusive_pop

  last_popped = nil
  depth.times { last_popped = pop_tag }
  last_popped
end
729
+
730
+ #We need to pop up to the previous tag of this type, unless
731
+ #one of this tag's nesting reset triggers comes between this
732
+ #tag and the previous tag of this type, OR unless this tag is a
733
+ #generic nesting trigger and another generic nesting trigger
734
+ #comes between this tag and the previous tag of this type.
735
+ #
736
+ #Examples:
737
+ # <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
738
+ # <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
739
+ # <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
740
+ # <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
741
+ #
742
+ # <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
743
+ # <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
744
+ # <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
745
# Closes the tags that opening a +name+ tag implicitly ends.
#
# The open tags are scanned from the innermost outwards:
# * for a non-nestable tag, the first open tag of the same name ends the
#   scan and the stack is popped up to and including it;
# * an open tag listed among this tag's nesting-reset triggers - or, for a
#   tag without a trigger list that itself resets nesting, any open tag
#   that resets nesting - ends the scan and the stack is popped up to but
#   not including it.
# Nothing is popped when the scan finds neither.
def smart_pop(name)
  triggers = @@nestable_tags[name]
  nestable = !triggers.nil?
  resets_nesting = @@reset_nesting_tags.has_key?(name)
  target = nil
  inclusive = true

  @tag_stack.reverse_each do |open_tag|
    if (open_tag.nil? || open_tag.name == name) && !nestable
      target = name
      break
    end
    hit = if triggers.nil?
            resets_nesting && @@reset_nesting_tags.has_key?(open_tag.name)
          else
            triggers.include?(open_tag.name)
          end
    if hit
      target = open_tag.name
      inclusive = false
      break
    end
  end

  pop_to_tag(target, inclusive) if target
end
774
+
775
+ protected
776
+
777
+ #Turns a list of maps, lists, or scalars into a single map.
778
+ #Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
779
+ #of lists and partial maps.
780
# Folds a mix of maps, lists and scalars into one Hash.
#
# default - value given to every key that comes from a list or a scalar
#           (the same object is shared by all of those keys)
# args    - Hashes are merged in as they are, Arrays contribute each element
#           as a key, anything else becomes a key itself
def BeautifulStoneSoup.build_tag_map(default, *args)
  map = {}
  args.each do |portion|
    case portion
    when Hash  then map.update(portion)
    when Array then portion.each { |key| map[key] = default }
    else map[portion] = default
    end
  end
  map
end
795
+ end
796
+
797
+ #This parser knows the following facts about HTML:
798
+ #
799
+ #* Some tags have no closing tag and should be interpreted as being
800
+ # closed as soon as they are encountered.
801
+ #
802
+ #* The text inside some tags (ie. 'script') may contain tags which
803
+ # are not really part of the document and which should be parsed
804
+ # as text, not tags. If you want to parse the text as tags, you can
805
+ # always fetch it and parse it explicitly.
806
+ #
807
+ #* Tag nesting rules:
808
+ #
809
+ # Most tags can't be nested at all. For instance, the occurrence of
810
+ # a <p> tag should implicitly close the previous <p> tag.
811
+ #
812
+ # <p>Para1<p>Para2
813
+ # should be transformed into:
814
+ # <p>Para1</p><p>Para2
815
+ #
816
+ # Some tags can be nested arbitrarily. For instance, the occurrence
817
+ # of a <blockquote> tag should _not_ implicitly close the previous
818
+ # <blockquote> tag.
819
+ #
820
+ # Alice said: <blockquote>Bob said: <blockquote>Blah
821
+ # should NOT be transformed into:
822
+ # Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
823
+ #
824
+ # Some tags can be nested, but the nesting is reset by the
825
+ # interposition of other tags. For instance, a <tr> tag should
826
+ # implicitly close the previous <tr> tag within the same <table>,
827
+ # but not close a <tr> tag in another table.
828
+ #
829
+ # <table><tr>Blah<tr>Blah
830
+ # should be transformed into:
831
+ # <table><tr>Blah</tr><tr>Blah
832
+ # but,
833
+ # <tr>Blah<table><tr>Blah
834
+ # should NOT be transformed into
835
+ # <tr>Blah<table></tr><tr>Blah
836
+ #
837
+ #Differing assumptions about tag nesting rules are a major source
838
+ #of problems with the BeautifulSoup class. If BeautifulSoup is not
839
+ #treating as nestable a tag your page author treats as nestable,
840
+ #try writing a subclass.
841
# HTML-aware parser: fills the tag-behaviour tables declared by
# BeautifulStoneSoup. The tables are class variables updated in place with
# replace, so the same Hash objects are the ones the inherited parser
# methods read.
class BeautifulSoup < BeautifulStoneSoup

  # Tags that never have an end tag.
  @@self_closing_tags.replace(
    build_tag_map(nil, %w[br hr input img meta spacer link frame])
  )

  # NOTE(review): the parser methods visible in this file consult
  # @@quoteTags; this differently named variable does not appear to be read
  # anywhere in view - confirm whether @@quoteTags was intended.
  @@quote_tags = {'script' => nil}

  # According to the HTML standard, each of these inline tags can contain
  # another tag of the same type, and pages commonly use them that way.
  @@nestable_inline_tags = %w[span font q object bdo sub sup center]

  # Block tags that can likewise contain another tag of the same type.
  @@nestable_block_tags = %w[blockquote div fieldset ins del]

  # Lists can contain other lists, but there are restrictions: each key
  # maps to the tags that reset its nesting.
  @@nestable_list_tags = {
    'ol' => [],
    'ul' => [],
    'li' => ['ul', 'ol'],
    'dl' => [],
    'dd' => ['dl'],
    'dt' => ['dl']
  }

  # Tables can contain other tables, but there are restrictions.
  @@nestable_table_tags = {
    'table' => ['tr', 'td'],
    'tr' => ['table'],
    'td' => ['tr'],
    'th' => ['tr']
  }

  @@non_nestable_block_tags = %w[address form p pre]

  # If one of these tags is encountered, all tags up to the next tag of
  # this type are popped.
  @@reset_nesting_tags.replace(
    build_tag_map(nil,
                  @@nestable_block_tags,
                  'noscript',
                  @@non_nestable_block_tags,
                  @@nestable_list_tags,
                  @@nestable_table_tags)
  )

  @@nestable_tags.replace(
    build_tag_map([],
                  @@nestable_inline_tags,
                  @@nestable_block_tags,
                  @@nestable_list_tags,
                  @@nestable_table_tags)
  )

end
882
+
883
+ # This class will push a tag with only a single string child into
884
+ # the tag's parent as an attribute. The attribute's name is the tag
885
+ # name, and the value is the string child. An example should give
886
+ # the flavor of the change:
887
+ #
888
+ # <foo><bar>baz</bar></foo>
889
+ # =>
890
+ # <foo bar="baz"><bar>baz</bar></foo>
891
+ #
892
+ # You can then access fooTag['bar'] instead of fooTag.barTag.string.
893
+ #
894
+ # This is, of course, useful for scraping structures that tend to
895
+ # use subelements instead of attributes, such as SOAP messages. Note
896
+ # that it modifies its input, so don't print the modified version
897
+ # out.
898
# Parser variant that copies the text of a single-string child tag into its
# parent as an attribute named after the child.
class BeautifulSOAP < BeautifulStoneSoup
  # Before the normal pop: when the tag being closed is a Tag whose only
  # child is a NavigableString, and the parent has no truthy attribute of
  # that name yet, store the string on the parent. Nothing happens (and nil
  # is returned) while only one entry is left on the stack.
  def pop_tag
    return unless @tag_stack.size > 1

    parent, tag = @tag_stack[-2], @tag_stack[-1]
    single_string = tag.is_a?(Tag) &&
                    tag.contents.size == 1 &&
                    tag.contents[0].is_a?(NavigableString)
    parent[tag.name] = tag.contents[0] if single_string && !parent[tag.name]
    super
  end
end
911
+
912
+ #Enterprise class names! It has come to our attention that some people
913
+ #think the names of the Rubyful Soup parser classes are too silly
914
+ #and "unprofessional" for use in enterprise screen-scraping. We feel
915
+ #your pain! For such-minded folk, the Rubyful Soup Consortium And
916
+ #Rootin' Tootin' Texas Delicatessen recommends renaming this file to
917
+ #"RobustParser.rb" (or, in cases of extreme enterprisitude,
918
+ #"RobustParserBeanInterface.class") and using the following
919
+ #enterprise-friendly class aliases:
920
# Enterprise-friendly name for BeautifulStoneSoup.
class RobustXMLParser < BeautifulStoneSoup
end

# Enterprise-friendly name for BeautifulSoup.
class RobustHTMLParser < BeautifulSoup
end

# Enterprise-friendly name for BeautifulSOAP.
class SimplifyingSOAPParser < BeautifulSOAP
end
923
+
924
+ print BeautifulSoup.new(ARGF.read).prettify if $0 == __FILE__
925
+
@@ -0,0 +1,431 @@
1
+ #Unit tests for Rubyful Soup.
2
+ #
3
+ #These tests make sure that Rubyful Soup works as it should. If you
4
+ #find a bug in Rubyful Soup, the best way to express it is as a test
5
+ #case like this that fails.
6
+
7
+ require 'test/unit'
8
+ require 'rubygems'
9
+ require 'rubyful_soup'
10
+
11
+ class SoupTest < Test::Unit::TestCase
12
+
13
+ #Parse the given text and make sure its string rep is the other
14
+ #given text.
15
+ def assert_soup_equals(toParse, rep=nil, c=BeautifulStoneSoup)
16
+ if rep == nil
17
+ rep = toParse
18
+ end
19
+ assert_equal(c.new(toParse).to_s(false), rep)
20
+ end
21
+
22
+ #Null test so that Test::Unit does not complain that this base class has no tests.
23
+ def test_null
24
+ end
25
+
26
+ end
27
+
28
+ #Tests the various ways of fetching tags from a soup.
29
class ToteThatTag < SoupTest

  # Parses a small fixture: two <a> tags, two <b> tags and one namespaced
  # <abc:d> tag. The ids are chosen so that exactly two tags (a#a and b#b)
  # have an id equal to their own name, and two tags share the id "x".
  def setup
    ml = %{
<a id="x">1</a>
<a id="a">2</a>
<b id="b">3</b>
<b id="x">4</b>
<abc:d width="100">5</abc:d>}
    @soup = BeautifulStoneSoup.new(ml)
  end

  # Lookup by tag name, including a namespaced name, and find_next.
  def test_fetch_by_name
    matching = @soup.find_all('a')
    assert_equal(matching.length, 2)
    assert_equal(matching[0].name, 'a')
    assert_equal(matching[0], @soup.find('a'))
    assert_equal(@soup.find('abc:d').contents.length, 1)

    first_b = @soup.find('b')
    next_b = first_b.find_next('b')
    assert_equal(next_b.contents[0], '4')
    assert_equal(next_b['id'], 'x')
  end

  # A block given to find_all/find_text must select the same items as the
  # equivalent name/text argument.
  def test_fetch_by_block
    a = @soup.find_all('a')
    b = @soup.find_all do |x|
      x.is_a?(Tag) && x.name == 'a'
    end
    assert_equal(a, b)

    a = @soup.find_text('3')
    b = @soup.find_text do |x|
      x.is_a?(NavigableString) && x == '3'
    end
    assert_equal(a, b)

    matching = @soup.find_all do |x|
      x.respond_to?('name') && x.name == x['id']
    end
    assert_equal(matching.length, 2)
    assert_equal(matching[0].name, 'a')
  end

  # Lookup by attribute value, by absent attribute (nil), and by a
  # non-string value (100 against width="100").
  def test_fetch_by_attribute
    matching = @soup.find_all(nil, :attrs=>{'id' => 'x'})
    assert_equal(matching.length, 2)
    assert_equal(matching[0].name, 'a')
    assert_equal(matching[1].name, 'b')

    assert_equal(@soup.find_all(nil, :attrs=>{'id' => nil}).length, 1)

    assert_equal(@soup.find_all(nil, :attrs=>{'width' => 100}).length, 1)
  end

  # soup.b and soup.b_tag are both shorthand for soup.find('b').
  def test_tag_name_as_method
    first_b = @soup.find('b')
    assert_equal(first_b, @soup.b)
    assert_equal(first_b, @soup.b_tag)
  end

  # An array of names matches tags with any of those names.
  def test_fetch_by_list
    matching = @soup.find_all(['a', 'abc:d'])
    assert_equal(matching.length, 3)
  end

  # A hash of names matches tags whose name is a key of the hash.
  def test_fetch_by_hash
    matching = @soup.find_all({'a' => true, 'b' => true})
    assert_equal(matching.length, 4)
  end

  # A regular expression is matched against tag names.
  def test_fetch_by_re
    r = /a.*/
    assert_equal(@soup.find_all(r).length, 3)
  end

  # A Proc passed as the name argument acts as the match predicate.
  def test_fetch_by_method
    # No `return` inside the block: in a Proc.new block, `return` exits the
    # method that created the proc (this test), so the assertions below
    # would never run.
    matcher = Proc.new { |x| x.name == x['id'] }
    matching = @soup.find_all(matcher)
    assert_equal(matching.length, 2)
    assert_equal(matching[0].name, 'a')
  end

end
117
+
118
+ #Testing the integrity of the parse tree.
119
+ class FollowThatTag < SoupTest
120
+
121
+ @@PROXIMITY_TEST = BeautifulStoneSoup.new('<b id="1"><b id="2"><b id="3"><b id="4">')
122
+
123
+ @@SIBLING_TEST = BeautifulStoneSoup.new('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')
124
+
125
+ def test_parents
126
+ soup = BeautifulSoup.new('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah</b></ul></ul></ul>')
127
+ b = soup.find('b')
128
+ assert_equal(b.find_parents('ul', :attrs=>{'id' => 'foo'}).length, 2)
129
+ assert_equal(b.find_parent('ul')['a'], 'b')
130
+ end
131
+
132
+ def test_next_sibling
133
+ soup = @@SIBLING_TEST
134
+ tag = 'blockquote'
135
+ b = soup.find(tag, :attrs=>{'id' => 2})
136
+ assert_equal(b.find_next(tag)['id'], '2.1')
137
+ assert_equal(b.find_next_sibling(tag)['id'], '3')
138
+ assert_equal(b.find_next_sibling(tag)['id'], '3')
139
+ assert_equal(b.find_next_siblings(tag).length, 2)
140
+ assert_equal(b.find_next_siblings(tag, :attrs=>{'id' => 4}).length, 1)
141
+ end
142
+
143
+ def test_previous_sibling
144
+ soup = @@SIBLING_TEST
145
+ tag = 'blockquote'
146
+ b = soup.find(tag, :attrs=>{'id' => 3})
147
+ assert_equal(b.find_previous(tag)['id'], '2.1')
148
+ assert_equal(b.find_previous_sibling(tag)['id'], '2')
149
+ assert_equal(b.find_previous_sibling(tag)['id'], '2')
150
+ assert_equal(b.find_previous_siblings(tag).length, 2)
151
+ assert_equal(b.find_previous_siblings(tag, :attrs=>{'id' => 1}).length, 1)
152
+ end
153
+
154
+ def test_text_navigation
155
+ soup = BeautifulSoup.new('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
156
+ baz = soup.find_text('Baz')
157
+ assert_equal(baz.find_parent("i")['id'], '1')
158
+ assert_equal(baz.find_next(nil, :text=> 'Blee'), 'Blee')
159
+ assert_equal(baz.find_next_sibling(nil, :text=>'Blee'), 'Blee')
160
+ assert_equal(baz.find_next_sibling(nil, :text=>'Blargh'), nil)
161
+ assert_equal(baz.find_next_sibling('hr')['id'], '1')
162
+ end
163
+
164
+ end
165
+
166
+ #Tests the next_sibling and previous_sibling navigation.
167
+ class SiblingRivalry < SoupTest
168
+
169
+ def test_siblings
170
+ soup = BeautifulSoup.new("<ul><li>1<p>A</p>B</li><li>2</li><li>3</li></ul>")
171
+ second_li = soup.find('li').next_sibling
172
+ assert_equal(second_li.name, 'li')
173
+ assert_equal(second_li.string, '2')
174
+ assert_equal(soup.find_text('1').next_sibling.name, 'p')
175
+ assert_equal(soup.find('p').next_sibling, 'B')
176
+ assert_equal(soup.find('p').next_sibling.previous_sibling.next_sibling,
177
+ 'B')
178
+ end
179
+ end
180
+
181
+ #Tests the various built-in functions of Tag objects.
182
+ class TagsAreObjectsToo < SoupTest
183
+
184
+ @@SOUP = BeautifulSoup.new('<top id="1">1<b>2</b>3</top>')
185
+
186
+ def test_length
187
+ assert_equal(@@SOUP.top.length, 3)
188
+ end
189
+
190
+ def test_hash_lookup
191
+ assert_equal(@@SOUP.top['id'], "1")
192
+ end
193
+
194
+ def test_iterator
195
+ bucket = []
196
+ @@SOUP.top.each do |x|
197
+ bucket << x
198
+ end
199
+ assert_equal(bucket.length, 3)
200
+ assert_equal(bucket[2], "3")
201
+ end
202
+
203
+ end
204
+
205
+ #Tests the use of 'string' as an alias for a tag's only content.
206
class StringEmUp < SoupTest

  # A tag whose only child is a string exposes that string via #string.
  def test_string
    s = BeautifulSoup.new('<b>foo</b>')
    assert_equal(s.b.string, 'foo')
  end

  # A tag with several children (text, tag, text) has no single string,
  # so #string is expected to be nil.
  def test_lack_of_string
    s = BeautifulSoup.new("<b>f<i>e</i>o</b>")
    assert_nil(s.b.string)
  end
end
218
+
219
+ #Tests the limit argument.
220
+ class ThatsMyLimit < SoupTest
221
+
222
+ def test_basic_limits
223
+ s = BeautifulSoup.new('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
224
+ assert_equal(s.find_all('br').length, 4)
225
+ assert_equal(s.find_all('br', :limit=> 2).length, 2)
226
+ end
227
+ end
228
+
229
+ #Testing the modification of the tree.
230
+ class WriteOnlyCode < SoupTest
231
+
232
+ def test_replace_contents
233
+ soup = BeautifulSoup.new('<a>foo</a>')
234
+ soup.a.contents[0] = (NavigableString.new('bar'))
235
+ assert_equal(soup.render_contents, '<a>bar</a>')
236
+ end
237
+
238
+ def test_modify_attributes
239
+ soup = BeautifulSoup.new('<a id="1"></a>')
240
+ first_a = soup.find('a')
241
+
242
+ first_a['id'] = 2
243
+ assert_equal(soup.render_contents, '<a id="2"></a>')
244
+ first_a['id'] = nil
245
+ assert_equal(soup.render_contents, '<a></a>')
246
+
247
+ first_a['id2'] = 'foo'
248
+ assert_equal(soup.render_contents, '<a id2="foo"></a>')
249
+ first_a.delete('id2')
250
+ assert_equal(soup.render_contents, '<a></a>')
251
+ end
252
+
253
+ #Makes sure tags don't step on each others' toes.
254
+ def test_new_tag_
255
+ soup = BeautifulSoup.new('')
256
+ a = Tag.new(soup, 'a')
257
+ ol = Tag.new(soup, 'ol')
258
+ a["href"] = "http://foo.com/"
259
+ assert_equal(ol["href"], nil)
260
+ end
261
+ end
262
+
263
+ #Our operators do it all! Call now!
264
+ class OperatorOverload < SoupTest
265
+
266
+ def test_tag_name_as_find
267
+ # Tests that referencing a tag name as a member delegates to find.
268
+ soup = BeautifulSoup.new('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
269
+ assert_equal(soup.b.i, soup.find('b').find('i'))
270
+ assert_equal(soup.b.i.string, 'bar')
271
+ assert_equal(soup.b['id'], '1')
272
+ assert_equal(soup.b.contents[0], 'foo')
273
+ assert(soup.a == nil)
274
+
275
+ #Test the .foo_tag variant of .foo.
276
+ assert_equal(soup.b_tag.i_tag.string, 'bar')
277
+ assert_equal(soup.b.i_tag.string, 'bar')
278
+ assert_equal(soup.find('b').find('i'), soup.b_tag.i_tag)
279
+ end
280
+ end
281
+
282
+ #Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!
283
+ class NestableEgg < SoupTest
284
+
285
+ def test_para_inside_blockquote
286
+ soup = BeautifulSoup.new('<blockquote><p><b>Foo</b></p></blockquote><p>Bar')
287
+ assert_equal(soup.blockquote.p.b.string, 'Foo')
288
+ assert_equal(soup.blockquote.b.string, 'Foo')
289
+ assert_equal(soup.find('p', :recursive=>false).string, 'Bar')
290
+ end
291
+
292
+ def test_nested_tables
293
+ text = %{<table id="1"><tr><td>Here's another table:
294
+ <table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>}
295
+ soup = BeautifulSoup.new(text)
296
+ assert_equal(soup.table.table.td.string, 'Juicy text')
297
+ assert_equal(soup.find_all('table').length, 2)
298
+ assert_equal(soup.table.find_all('table').length, 1)
299
+ assert_equal(soup.find('table', :attrs=>{'id' => 2}).parent.parent.parent.name,
300
+ 'table')
301
+ end
302
+
303
+ def test_bad_nested_tables
304
+ soup = BeautifulSoup.new("<table><tr><table><tr id='nested'></tr></table></tr></table>")
305
+ assert_equal(soup.table.tr.table.tr['id'], 'nested')
306
+ end
307
+ end
308
+
309
+
310
+ #Here we test cleanup of text that breaks an unaltered parser or is just
311
+ #obnoxious.
312
+ class CleanupOnAisleFour < SoupTest
313
+
314
+ def test_self_closing_tag
315
+ assert_equal(BeautifulStoneSoup.new("Foo<br/>Bar").find('br').to_s,
316
+ '<br />')
317
+ assert_soup_equals('<p>test1<br/>test2</p>',
318
+ '<p>test1<br />test2</p>')
319
+ end
320
+
321
+ def test_bad_closing_tags
322
+ BeautifulStoneSoup.new("<a>Foo<b>Bar</a>")
323
+ end
324
+
325
+ def test_premature_closing_tag
326
+ BeautifulStoneSoup.new("</b><a>Foo<b>Bar</a>")
327
+ end
328
+
329
+ def test_bad_doctype
330
+ assert_soup_equals("<!DOCTYPE foo='bar'>")
331
+ end
332
+
333
+ def test_whitespace_in_declaration
334
+ assert_soup_equals('<! DOCTYPE>', '<!DOCTYPE>')
335
+ end
336
+
337
+ def test_JunkInDeclaration
338
+ assert_soup_equals('<! Foo = -8>a', '<!Foo = -8>a')
339
+ end
340
+
341
+ def test_incomplete_declaration
342
+ assert_soup_equals('a<!b <p>c', 'a<!b <p>c</p>')
343
+ end
344
+
345
+ def test_valid_but_bogus_declaration
346
+ assert_soup_equals('<! Foo >a', '<!Foo >a')
347
+ end
348
+
349
+ #This fails for a totally bogus reason! I can't figure it out.
350
+ #def test_smart_quotes_not_so_smart_anymore_FAILS
351
+ # assert_soup_equals("\x91Foo\x92", '&lsquo;Foo&rsquo;')
352
+ #end
353
+
354
+ #def test_incomplete_declaration_at_endFAILS
355
+ # assert_soup_equals('a<!b')
356
+ #end
357
+
358
+ end
359
+
360
+ #Verifies that the parser treats multiple feed calls the same as one
361
+ #big feed call only if constructed with
362
+ #:initial_text_is_everything => false.
363
+ class KeepOnParsing < SoupTest
364
+
365
+ def test_multiple_parse_calls
366
+ f1 = '<foo>bah<bar>'
367
+ f2 = 'blee</bar></foo>'
368
+
369
+ s1 = BeautifulSoup.new(f1+f2)
370
+ s2 = BeautifulSoup.new(f1)
371
+ s2.feed(f2)
372
+ s3 = BeautifulSoup.new(f1, :initial_text_is_everything => false)
373
+ s3.feed(f2)
374
+ assert_not_equal(s1, s2)
375
+ assert_equal(s1, s3)
376
+ end
377
+ end
378
+
379
+ #Verifies that BeautifulSOAP parser works.
380
+ class SOAPMeUp < SoupTest
381
+ def test_basic_soap
382
+ s = "<foo><bar>baz</bar></foo>"
383
+ soup = BeautifulSOAP.new(s)
384
+ assert_equal(soup.to_s, %{<foo bar="baz"><bar>baz</bar></foo>})
385
+ end
386
+
387
+ def test_dont_overwrite_existing_attr
388
+ s = %{<foo bar="don't kill me!"><bar>baz</bar></foo>}
389
+ soup = BeautifulSOAP.new(s)
390
+ assert_equal(soup.to_s, s)
391
+ end
392
+ end
393
+
394
+ #The Unicode test suite has not yet been ported because I haven't
395
+ #figured out how Ruby does Unicode.
396
+
397
+ # class UnicodeRed < SoupTest
398
+ # "Makes sure Unicode works."
399
+
400
+ # def setUp
401
+ # text = 'foo<b>bar</b>'
402
+ # @soup = BeautifulStoneSoup
403
+ # @soup.feed(text)
404
+
405
+ # def test_BasicUnicode
406
+ # import types
407
+ # sType = types.StringType
408
+ # uType = types.UnicodeType
409
+
410
+ # u = u'\3100'
411
+ # #It starts out ASCII...
412
+ # assert_equal(type(@soup.renderContents), sType)
413
+ # assert_equal(type(@soup.prettify), sType)
414
+ # #But you can have unicode if you want.
415
+ # assert_equal(type(unicode(@soup)), uType)
416
+
417
+ # #Add a Unicode character and it's Unicode.
418
+ # @soup.feed(u)
419
+ # assert_equal(type(@soup.renderContents), uType)
420
+ # assert_equal(type(@soup.prettify), uType)
421
+ # #But you can have ASCII if you want.
422
+ # assert_equal(type(str(@soup)), sType)
423
+
424
+ # #The part without any Unicode is still ASCII.
425
+ # assert_equal(type(@soup.b.prettify), sType)
426
+
427
+ # #But if you add a Unicode character it'll become Unicode.
428
+ # @soup.b['foo'] = u'\3100'
429
+ # assert_equal(type(@soup.b.prettify), uType)
430
+
431
+
metadata ADDED
@@ -0,0 +1,52 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.4
3
+ specification_version: 1
4
+ name: rubyful_soup
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.1
7
+ date: 2005-10-21
8
+ summary: An HTML/XML parser that handles bad markup and provides tree traversal methods.
9
+ require_paths:
10
+ - lib
11
+ email: leonardr@segfault.org
12
+ homepage: http://www.crummy.com/software/RubyfulSoup/
13
+ rubyforge_project:
14
+ description: "Rubyful Soup is a *ML parser that makes screen-scraping easy. It won't choke on
15
+ bad markup, and it's easy to locate the part of a document you want."
16
+ autorequire:
17
+ default_executable:
18
+ bindir: bin
19
+ has_rdoc: true
20
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
21
+ requirements:
22
+ -
23
+ - ">"
24
+ - !ruby/object:Gem::Version
25
+ version: 0.0.0
26
+ version:
27
+ platform: ruby
28
+ authors:
29
+ - Leonard Richardson
30
+ files:
31
+ - lib/rubyful_soup.rb
32
+ - tests/rubyful_soup_tests.rb
33
+ - CHANGELOG
34
+ test_files:
35
+ - tests/rubyful_soup_tests.rb
36
+ rdoc_options: []
37
+ extra_rdoc_files:
38
+ - CHANGELOG
39
+ executables: []
40
+ extensions: []
41
+ requirements: []
42
+ dependencies:
43
+ - !ruby/object:Gem::Dependency
44
+ name: htmltools
45
+ version_requirement:
46
+ version_requirements: !ruby/object:Gem::Version::Requirement
47
+ requirements:
48
+ -
49
+ - ">"
50
+ - !ruby/object:Gem::Version
51
+ version: 0.0.0
52
+ version: