rubyful_soup 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG ADDED
@@ -0,0 +1,12 @@
1
+ Rubyful Soup Changelog
2
+
3
+ 1.0.1
4
+
5
+ Changes from James Edward Gray (james at grayproductions dot net) to
6
+ quiet warnings.
7
+
8
+ Packaged as a gem for the first time.
9
+
10
+ 1.0.0
11
+
12
+ First full release
@@ -0,0 +1,925 @@
1
+ #Rubyful Soup
2
+ #Elixir and Tonic
3
+ #"The Screen-Scraper's Friend"
4
+ #v1.0.1
5
+ #http://www.crummy.com/software/RubyfulSoup/
6
+ #
7
+ #Rubyful Soup is a port to the Ruby language and idiom of the Python
8
+ #library Beautiful Soup.
9
+ #See http://www.crummy.com/software/BeautifulSoup/ for details on the original.
10
+
11
+ #This library requires the sgml-parser library, written by Takahiro
12
+ #Maebashi. The easiest way to get it is to install the "htmltools"
13
+ #gem.
14
+ require 'rubygems'
15
+ require 'sgml-parser'
16
+
17
+ #UTF-8 voodoo--does this really work?
18
+ $KCODE = 'u'
19
+ require 'jcode'
20
+
21
+ #This code makes SGMLParser able to parse XML with namespaces.
22
class SGMLParser
  # Widen the tag-name pattern so names may contain ':' (XML namespaces).
  # The pattern is only swapped when the parser already defines one; the old
  # constant is removed first so it is replaced rather than redefined.
  if const_defined?(:Tagfind)
    send(:remove_const, :Tagfind)
    const_set(:Tagfind, /[a-zA-Z][-_.:a-zA-Z0-9]*/)
  end
end
28
+
29
+ module PageElement
30
+
31
+ attr_reader :parser
32
+ attr_accessor :parent, :previous_parsed, :next_parsed, :previous_sibling
33
+ attr_accessor :next_sibling
34
+
35
# Initializes the navigation links of this element.
#
# parent          - the enclosing element, or nil.
# previous_parsed - the element parsed immediately before this one, or nil.
#
# When the parent already has children, its current last child becomes this
# element's previous sibling and is pointed forward at this element.
def setup(parent=nil, previous_parsed=nil)
  @parent, @previous_parsed = parent, previous_parsed
  @next_parsed = @previous_sibling = @next_sibling = nil
  return unless @parent && !@parent.contents.empty?

  last_child = @parent.contents.last
  @previous_sibling = last_child
  last_child.next_sibling = self
end
46
+
47
+ #A bunch of different iterators over a parsed document.
48
# Generate the navigation iterators. Each generated method repeatedly follows
# one link accessor starting from this element and yields every non-nil
# element it reaches.
[
  # Iterates in parse order over the rest of the items in this document.
  [:next_parsed_items, :next_parsed],

  # Iterates in reverse parse order over all previously parsed items in
  # this document.
  [:previous_parsed_items, :previous_parsed],

  # Iterates in parse order over all subsequent siblings of this item.
  [:next_siblings, :next_sibling],

  # Iterates in reverse parse order over all prior siblings of this item.
  [:previous_siblings, :previous_sibling],

  # Iterates upwards through the parentage of this item.
  [:parents, :parent]
].each do |iterator_name, link|
  class_eval <<-RUBY
    def #{iterator_name}
      cursor = self
      while cursor
        cursor = cursor.#{link}
        yield cursor if cursor
      end
    end
  RUBY
end
75
+
76
# Generate the singular/plural finder pairs. Each triple names the method
# returning the first match, the method returning every match, and the
# iterator that supplies the candidates.
#
# The singular form passes :limit => 1 under the Symbol key that fetch
# actually reads, and merges it into a copy so the caller's options hash is
# left untouched.
[
  # First item/all items matching the given criteria and appearing after
  # this PageElement in the document.
  [:find_next, :find_all_next, 'next_parsed_items'],

  # First item/all items matching the given criteria and appearing before
  # this PageElement in the document.
  [:find_previous, :find_all_previous, 'previous_parsed_items'],

  # Nearest sibling/all siblings matching the given criteria and appearing
  # before this PageElement in the document.
  [:find_previous_sibling, :find_previous_siblings, 'previous_siblings'],

  # Nearest sibling/all siblings matching the given criteria and appearing
  # after this PageElement in the document.
  [:find_next_sibling, :find_next_siblings, 'next_siblings'],

  # Nearest parent/all parents matching the given criteria.
  [:find_parent, :find_parents, 'parents'],
].each do |singular, plural, method_name|
  class_eval %{
    def #{singular}(name=nil, args={}, &block)
      fetch(method('#{method_name}'), name, args.merge(:limit => 1), block)[0]
    end

    def #{plural}(name=nil, args={}, &block)
      fetch(method('#{method_name}'), name, args, block)
    end
  }
end
109
+
110
+ protected
111
+
112
+ #Returns a list of items matching the given criteria, obtained by
113
+ #iterating over the given iterator.
114
# Collects the items produced by +iterator+ that satisfy the given criteria.
#
# iterator - a callable that yields candidate items to the block it is given.
# name     - tag-name criterion (anything PageElement.matches accepts), or nil.
# args     - options hash read with Symbol keys: :attrs, :limit and :text.
#            A non-hash :attrs value is shorthand for {'class' => value}.
# block    - optional predicate; when present it alone decides each match.
#
# Returns an Array of matching items, cut off once :limit is reached.
def fetch(iterator, name, args, block)
  limit = args[:limit]
  text = args[:text]
  attrs = args[:attrs] || {}
  attrs = {'class' => attrs} unless attrs.respond_to? :keys

  found = []
  catch(:stop_iteration) do
    iterator.call do |item|
      matched =
        if block
          block.call(item) ? true : false
        elsif item.is_a? Tag
          # A tag matches if its name matches and its attributes line up.
          !text && (!name || PageElement.matches(item, name)) &&
            attrs.all? { |attr, wanted| PageElement.matches(item[attr], wanted) }
        elsif text
          # A text matches if its string value matches the text criterion.
          PageElement.matches(item, text)
        else
          false
        end
      next unless matched

      found.push(item)
      throw :stop_iteration if limit and found.length >= limit
    end
  end
  found
end
156
+
157
+ #Used to tell whether a Tag or a NavigableString "matches" some data
158
+ #structure.
159
# Tells whether a Tag or a NavigableString "matches" some criterion.
#
# chunk        - a Tag, a string-like value, or an Array of page elements.
# how_to_match - a Proc, Regexp, Array, Hash, or anything with to_s.
#
# An Array chunk matches when any NavigableString in it matches. A Proc
# criterion receives the chunk itself (so it sees the Tag object); every
# other criterion is compared against the tag name or the string value.
def PageElement.matches(chunk, how_to_match)
  if chunk.is_a? Array
    return chunk.any? { |piece| piece.is_a?(NavigableString) && matches(piece, how_to_match) }
  end
  return how_to_match.call(chunk) if how_to_match.is_a? Proc

  # From here on the comparison is made against a string.
  chunk = chunk.name if chunk.is_a? Tag
  chunk = chunk.to_s unless chunk.is_a? String

  case how_to_match
  when Regexp then !how_to_match.match(chunk).nil?
  when Array  then how_to_match.any? { |candidate| candidate == chunk }
  when Hash   then !how_to_match[chunk].nil?
  else how_to_match.to_s == chunk
  end
end
192
+
193
+ end
194
+
195
+ module TagModule
196
+
197
+ include Enumerable
198
+ include PageElement
199
+
200
+ attr_accessor :name, :contents, :attrs, :string
201
+
202
+ #I tried to have Tag subclass Method, but it killed the
203
+ #whole thing. Maybe I should just leave well enough alone.
204
+ #
205
+ #def arity
206
+ # return methods('find_all').arity
207
+ #end
208
+ #
209
+ #def call(*args)
210
+ # return find_all(*args)
211
+ #end
212
+ #
213
+ #def to_proc
214
+ # return methods('find_all').to_proc
215
+ #end
216
+
217
# Builds a tag owned by +parser+.
#
# parser   - the parser that created this tag.
# name     - the tag name.
# attrs    - attribute hash; nil means no attributes.
# parent   - enclosing element, forwarded to setup.
# previous - previously parsed element, forwarded to setup.
def initialize(parser, name, attrs=nil, parent=nil, previous=nil)
  @hidden = false
  @parser, @name = parser, name
  @attrs = attrs || {}
  @contents = []
  setup(parent, previous)
end
226
+
227
+ #soup.title_tag or soup.title is the same as soup.find('title')
228
# Treats an unknown method call as a tag lookup: soup.title_tag and
# soup.title both become find('title'). A trailing '_tag' is removed from the
# method name before it is handed to find together with any arguments.
def method_missing(name, *args)
  tag_name = name.to_s.sub(/_tag\z/, '')
  find(tag_name, *args)
end
236
+
237
+ #TODO: is there a mixin for Hash?
238
# Reads the attribute stored under key +k+ (nil when absent).
def [](k)
  @attrs[k]
end
241
+
242
# Stores attribute value +v+ under key +k+.
def []=(k, v)
  @attrs.store(k, v)
end
245
+
246
# Removes the attribute stored under key +k+ and returns its former value
# (whatever the underlying attribute hash's delete returns).
def delete(k)
  removed = @attrs.delete(k)
  removed
end
249
+
250
# Tells whether this tag has an attribute stored under key +k+.
#
# Delegates to Hash#has_key?; the previous call to `has_key` (without the
# question mark) named a method Hash does not define and raised
# NoMethodError on every use.
def has_key?(k)
  @attrs.has_key?(k)
end
253
+
254
+ #End things that would go away if there was a mixin for Hash.
255
+
256
# Yields each direct child of this tag in document order (this is what makes
# the Enumerable mixin work).
def each
  @contents.each do |child|
    yield child
  end
end
259
+
260
# Number of direct children of this tag. Also available as +size+.
def length
  contents.length
end
alias size length
264
+
265
# Asks the owning parser whether a tag with this name is self-closing.
def self_closing?
  @parser.self_closing_tag?(@name)
end
268
+
269
+ #Adds the given tag to the contents of this tag
270
# Adds the given element to the end of this tag's contents.
def append(tag)
  @contents << tag
end
273
+
274
# Implicit string conversion; identical to to_s.
def to_str
  to_s
end
277
+
278
+ #Renders this tag and its contents as a pretty-printed string.
279
+ def prettify
280
+ return to_s(true)
281
+ end
282
+
283
# Debug representation: the same markup that to_s produces.
def inspect
  rendered = to_s
  rendered
end
286
+
287
+ #Renders this tag and its contents as a string. NOTE: since REXML
288
+ #consumes whitespace, this method is not certain to reproduce the
289
+ #whitespace present in the original string.
290
+ def to_s(show_structure_indent=nil)
291
+ attrs = []
292
+ @attrs.each { |k,v| attrs.push("#{k}=\"#{v}\"") if v }
293
+ if self_closing?
294
+ close = ' /'
295
+ closeTag = nil
296
+ else
297
+ close = nil
298
+ closeTag = "</#{name}>"
299
+ end
300
+ indent_increment = show_structure_indent==true ? 0 : show_structure_indent
301
+ if show_structure_indent
302
+ indent_increment += 1 unless @hidden
303
+ end
304
+ contents = render_contents(indent_increment)
305
+ space = "\n #{' ' * indent_increment}" if show_structure_indent
306
+ if @hidden
307
+ s = contents
308
+ else
309
+ s = []
310
+ attribute_string = ''
311
+ unless attrs.empty?
312
+ attribute_string = ' ' + attrs.join(' ')
313
+ end
314
+ s.push(space) if show_structure_indent
315
+ s.push("<#{@name}#{attribute_string}#{close}>")
316
+ s.push(contents)
317
+ s.push(space) if closeTag and show_structure_indent
318
+ s.push(closeTag)
319
+ s = s.join('')
320
+ end
321
+ return s
322
+ end
323
+
324
+ #Renders the contents of this tag as a string.
325
# Renders the contents of this tag as a single string.
#
# show_structure_indent - forwarded to each child Tag's to_s; when truthy a
#                         trailing newline is also chomped from every piece.
def render_contents(show_structure_indent=nil)
  rendered = @contents.map do |child|
    text = child.is_a?(Tag) ? child.to_s(show_structure_indent) : child.to_s
    text.chomp! if text && show_structure_indent
    text
  end
  rendered.select { |text| text }.join('')
end
343
+
344
# Yields every descendant of this tag, depth first, in document order.
#
# The walk is iterative: an explicit stack holds [node, next child index]
# pairs. When a child is itself a TagModule the walk saves its position in
# the current node, descends into the child, and resumes afterwards.
# Throwing :stop_iteration from the block ends the walk early.
def recursive_children
  pending = [[self, 0]]
  catch(:stop_iteration) do
    until pending.empty?
      node, first_index = pending.pop
      next unless node.is_a? TagModule

      (first_index...node.contents.length).each do |position|
        child = node.contents[position]
        yield child
        descend = child.is_a?(TagModule) && !node.contents.empty? &&
                  position < node.contents.length
        next unless descend

        pending.push([node, position + 1])
        pending.push([child, 0])
        break
      end
    end
  end
end
361
+
362
+ #Iterates over the direct children of this Tag.
363
# Iterates over the direct children of this Tag. Throwing :stop_iteration
# from the block ends the iteration early.
def children
  catch(:stop_iteration) do
    @contents.each { |child| yield child }
  end
end
366
+
367
+ #Convenience method to retrieve the first piece of text matching the
368
+ #given criteria. 'text' can be a string, a regular expression object,
369
+ #a Proc that takes a string and returns whether or not the
370
+ #string 'matches', etc.
371
# Convenience method returning the first piece of text matching the given
# criterion, or nil. 'text' can be a string, a regular expression, a Proc
# that takes a string and returns whether it matches, etc. A block, when
# given, is used as the match predicate instead.
def find_text(text=nil, &block)
  criteria = { :text => text, :limit => 1 }
  walker = criteria[:recursive] == false ? 'children' : 'recursive_children'
  fetch(method(walker), nil, criteria, block).first
end
376
+
377
+ #Convenience method to retrieve all pieces of text matching the
378
+ #given criteria. 'text' can be a string, a regular expression object,
379
+ #a callable that takes a string and returns whether or not the
380
+ #string 'matches', etc.
381
+ #Args: :limit
382
# Convenience method returning all pieces of text matching the given
# criterion. 'text' can be a string, a regular expression, a Proc that takes
# a string and returns whether it matches, etc.
#
# args - options hash; :limit and :recursive are honoured.
#
# The criterion is stored under the Symbol key :text, which is the key fetch
# reads (the String key 'text' was never looked at, so the filter was
# silently dropped). It is merged into a copy so the caller's hash is not
# modified.
def find_all_text(text=nil, args={}, &block)
  criteria = args.merge(:text => text)
  walker = criteria[:recursive] == false ? 'children' : 'recursive_children'
  fetch(method(walker), nil, criteria, block)
end
387
+
388
+ #Extracts a list of Tag objects that match the given criteria. You
389
+ #can specify the name of the Tag and any attributes you want the Tag
390
+ #to have.
391
+ #
392
+ #The value of a key-value pair in the 'attrs' map can be a string, a
393
+ #list of strings, a regular expression object, or a Proc object that
394
+ #takes a string and returns whether or not the string matches for
395
+ #some custom definition of 'matches'. The same is true of the tag
396
+ #name, except that a Proc object will be passed the Tag object instead
397
+ #of just a string.
398
+ #Args: :attrs :text :limit :recursive
399
# Returns every Tag (or text, with :text) under this tag that matches.
#
# name  - tag-name criterion, or nil for any tag.
# args  - options hash: :attrs, :text, :limit, :recursive. Passing
#         :recursive => false restricts the search to direct children.
# block - optional predicate used instead of the other criteria.
def find_all(name=nil, args={}, &block)
  walker = args[:recursive] == false ? 'children' : 'recursive_children'
  fetch(method(walker), name, args, block)
end
403
+
404
+ #Returns the first Tag or NavigableString object that matches the
405
+ #given criteria. Takes much the same arguments as fetch.
406
+ #args: :attrs :text :limit :recursive
407
# Returns the first Tag or NavigableString that matches the given criteria,
# or nil when nothing matches. Takes the same arguments as find_all.
#
# args - options hash: :attrs, :text, :recursive (:limit is forced to 1).
#
# The limit is merged into a copy of +args+. Writing it into the caller's
# hash would make a reused options hash silently limit later find_all calls
# to a single result.
def find(name=nil, args={}, &block)
  criteria = args.merge(:limit => 1)
  walker = criteria[:recursive] == false ? 'children' : 'recursive_children'
  fetch(method(walker), name, criteria, block)[0]
end
412
+ end
413
+
414
# A markup element. All behaviour comes from TagModule.
class Tag
  include TagModule
end
417
+
418
# A piece of document text: a String that also carries the PageElement
# navigation links and finders.
class NavigableString < String
  include PageElement
end
421
+
422
+ #This class contains the basic parser and fetch code. It defines
423
+ #a parser that knows nothing about tag behavior except for the
424
+ #following:
425
+ #
426
+ #You can't close a tag without closing all the tags it encloses.
427
+ #That is, "<foo><bar></foo>" actually means
428
+ #"<foo><bar></bar></foo>".
429
+ #
430
+ #[Another possible explanation is "<foo><bar /></foo>", but since
431
+ # this class defines no self_closing_tags, it will never use that
432
+ # explanation.]
433
+ #
434
+ #This class is useful for parsing XML or made-up markup languages,
435
+ #or when BeautifulSoup makes an assumption counter to what you were
436
+ #expecting.
437
+ class BeautifulStoneSoup < SGMLParser
438
+ include TagModule
439
+
440
+ #As a public service we will by default silently replace MS smart quotes
441
+ #and similar characters with their HTML or ASCII equivalents.
442
# Replacement text for the single bytes 0x80-0x9F (MS "smart" characters).
# Every key must be a double-quoted "\xNN" escape so that it is the byte
# itself; a single-quoted '\x80' is the four characters backslash, x, 8, 0
# and would never be found by a one-byte lookup.
@@ms_chars = { "\x80" => '&euro;',
               "\x81" => ' ',
               "\x82" => '&sbquo;',
               "\x83" => '&fnof;',
               "\x84" => '&bdquo;',
               "\x85" => '&hellip;',
               "\x86" => '&dagger;',
               "\x87" => '&Dagger;',
               "\x88" => '&caret;',
               "\x89" => '%',
               "\x8A" => '&Scaron;',
               "\x8B" => '&lt;',
               "\x8C" => '&OElig;',
               "\x8D" => '?',
               "\x8E" => 'Z',
               "\x8F" => '?',
               "\x90" => '?',
               "\x91" => '&lsquo;',
               "\x92" => '&rsquo;',
               "\x93" => '&ldquo;',
               "\x94" => '&rdquo;',
               "\x95" => '&bull;',
               "\x96" => '&ndash;',
               "\x97" => '&mdash;',
               "\x98" => '&tilde;',
               "\x99" => '&trade;',
               "\x9a" => '&scaron;',
               "\x9b" => '&gt;',
               "\x9c" => '&oelig;',
               "\x9d" => '?',
               "\x9e" => 'z',
               "\x9f" => '&Yuml;'}
474
+
475
# Default [pattern, fix] pairs that feed applies to incoming text. A String
# fix is a gsub replacement template; a callable fix receives the matched
# text and returns its replacement.
@@parser_massage = [
  [%r{<([^<>]*)/>}, '<\1></\1>'],
  [/<!\s+([^<>]*)>/, '<!\1>'],
  [/([\x80-\x9f])/m, proc { |byte| @@ms_chars[byte] }]
]

# Name given to the hidden root element that holds the whole document.
@@rootTagName = '[document]'

# Tag-behaviour tables; all empty for this generic parser.
@@nestable_tags = {}
@@reset_nesting_tags = {}
@@quoteTags = {}
@@self_closing_tags = {}
486
+
487
+ attr_accessor :hidden
488
+
489
# Tells whether +tag+ (a tag name) is listed in the self-closing tag table.
def self_closing_tag?(tag)
  @@self_closing_tags.key?(tag)
end
492
+
493
+ #Args: :initial_text_is_everything, :avoid_parser_problems
494
# Creates a parser and, by default, parses +text+ completely.
#
# text - markup to parse, or nil to feed text later.
# args - options hash:
#        :avoid_parser_problems      - true (default) to apply the built-in
#                                      text fixes, false to apply none, or an
#                                      Enumerable of [pattern, fix] pairs.
#        :initial_text_is_everything - pass false to leave the document open
#                                      for further feed calls.
def initialize(text, args={})
  super(self, @@rootTagName)
  @quote_stack = []
  @hidden = 1
  reset

  # `option || true` could never yield false, so the fixes could not be
  # switched off; only an absent/nil option falls back to the default.
  requested = args[:avoid_parser_problems]
  @avoid_parser_problems = requested.nil? ? true : requested
  if @avoid_parser_problems and not @avoid_parser_problems.is_a? Enumerable
    @avoid_parser_problems = @@parser_massage
  end
  feed(text) if text != nil
  done if args[:initial_text_is_everything] != false
end
507
+
508
# Feeds +text+ to the underlying parser, first applying the configured
# [pattern, fix] pairs when parser-problem avoidance is enabled.
#
# The fixes are applied to a copy of +text+: substituting in place rewrote
# the caller's string and raised on frozen input.
def feed(text)
  if @avoid_parser_problems
    text = text.dup
    @avoid_parser_problems.each do |re, fix|
      if fix.is_a? String
        text.gsub!(re, fix)
      else
        text.gsub!(re) { |matched| fix.call(matched) }
      end
    end
  end
  super(text)
end
524
+
525
# Two soups are equal when they render to the same string; the other object
# is compared through its own to_s.
def ==(anObject)
  theirs = anObject.to_s
  theirs == to_s
end
528
+
529
# Finishes the document: flushes pending text, then closes every tag that is
# still open until only the root element remains current.
def done
  end_text
  pop_tag until @currentTag.name == @@rootTagName
end
533
+
534
# Resets the underlying parser and this object's parse state, then pushes the
# soup itself onto the tag stack as the root element.
def reset
  super
  @currentText, @currentTag, @tag_stack = [], nil, []
  push_tag(self)
end
541
+
542
# Opens +tag+: appends it to the current tag (if there is one), pushes it on
# the tag stack and makes it the current tag. Returns the new current tag.
def push_tag(tag)
  @currentTag.append(tag) if @currentTag
  @tag_stack << tag
  @currentTag = @tag_stack.last
end
548
+
549
# Closes the current tag and returns the tag that becomes current.
#
# A tag whose only child is a NavigableString gets that child as its 'string'
# property, so that soup.tag.string is shorthand for soup.tag.contents[0].
# The current tag is left unchanged when the stack ends up empty.
def pop_tag
  @tag_stack.pop
  closing = @currentTag
  only_child = closing.contents.length == 1 ? closing.contents[0] : nil
  closing.string = only_child if only_child.is_a? NavigableString

  @currentTag = @tag_stack[-1] unless @tag_stack.empty?
  @currentTag
end
563
+
564
+ # StreamListener implementation
565
+
566
# Parser callback for a start tag.
#
# name  - the tag name.
# attrs - list of [attribute name, value] pairs; a value wrapped in double
#         quotes has the quotes removed.
#
# While a quote tag is open the start tag is not a real tag: its markup is
# rebuilt and recorded as text. Otherwise pending text is flushed, implicitly
# closed tags are popped, and a new Tag is linked into the parse order and
# pushed (and popped again immediately when it is self-closing).
def unknown_starttag(name, attrs)
  attrs = attrs.inject({}) do |map, pair|
    key, value = pair
    if value.is_a?(String) and value[0, 1] == '"' and value[-1, 1] == '"'
      value = value[1..-2]
    end
    map[key] = value
    map
  end

  unless @quote_stack.empty?
    # Double-quoted strings are required here: single quotes do not
    # interpolate, so the template text itself used to be emitted.
    pieces = [name] + attrs.map { |key, value| "#{key}=\"#{value}\"" }
    handle_data("<#{pieces.join(' ')}>")
    return
  end

  end_text
  self_closing = @@self_closing_tags.has_key?(name)
  smart_pop(name) unless self_closing
  tag = Tag.new(self, name, attrs, @currentTag, @previous_parsed)
  @previous_parsed.next_parsed = tag if @previous_parsed
  @previous_parsed = tag
  push_tag(tag)
  pop_tag if self_closing
  @quote_stack.push(name) if @@quoteTags.has_key?(name)
end
598
+
599
# Parser callback for an end tag.
#
# End tags of self-closing tags are ignored (they were closed when they were
# opened). While a quote tag is open, any other end tag is recorded as text.
# Otherwise pending text is flushed and the stack is popped to the most
# recent tag with this name.
def unknown_endtag(name)
  return if @@self_closing_tags.has_key?(name)

  if not @quote_stack.empty? and @quote_stack[-1] != name
    # Not a real end tag. The string must be double-quoted to interpolate
    # the tag name.
    handle_data("</#{name}>")
    return
  end
  end_text
  pop_to_tag(name)
  @quote_stack.pop if not @quote_stack.empty? and @quote_stack[-1] == name
end
616
+
617
# Buffers a piece of text until end_text turns the buffer into a string node.
def handle_data(data)
  @currentText << data
end
620
+
621
+ #Propagate comments right through.
622
# Propagates a comment straight through as text.
#
# data - the comment body (without the <!-- --> delimiters).
#
# The body is taken from the +data+ parameter; the previous reference to an
# undefined `comment` raised NameError on every comment.
def handle_comment(data)
  handle_data("<!--#{data}-->")
end
625
+
626
# Passes a special construct through as text, wrapped in angle brackets again.
def handle_special(data)
  handle_data('<' + data.to_s + '>')
end
629
+
630
# Passes an unresolved numeric character reference through as text.
#
# ref - the reference number as text, e.g. "8217" for "&#8217;".
#
# The '#' is written twice on purpose: in "&#{ref};" the '#{' starts an
# interpolation and the '#' of the reference is lost ("&8217;").
def unknown_charref(ref)
  handle_data("&##{ref};")
end
633
+
634
# Passes an unresolved named entity reference through as text.
#
# ref - the entity name, e.g. "nbsp" for "&nbsp;".
#
# The name comes from the +ref+ parameter and is written back in entity form;
# the previous body referenced an undefined `content` and raised NameError.
def unknown_entityref(ref)
  handle_data("&#{ref};")
end
637
+
638
# Passes an attribute-list declaration through as text. Only the raw content
# is used; the element name and parsed attributes are ignored.
def attlistdecl(element_name, attributes, raw_content)
  handle_data('<!ATTLIST ' + raw_content.to_s + '>')
end
641
+
642
# Passes a CDATA section through as text.
#
# content - the text inside the section.
#
# The section is closed with "]]>"; the closing '>' was missing, which left
# the emitted section unterminated.
def cdata(content)
  handle_data("<![CDATA[#{content}]]>")
end
645
+
646
+ ###
647
+
648
# Passes a document type declaration through as text; the received parts are
# joined with single spaces.
def doctype(*args)
  handle_data('<!DOCTYPE ' + args.join(' ') + '>')
end
655
+
656
# Passes an element declaration through as text.
def elementdecl(content)
  handle_data('<!ELEMENT ' + content.to_s + '>')
end
659
+
660
# Intentionally does nothing: the content is ignored and nil is returned.
def entity(content)
end
663
+
664
# Passes an entity declaration through as text.
#
# content - a list of declaration parts, joined with single spaces.
def entitydecl(content)
  joined = content.join(' ')
  handle_data('<!ENTITY ' + joined + '>')
end
667
+
668
# Passes a processing instruction through as text.
#
# NOTE(review): the emitted text ends in ">" rather than "?>"; whether the
# instruction argument already carries the trailing "?" cannot be told from
# this file - confirm against the parser that calls this.
def instruction(name, instruction)
  handle_data('<?' + name.to_s + ' ' + instruction.to_s + '>')
end
671
+
672
# Passes a notation declaration through as text.
def notationdecl(content)
  handle_data('<!NOTATION ' + content.to_s + '>')
end
675
+
676
# Passes an XML declaration through as text.
#
# version    - the declared XML version.
# encoding   - the declared encoding, or nil to omit the attribute.
# standalone - the standalone value, or nil to omit the attribute.
#
# The pieces are built with double-quoted strings so the values are actually
# interpolated (single-quoted strings emitted the template text itself), and
# the declaration is closed with "?>".
def xmldecl(version, encoding, standalone)
  declaration = "<?xml version=\"#{version}\""
  declaration << " encoding=\"#{encoding}\"" if encoding
  declaration << " standalone=\"#{standalone}\"" if standalone
  handle_data(declaration + '?>')
end
680
+
681
+ #Called when we're done collecting some text, declarations, etc.
682
# Called when we're done collecting some text, declarations, etc.
#
# Joins the buffered pieces; if anything was collected it becomes a
# NavigableString linked into the parse order and appended to the current
# tag. A whitespace-only run is collapsed to a single newline when it
# contains one, otherwise to a single space. The buffer is always emptied.
def end_text
  collected = @currentText.join('')
  unless collected.empty?
    if collected.strip.empty?
      collected = collected.include?("\n") ? "\n" : ' '
    end
    node = NavigableString.new(collected)
    node.setup(@currentTag, @previous_parsed)
    @previous_parsed.next_parsed = node if @previous_parsed
    @previous_parsed = node
    @currentTag.contents.push(node)
  end
  @currentText = []
end
701
+
702
+ # Helper methods
703
+
704
+ private
705
+
706
+ #Pops the tag stack up to and including the most recent
707
+ #instance of the given tag. If inclusivePop is false, pops the tag
708
+ #stack up to but *not* including the most recent instance of
709
+ #the given tag.
710
# Pops the tag stack up to and including the most recent instance of the
# given tag. If inclusive_pop is false, pops up to but *not* including that
# instance. Nothing is popped when the name is not on the stack or is the
# root tag's name.
#
# Returns the value of the last pop_tag call, or nil when nothing was popped.
def pop_to_tag(name, inclusive_pop=true)
  return if name == @@rootTagName

  # Count how many entries sit at or above the most recent match.
  depth = 0
  found = @tag_stack.reverse.any? do |open_tag|
    depth += 1
    name == open_tag.name
  end
  pops = found ? depth : 0
  pops -= 1 unless inclusive_pop

  last_popped = nil
  pops.times { last_popped = pop_tag }
  last_popped
end
729
+
730
+ #We need to pop up to the previous tag of this type, unless
731
+ #one of this tag's nesting reset triggers comes between this
732
+ #tag and the previous tag of this type, OR unless this tag is a
733
+ #generic nesting trigger and another generic nesting trigger
734
+ #comes between this tag and the previous tag of this type.
735
+ #
736
+ #Examples:
737
+ # <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
738
+ # <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
739
+ # <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
740
+ # <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
741
+ #
742
+ # <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
743
+ # <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
744
+ # <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
745
# Implicitly closes open tags before a new +name+ tag is opened.
#
# The open tags are scanned from the innermost outwards:
# * a non-nestable tag pops up to and including its last occurrence;
# * otherwise, on meeting one of this tag's own nesting-reset triggers - or,
#   for a tag without triggers that itself resets nesting, any other
#   nesting-reset tag - the stack is popped up to but not including it.
#
# The scan uses reverse_each with a block-local variable; the old `for p`
# loop shadowed Kernel#p and ended each pass with a `p = p.parent`
# assignment that the next iteration overwrote.
def smart_pop(name)
  nesting_reset_triggers = @@nestable_tags[name]
  is_nestable = !nesting_reset_triggers.nil?
  is_reset_nesting = @@reset_nesting_tags.has_key?(name)
  pop_to = nil
  inclusive = true

  @tag_stack.reverse_each do |open_tag|
    if (open_tag.nil? or open_tag.name == name) and not is_nestable
      # Non-nestable tags get popped to the top or to their last occurrence.
      pop_to = name
      break
    end

    resets_nesting =
      if nesting_reset_triggers.nil?
        is_reset_nesting and @@reset_nesting_tags.has_key?(open_tag.name)
      else
        nesting_reset_triggers.include?(open_tag.name)
      end
    if resets_nesting
      pop_to = open_tag.name
      inclusive = false
      break
    end
  end

  pop_to_tag(pop_to, inclusive) if pop_to
end
774
+
775
+ protected
776
+
777
+ #Turns a list of maps, lists, or scalars into a single map.
778
+ #Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
779
+ #of lists and partial maps.
780
# Turns a list of maps, lists, or scalars into a single map.
#
# default - value given to every key that comes from a list or a scalar.
# args    - the portions to combine: a Hash is merged as is, an Array maps
#           each item to +default+, anything else is one key mapped to
#           +default+. Later portions overwrite earlier ones.
#
# Returns the combined Hash.
def BeautifulStoneSoup.build_tag_map(default, *args)
  built = {}
  args.each do |portion|
    case portion
    when Hash
      built.update(portion)
    when Array
      portion.each { |key| built[key] = default }
    else
      built[portion] = default
    end
  end
  built
end
795
+ end
796
+
797
+ #This parser knows the following facts about HTML:
798
+ #
799
+ #* Some tags have no closing tag and should be interpreted as being
800
+ # closed as soon as they are encountered.
801
+ #
802
+ #* The text inside some tags (ie. 'script') may contain tags which
803
+ # are not really part of the document and which should be parsed
804
+ # as text, not tags. If you want to parse the text as tags, you can
805
+ # always fetch it and parse it explicitly.
806
+ #
807
+ #* Tag nesting rules:
808
+ #
809
+ # Most tags can't be nested at all. For instance, the occurrence of
810
+ # a <p> tag should implicitly close the previous <p> tag.
811
+ #
812
+ # <p>Para1<p>Para2
813
+ # should be transformed into:
814
+ # <p>Para1</p><p>Para2
815
+ #
816
+ # Some tags can be nested arbitrarily. For instance, the occurrence
817
+ # of a <blockquote> tag should _not_ implicitly close the previous
818
+ # <blockquote> tag.
819
+ #
820
+ # Alice said: <blockquote>Bob said: <blockquote>Blah
821
+ # should NOT be transformed into:
822
+ # Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
823
+ #
824
+ # Some tags can be nested, but the nesting is reset by the
825
+ # interposition of other tags. For instance, a <tr> tag should
826
+ # implicitly close the previous <tr> tag within the same <table>,
827
+ # but not close a <tr> tag in another table.
828
+ #
829
+ # <table><tr>Blah<tr>Blah
830
+ # should be transformed into:
831
+ # <table><tr>Blah</tr><tr>Blah
832
+ # but,
833
+ # <tr>Blah<table><tr>Blah
834
+ # should NOT be transformed into
835
+ # <tr>Blah<table></tr><tr>Blah
836
+ #
837
+ #Differing assumptions about tag nesting rules are a major source
838
+ #of problems with the BeautifulSoup class. If BeautifulSoup is not
839
+ #treating as nestable a tag your page author treats as nestable,
840
+ #try writing a subclass.
841
class BeautifulSoup < BeautifulStoneSoup

  # Tags that have no closing tag and are closed as soon as they are opened.
  @@self_closing_tags.replace(build_tag_map(nil, %w[br hr input img meta spacer link frame]))

  # NOTE(review): the parser methods in this file consult @@quoteTags, not
  # @@quote_tags, so this table does not appear to be read anywhere - confirm
  # the intended name before relying on <script> contents being kept as text.
  @@quote_tags = {'script' => nil}

  # According to the HTML standard, each of these inline tags can
  # contain another tag of the same type. Furthermore, it's common
  # to actually use these tags this way.
  @@nestable_inline_tags = %w[span font q object bdo sub sup center]

  # According to the HTML standard, these block tags can contain
  # another tag of the same type. Furthermore, it's common
  # to actually use these tags this way.
  @@nestable_block_tags = %w[blockquote div fieldset ins del]

  # Lists can contain other lists, but there are restrictions.
  @@nestable_list_tags = {
    'ol' => [],
    'ul' => [],
    'li' => ['ul', 'ol'],
    'dl' => [],
    'dd' => ['dl'],
    'dt' => ['dl']
  }

  # Tables can contain other tables, but there are restrictions.
  @@nestable_table_tags = {
    'table' => ['tr', 'td'],
    'tr' => ['table'],
    'td' => ['tr'],
    'th' => ['tr']
  }

  @@non_nestable_block_tags = %w[address form p pre]

  # If one of these tags is encountered, all tags up to the next tag of
  # this type are popped.
  @@reset_nesting_tags.replace(
    build_tag_map(nil,
                  @@nestable_block_tags,
                  'noscript',
                  @@non_nestable_block_tags,
                  @@nestable_list_tags,
                  @@nestable_table_tags)
  )

  # Every nestable tag, mapped to its nesting-reset triggers (none for the
  # inline and block tags).
  @@nestable_tags.replace(
    build_tag_map([],
                  @@nestable_inline_tags,
                  @@nestable_block_tags,
                  @@nestable_list_tags,
                  @@nestable_table_tags)
  )

end
882
+
883
+ # This class will push a tag with only a single string child into
884
+ # the tag's parent as an attribute. The attribute's name is the tag
885
+ # name, and the value is the string child. An example should give
886
+ # the flavor of the change:
887
+ #
888
+ # <foo><bar>baz</bar></foo>
889
+ # =>
890
+ # <foo bar="baz"><bar>baz</bar></foo>
891
+ #
892
+ # You can then access fooTag['bar'] instead of fooTag.barTag.string.
893
+ #
894
+ # This is, of course, useful for scraping structures that tend to
895
+ # use subelements instead of attributes, such as SOAP messages. Note
896
+ # that it modifies its input, so don't print the modified version
897
+ # out.
898
class BeautifulSOAP < BeautifulStoneSoup
  # Before closing a tag, copies its single string child into the parent as
  # an attribute named after the tag, unless the parent already has a value
  # under that name. Nothing is popped while only one entry is on the stack.
  def pop_tag
    return unless @tag_stack.size > 1

    parent, tag = @tag_stack.last(2)
    only_string_child = tag.is_a?(Tag) &&
                        tag.contents.size == 1 &&
                        tag.contents[0].is_a?(NavigableString)
    if only_string_child && !parent[tag.name]
      parent[tag.name] = tag.contents[0]
    end
    super
  end
end
911
+
912
+ #Enterprise class names! It has come to our attention that some people
913
+ #think the names of the Rubyful Soup parser classes are too silly
914
+ #and "unprofessional" for use in enterprise screen-scraping. We feel
915
+ #your pain! For such-minded folk, the Rubyful Soup Consortium And
916
+ #Rootin' Tootin' Texas Delicatessen recommends renaming this file to
917
+ #"RobustParser.rb" (or, in cases of extreme enterprisitude,
918
+ #"RobustParserBeanInterface.class") and using the following
919
+ #enterprise-friendly class aliases:
920
# Enterprise-friendly alias for BeautifulStoneSoup.
class RobustXMLParser < BeautifulStoneSoup
end

# Enterprise-friendly alias for BeautifulSoup.
class RobustHTMLParser < BeautifulSoup
end

# Enterprise-friendly alias for BeautifulSOAP.
class SimplifyingSOAPParser < BeautifulSOAP
end
923
+
924
# When run as a script, pretty-print the markup read from ARGF (the files
# named on the command line, or standard input).
if $0 == __FILE__
  soup = BeautifulSoup.new(ARGF.read)
  print soup.prettify
end
925
+
@@ -0,0 +1,431 @@
1
+ #Unit tests for Rubyful Soup.
2
+ #
3
+ #These tests make sure the Rubyful Soup works as it should. If you
4
+ #find a bug in Rubyful Soup, the best way to express it is as a test
5
+ #case like this that fails.
6
+
7
+ require 'test/unit'
8
+ require 'rubygems'
9
+ require 'rubyful_soup'
10
+
11
class SoupTest < Test::Unit::TestCase

  # Parses +toParse+ with the parser class +c+ and asserts that the
  # resulting document renders, via to_s(false), as +rep+.
  #
  # toParse:: markup to feed to the parser.
  # rep::     expected rendering; defaults to +toParse+ itself, i.e. the
  #           markup is expected to round-trip unchanged.
  # c::       parser class to instantiate (BeautifulStoneSoup by default).
  def assert_soup_equals(toParse, rep=nil, c=BeautifulStoneSoup)
    rep = toParse if rep.nil?
    # test/unit's signature is assert_equal(expected, actual); passing the
    # expected rendering first keeps failure messages labelled correctly.
    assert_equal(rep, c.new(toParse).to_s(false))
  end

  #Null test to shut the compiler up.
  # NOTE(review): presumably here so this helper-only base class still
  # contains at least one test method -- confirm before removing.
  def test_null
  end

end
27
+
28
+ #Tests the various ways of fetching tags from a soup.
29
class ToteThatTag < SoupTest

  # Builds the fixture shared by every test: two <a> tags, two <b> tags
  # and one namespaced <abc:d> tag. Two of the tags share the id "x".
  def setup
    ml = %{
<a id="x">1</a>
<a id="a">2</a>
<b id="b">3</b>
<b id="x">4</b>
<abc:d width="100">5</abc:d>}
    @soup = BeautifulStoneSoup.new(ml)
  end

  # Lookup by tag name, including a namespaced name, plus find_next.
  def test_fetch_by_name
    matching = @soup.find_all('a')
    assert_equal(matching.length, 2)
    assert_equal(matching[0].name, 'a')
    assert_equal(matching[0], @soup.find('a'))
    assert_equal(@soup.find('abc:d').contents.length, 1)

    firstB = @soup.find('b')
    nextB = firstB.find_next('b')
    assert_equal(nextB.contents[0], '4')
    assert_equal(nextB['id'], 'x')
  end

  # A block passed to find_all / find_text must select the same elements
  # as the equivalent name or text argument.
  def test_fetch_by_block
    a = @soup.find_all('a')
    b = @soup.find_all do |x|
      x.is_a?(Tag) && x.name == 'a'
    end
    assert_equal(a, b)

    a = @soup.find_text('3')
    b = @soup.find_text do |x|
      x.is_a?(NavigableString) && x == '3'
    end
    assert_equal(a, b)

    # Elements whose tag name equals their own id: <a id="a"> and <b id="b">.
    matching = @soup.find_all do |x|
      x.respond_to?('name') && x.name == x['id']
    end
    assert_equal(matching.length, 2)
    assert_equal(matching[0].name, 'a')
  end

  # Lookup by attribute value through the :attrs option.
  def test_fetch_by_attribute
    matching = @soup.find_all(nil, :attrs=>{'id' => 'x'})
    assert_equal(matching.length, 2)
    assert_equal(matching[0].name, 'a')
    assert_equal(matching[1].name, 'b')

    # The original repeated this assertion verbatim; once is enough.
    # NOTE(review): presumably a nil value selects tags lacking the
    # attribute (only <abc:d> has no id) -- confirm against the matcher.
    assert_equal(@soup.find_all(nil, :attrs=>{'id' => nil}).length, 1)

    assert_equal(@soup.find_all(nil, :attrs=>{'width' => 100}).length, 1)
  end

  # soup.b and soup.b_tag are both shorthand for soup.find('b').
  def test_tag_name_as_method
    firstB = @soup.find('b')
    assert_equal(firstB, @soup.b)
    assert_equal(firstB, @soup.b_tag)
  end

  # An array of names selects tags matching any of them.
  def test_fetch_by_list
    matching = @soup.find_all(['a', 'abc:d'])
    assert_equal(matching.length, 3)
  end

  # A hash of names selects tags whose name is one of its keys.
  def test_fetch_by_hash
    matching = @soup.find_all({'a' => true, 'b' => true})
    assert_equal(matching.length, 4)
  end

  # A regular expression is matched against tag names; the expected three
  # hits are the two <a> tags and <abc:d>.
  def test_fetch_by_re
    r = /a.*/
    assert_equal(@soup.find_all(r).length, 3)
  end

  # A Proc passed in place of the name acts as the matcher.
  def test_fetch_by_method
    # No `return` inside the block: in a Proc.new block `return` exits the
    # enclosing method, so this test used to end on the first invocation
    # of the matcher without running any of the assertions below.
    matcher = Proc.new { |x| x.name == x['id'] }
    matching = @soup.find_all(matcher)
    assert_equal(matching.length, 2)
    assert_equal(matching[0].name, 'a')
  end

end
117
+
118
+ #Testing the integrity of the parse tree.
119
class FollowThatTag < SoupTest

  # Four unclosed, nested <b> tags. Parsed at class-load time; no test in
  # this class reads it.
  @@PROXIMITY_TEST = BeautifulStoneSoup.new('<b id="1"><b id="2"><b id="3"><b id="4">')

  # Four top-level blockquotes (ids 1-4); the first three each wrap one
  # nested blockquote (ids 1.1, 2.1, 3.1).
  @@SIBLING_TEST = BeautifulStoneSoup.new('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')

  # find_parents / find_parent walk upwards from a deeply nested tag.
  def test_parents
    markup = '<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah</b></ul></ul></ul>'
    bold = BeautifulSoup.new(markup).find('b')

    foo_lists = bold.find_parents('ul', :attrs => {'id' => 'foo'})
    assert_equal(foo_lists.length, 2)

    closest_list = bold.find_parent('ul')
    assert_equal(closest_list['a'], 'b')
  end

  # Forward navigation starting from blockquote id=2.
  def test_next_sibling
    tag = 'blockquote'
    start = @@SIBLING_TEST.find(tag, :attrs => {'id' => 2})

    # find_next follows parse order, so it lands on the nested child.
    assert_equal(start.find_next(tag)['id'], '2.1')

    # The sibling lookup is asserted twice, exactly as the original did.
    2.times do
      assert_equal(start.find_next_sibling(tag)['id'], '3')
    end

    later = start.find_next_siblings(tag)
    assert_equal(later.length, 2)
    only_fourth = start.find_next_siblings(tag, :attrs => {'id' => 4})
    assert_equal(only_fourth.length, 1)
  end

  # Backward navigation starting from blockquote id=3.
  def test_previous_sibling
    tag = 'blockquote'
    start = @@SIBLING_TEST.find(tag, :attrs => {'id' => 3})

    # find_previous follows parse order, so it lands on the nested 2.1.
    assert_equal(start.find_previous(tag)['id'], '2.1')

    # The sibling lookup is asserted twice, exactly as the original did.
    2.times do
      assert_equal(start.find_previous_sibling(tag)['id'], '2')
    end

    earlier = start.find_previous_siblings(tag)
    assert_equal(earlier.length, 2)
    only_first = start.find_previous_siblings(tag, :attrs => {'id' => 1})
    assert_equal(only_first.length, 1)
  end

  # Navigation that starts from a text node rather than a tag.
  def test_text_navigation
    markup = 'Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh'
    baz = BeautifulSoup.new(markup).find_text('Baz')

    enclosing_i = baz.find_parent("i")
    assert_equal(enclosing_i['id'], '1')

    assert_equal(baz.find_next(nil, :text => 'Blee'), 'Blee')
    assert_equal(baz.find_next_sibling(nil, :text => 'Blee'), 'Blee')
    # 'Blargh' sits outside the enclosing <b>, so it is not a sibling.
    assert_equal(baz.find_next_sibling(nil, :text => 'Blargh'), nil)

    rule = baz.find_next_sibling('hr')
    assert_equal(rule['id'], '1')
  end

end
165
+
166
+ #Tests the next_sibling and previous_sibling navigation.
167
class SiblingRivalry < SoupTest

  # Walks sibling links across both tags and text nodes of a small list.
  def test_siblings
    soup = BeautifulSoup.new("<ul><li>1<p>A</p>B</li><li>2</li><li>3</li></ul>")

    # The sibling after the first <li> is the second <li>.
    second_li = soup.find('li').next_sibling
    assert_equal(second_li.name, 'li')
    assert_equal(second_li.string, '2')

    # A text node's next sibling can be a tag...
    after_one = soup.find_text('1').next_sibling
    assert_equal(after_one.name, 'p')

    # ...and a tag's next sibling can be a text node.
    after_p = soup.find('p').next_sibling
    assert_equal(after_p, 'B')

    # Stepping forward, back, and forward again ends on the same text node.
    round_trip = soup.find('p').next_sibling.previous_sibling.next_sibling
    assert_equal(round_trip, 'B')
  end
end
180
+
181
+ #Tests the various built-in functions of Tag objects.
182
class TagsAreObjectsToo < SoupTest

  # Shared fixture: <top> holds three children -- text, a <b> tag, text.
  @@SOUP = BeautifulSoup.new('<top id="1">1<b>2</b>3</top>')

  # Tag#length reports the number of direct children.
  def test_length
    top = @@SOUP.top
    assert_equal(top.length, 3)
  end

  # Tag#[] reads an attribute value.
  def test_hash_lookup
    top = @@SOUP.top
    assert_equal(top['id'], "1")
  end

  # Tag#each yields the direct children in document order.
  def test_iterator
    seen = []
    @@SOUP.top.each { |child| seen << child }
    assert_equal(seen.length, 3)
    assert_equal(seen[2], "3")
  end

end
204
+
205
+ #Tests the use of 'string' as an alias for a tag's only content.
206
class StringEmUp < SoupTest

  # A tag whose only child is text exposes that text through #string.
  def test_string
    bold = BeautifulSoup.new('<b>foo</b>').b
    assert_equal(bold.string, 'foo')
  end

  # A tag with several children has no #string.
  def test_lack_of_string
    bold = BeautifulSoup.new("<b>f<i>e</i>o</b>").b
    assert_equal(bold.string, nil)
  end
end
218
+
219
+ #Tests the limit argument.
220
class ThatsMyLimit < SoupTest

  # :limit caps the number of results returned by find_all.
  def test_basic_limits
    markup = '<br id="1" /><br id="1" /><br id="1" /><br id="1" />'
    soup = BeautifulSoup.new(markup)

    unlimited = soup.find_all('br')
    assert_equal(unlimited.length, 4)

    capped = soup.find_all('br', :limit => 2)
    assert_equal(capped.length, 2)
  end
end
228
+
229
+ #Testing the modification of the tree.
230
class WriteOnlyCode < SoupTest

  # Replacing a child in Tag#contents shows up in the rendered output.
  def test_replace_contents
    soup = BeautifulSoup.new('<a>foo</a>')
    replacement = NavigableString.new('bar')
    soup.a.contents[0] = replacement
    assert_equal(soup.render_contents, '<a>bar</a>')
  end

  # Setting, clearing and deleting attributes is reflected when rendering.
  def test_modify_attributes
    soup = BeautifulSoup.new('<a id="1"></a>')
    anchor = soup.find('a')

    # Overwrite an existing attribute, then clear it by assigning nil.
    anchor['id'] = 2
    assert_equal(soup.render_contents, '<a id="2"></a>')
    anchor['id'] = nil
    assert_equal(soup.render_contents, '<a></a>')

    # Add a brand-new attribute, then remove it with #delete.
    anchor['id2'] = 'foo'
    assert_equal(soup.render_contents, '<a id2="foo"></a>')
    anchor.delete('id2')
    assert_equal(soup.render_contents, '<a></a>')
  end

  #Makes sure tags don't step on each others' toes.
  def test_new_tag_
    soup = BeautifulSoup.new('')
    anchor = Tag.new(soup, 'a')
    list = Tag.new(soup, 'ol')

    # An attribute set on one new tag must not appear on another.
    anchor["href"] = "http://foo.com/"
    assert_equal(list["href"], nil)
  end
end
262
+
263
+ #Our operators do it all! Call now!
264
class OperatorOverload < SoupTest

  # Referencing a tag name as a method delegates to find, both in the
  # bare form (soup.b) and in the suffixed form (soup.b_tag).
  def test_tag_name_as_find
    soup = BeautifulSoup.new('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
    via_find = soup.find('b').find('i')

    # Bare .foo form.
    assert_equal(soup.b.i, via_find)
    assert_equal(soup.b.i.string, 'bar')
    assert_equal(soup.b['id'], '1')
    assert_equal(soup.b.contents[0], 'foo')
    # A tag name that is absent from the document yields nil.
    assert(soup.a == nil)

    # The .foo_tag variant of .foo, alone and mixed with the bare form.
    assert_equal(soup.b_tag.i_tag.string, 'bar')
    assert_equal(soup.b.i_tag.string, 'bar')
    assert_equal(soup.find('b').find('i'), soup.b_tag.i_tag)
  end
end
281
+
282
+ #Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!
283
class NestableEgg < SoupTest

  # A <p> inside a <blockquote> stays nested there, while the trailing
  # unclosed <p> ends up at the top level of the document.
  def test_para_inside_blockquote
    soup = BeautifulSoup.new('<blockquote><p><b>Foo</b></p></blockquote><p>Bar')
    quote = soup.blockquote

    assert_equal(quote.p.b.string, 'Foo')
    assert_equal(quote.b.string, 'Foo')

    top_level_p = soup.find('p', :recursive => false)
    assert_equal(top_level_p.string, 'Bar')
  end

  # A table nested inside a cell of another table remains a descendant
  # of that outer table.
  def test_nested_tables
    text = %{<table id="1"><tr><td>Here's another table:
<table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>}
    soup = BeautifulSoup.new(text)
    outer = soup.table

    assert_equal(outer.table.td.string, 'Juicy text')
    assert_equal(soup.find_all('table').length, 2)
    assert_equal(outer.find_all('table').length, 1)

    # Three levels up from the inner table (td, tr, table) is the outer one.
    inner = soup.find('table', :attrs => {'id' => 2})
    assert_equal(inner.parent.parent.parent.name, 'table')
  end

  # A table placed directly inside a <tr>, with no <td>, is still nested.
  def test_bad_nested_tables
    soup = BeautifulSoup.new("<table><tr><table><tr id='nested'></tr></table></tr></table>")
    inner_row = soup.table.tr.table.tr
    assert_equal(inner_row['id'], 'nested')
  end
end
308
+
309
+
310
+ #Here we test cleanup of text that breaks an unaltered parser or is just
311
+ #obnoxious.
312
class CleanupOnAisleFour < SoupTest

  # Self-closing tags are rendered in the "<br />" form.
  def test_self_closing_tag
    br = BeautifulStoneSoup.new("Foo<br/>Bar").find('br')
    assert_equal(br.to_s, '<br />')

    assert_soup_equals('<p>test1<br/>test2</p>', '<p>test1<br />test2</p>')
  end

  # Mismatched closing tag: the test only requires that parsing does not
  # raise.
  def test_bad_closing_tags
    BeautifulStoneSoup.new("<a>Foo<b>Bar</a>")
  end

  # Closing tag before any opening tag: again, parsing must not raise.
  def test_premature_closing_tag
    BeautifulStoneSoup.new("</b><a>Foo<b>Bar</a>")
  end

  # A malformed DOCTYPE must round-trip unchanged.
  def test_bad_doctype
    assert_soup_equals("<!DOCTYPE foo='bar'>")
  end

  # Whitespace right after "<!" is dropped from the rendering.
  def test_whitespace_in_declaration
    assert_soup_equals('<! DOCTYPE>', '<!DOCTYPE>')
  end

  # Junk inside a declaration is kept, minus the leading whitespace.
  def test_JunkInDeclaration
    assert_soup_equals('<! Foo = -8>a', '<!Foo = -8>a')
  end

  # An unterminated declaration is kept as text and the open <p> is closed.
  def test_incomplete_declaration
    assert_soup_equals('a<!b <p>c', 'a<!b <p>c</p>')
  end

  # A syntactically valid but meaningless declaration is kept as well.
  def test_valid_but_bogus_declaration
    assert_soup_equals('<! Foo >a', '<!Foo >a')
  end

  #This fails for a totally bogus reason! I can't figure it out.
  #def test_smart_quotes_not_so_smart_anymore_FAILS
  #  assert_soup_equals("\x91Foo\x92", '&lsquo;Foo&rsquo;')
  #end

  #def test_incomplete_declaration_at_endFAILS
  #  assert_soup_equals('a<!b')
  #end

end
359
+
360
+ #Verifies that the parser treats multiple feed calls the same as one
361
+ #big feed call only if constructed with
362
+ #:initial_text_is_everything => false.
363
class KeepOnParsing < SoupTest

  # Feeding a document in two pieces equals parsing it in one go only
  # when the parser was built with :initial_text_is_everything => false.
  def test_multiple_parse_calls
    head = '<foo>bah<bar>'
    tail = 'blee</bar></foo>'

    # Reference: the whole document parsed at once.
    whole = BeautifulSoup.new(head + tail)

    # Default construction, then a second feed.
    default_mode = BeautifulSoup.new(head)
    default_mode.feed(tail)

    # Incremental construction, then a second feed.
    incremental = BeautifulSoup.new(head, :initial_text_is_everything => false)
    incremental.feed(tail)

    assert_not_equal(whole, default_mode)
    assert_equal(whole, incremental)
  end
end
378
+
379
+ #Verifies that BeautifulSOAP parser works.
380
class SOAPMeUp < SoupTest

  # A child tag holding only a string is mirrored onto its parent as an
  # attribute named after the child.
  def test_basic_soap
    markup = "<foo><bar>baz</bar></foo>"
    rendered = BeautifulSOAP.new(markup).to_s
    assert_equal(rendered, %{<foo bar="baz"><bar>baz</bar></foo>})
  end

  # An attribute already present on the parent is left untouched.
  def test_dont_overwrite_existing_attr
    markup = %{<foo bar="don't kill me!"><bar>baz</bar></foo>}
    rendered = BeautifulSOAP.new(markup).to_s
    assert_equal(rendered, markup)
  end
end
393
+
394
+ #The Unicode test suite has not yet been ported because I haven't
395
+ #figured out how Ruby does Unicode.
396
+
397
+ # class UnicodeRed < SoupTest
398
+ # "Makes sure Unicode works."
399
+
400
+ # def setUp
401
+ # text = 'foo<b>bar</b>'
402
+ # @soup = BeautifulStoneSoup
403
+ # @soup.feed(text)
404
+
405
+ # def test_BasicUnicode
406
+ # import types
407
+ # sType = types.StringType
408
+ # uType = types.UnicodeType
409
+
410
+ # u = u'\3100'
411
+ # #It starts out ASCII...
412
+ # assert_equal(type(@soup.renderContents), sType)
413
+ # assert_equal(type(@soup.prettify), sType)
414
+ # #But you can have unicode if you want.
415
+ # assert_equal(type(unicode(@soup)), uType)
416
+
417
+ # #Add a Unicode character and it's Unicode.
418
+ # @soup.feed(u)
419
+ # assert_equal(type(@soup.renderContents), uType)
420
+ # assert_equal(type(@soup.prettify), uType)
421
+ # #But you can have ASCII if you want.
422
+ # assert_equal(type(str(@soup)), sType)
423
+
424
+ # #The part without any Unicode is still ASCII.
425
+ # assert_equal(type(@soup.b.prettify), sType)
426
+
427
+ # #But if you add a Unicode character it'll become Unicode.
428
+ # @soup.b['foo'] = u'\3100'
429
+ # assert_equal(type(@soup.b.prettify), uType)
430
+
431
+
metadata ADDED
@@ -0,0 +1,52 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.4
3
+ specification_version: 1
4
+ name: rubyful_soup
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.1
7
+ date: 2005-10-21
8
+ summary: An HTML/XML parser that handles bad markup and provides tree traversal methods.
9
+ require_paths:
10
+ - lib
11
+ email: leonardr@segfault.org
12
+ homepage: http://www.crummy.com/software/RubyfulSoup/
13
+ rubyforge_project:
14
+ description: "Rubyful Soup is a *ML parser that makes screen-scraping easy. It won't choke on
15
+ bad markup, and it's easy to locate the part of a document you want."
16
+ autorequire:
17
+ default_executable:
18
+ bindir: bin
19
+ has_rdoc: true
20
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
21
+ requirements:
22
+ -
23
+ - ">"
24
+ - !ruby/object:Gem::Version
25
+ version: 0.0.0
26
+ version:
27
+ platform: ruby
28
+ authors:
29
+ - Leonard Richardson
30
+ files:
31
+ - lib/rubyful_soup.rb
32
+ - tests/rubyful_soup_tests.rb
33
+ - CHANGELOG
34
+ test_files:
35
+ - tests/rubyful_soup_tests.rb
36
+ rdoc_options: []
37
+ extra_rdoc_files:
38
+ - CHANGELOG
39
+ executables: []
40
+ extensions: []
41
+ requirements: []
42
+ dependencies:
43
+ - !ruby/object:Gem::Dependency
44
+ name: htmltools
45
+ version_requirement:
46
+ version_requirements: !ruby/object:Gem::Version::Requirement
47
+ requirements:
48
+ -
49
+ - ">"
50
+ - !ruby/object:Gem::Version
51
+ version: 0.0.0
52
+ version: