RubyGems - rubyful_soup - Versions diffs - 1.0.1 - Mend

rubyful_soup 1.0.1

Files changed (4) hide show

data/CHANGELOG +12 -0
data/lib/rubyful_soup.rb +925 -0
data/tests/rubyful_soup_tests.rb +431 -0
metadata +52 -0

data/CHANGELOG ADDED Viewed

@@ -0,0 +1,12 @@
+Rubyful Soup Changelog
+1.0.1
+Changes from James Edward Gray (james at grayproductions dot net) to
+quiet warnings.
+Packaged as a gem for the first time.
+1.0.0
+First full release

data/lib/rubyful_soup.rb ADDED Viewed

@@ -0,0 +1,925 @@
+#Rubyful Soup
+#Elixir and Tonic
+#"The Screen-Scraper's Friend"
+#v1.0.1
+#http://www.crummy.com/software/RubyfulSoup/
+#
+#Rubyful Soup is a port to the Ruby language and idiom of the Python
+#library Beautiful Soup.
+#See http://www.crummy.com/software/BeautifulSoup/ for details on the original.
+#This library requires the sgml-parser library, written by Takahiro
+#Maebashi. The easiest way to get it is to install the "htmltools"
+#gem.
+require 'rubygems'
+require 'sgml-parser'
+#UTF-8 voodoo--does this really work?
+$KCODE = 'u'
+require 'jcode'
+#This code makes SGMLParser able to parse XML with namespaces.
+class SGMLParser
+  if const_defined? :Tagfind
+    remove_const(:Tagfind)
+    Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/
+  end
+end
+module PageElement
+  attr_reader :parser
+  attr_accessor :parent, :previous_parsed, :next_parsed, :previous_sibling
+  attr_accessor :next_sibling
+  def setup(parent=nil, previous_parsed=nil)
+    @parent = parent
+    @previous_parsed = previous_parsed
+    @next_parsed = nil
+    @previous_sibling = nil
+    @next_sibling = nil
+    if @parent and not @parent.contents.empty?
+      @previous_sibling = @parent.contents[-1]
+      @previous_sibling.next_sibling = self
+    end
+  end
+  #A bunch of different iterators over a parsed document.
+  {
+    #Iterates in parse order over the rest of the items in this document.
+    :next_parsed_items => :next_parsed,
+    #Iterates in reverse parse order over all previously parsed items in
+    #this document.
+    :previous_parsed_items => :previous_parsed,
+    #Iterates in parse order over all subsequent siblings of this item.
+    :next_siblings => :next_sibling,
+    #Iterates in reverse parse order over all prior siblings of this item.
+    :previous_siblings => :previous_sibling,
+    #Iterates upwards through the parentage of this item.
+    :parents => :parent
+  }.each do |k,v|
+    class_eval %{
+    def #{k}
+      i = self
+      while i
+        i = i.#{v}
+          yield i if i
+      end
+    end
+  }
+end
+[ #Returns first item/all items matching the given criteria and
+  #appearing after this PageElement in the document.
+  [:find_next, :find_all_next, 'next_parsed_items'],
+  #Returns first item/all items matching the given criteria and
+  #appearing before this PageElement in the document.
+  [:find_previous, :find_all_previous, 'previous_parsed_items'],
+  #Returns the nearest sibling/all siblings of this PageElement matching
+  #the given criteria and appearing before this PageElement in
+  #the document.
+  [:find_previous_sibling, :find_previous_siblings, 'previous_siblings'],
+  #Returns the nearest sibling/all siblings of this PageElement matching
+  #the given criteria and appearing after this PageElement in
+  #the document
+  [:find_next_sibling, :find_next_siblings, 'next_siblings'],
+  #Returns the nearest parent/all parents of this PageElement matching
+  #the given criteria.
+  [:find_parent, :find_parents, 'parents'],
+].each do |singular, plural, method_name|
+  class_eval %{
+    def #{singular}(name=nil, args={}, &block)
+      args['limit'] = 1
+      fetch(method('#{method_name}'), name, args, block)[0]
+    end
+    def #{plural}(name=nil, args={}, &block)
+      fetch(method('#{method_name}'), name, args, block)
+    end
+  }
+end
+  protected
+  #Returns a list of items matching the given criteria, obtained by
+  #iterating over the given iterator.
+  def fetch(iterator, name, args, block)
+    attrs = args[:attrs]
+    limit = args[:limit]
+    text = args[:text]
+    attrs ||= {}
+    if attrs != nil and not attrs.respond_to? :keys
+      attrs = {'class' => attrs}
+    end
+    bucket = []
+    catch(:stop_iteration) do
+      iterator.call do |item|
+        match = false
+        if block
+          match = true if block.call(item)
+        elsif item.is_a? Tag
+          #A tag matches if its name matches and its attributes line up.
+          if not text and (not name or PageElement.matches(item, name))
+            match = true
+            attrs.each_pair do |attr, matchAgainst|
+              check = item[attr]
+              unless PageElement.matches(check, matchAgainst)
+                match = false
+                break
+              end
+            end
+          end
+        elsif text
+          #A text matches if its string value matches the given text
+          #criterion.
+          match = PageElement.matches(item, text)
+        end
+        if match
+          bucket.push(item)
+          if limit and bucket.length >= limit
+            throw :stop_iteration
+          end
+        end
+      end
+    end
+    return bucket
+  end
+  #Used to tell whether a Tag or a NavigableString "matches" some data
+  #structure.
+  def PageElement.matches(chunk, how_to_match)
+    #puts "Seeing if #{chunk.class} #{chunk} matches #{how_to_match.class} #{how_to_match}."
+    #
+    # If given a list of items, return true if the list contains a
+    # text element that matches.
+    if chunk.is_a? Array
+      chunk.each do |tag|
+        return true if tag.is_a? NavigableString and matches(tag, how_to_match)
+      end
+      return false
+    elsif how_to_match.is_a? Proc
+      return how_to_match.call(chunk)
+    elsif chunk.is_a? Tag
+      #Custom match methods take the tag as an argument, but all other
+      #ways of matching match the tag name as a string
+      chunk = chunk.name
+    end
+    #At this point we know that chunk is a string
+    unless chunk.is_a? String
+      chunk = chunk.to_s
+    end
+    if how_to_match.is_a? Regexp
+      return how_to_match.match(chunk) != nil
+    elsif how_to_match.is_a? Array
+      return how_to_match.find {|x| x == chunk} != nil
+    elsif how_to_match.is_a? Hash
+      return how_to_match[chunk] != nil
+    else
+      #It's just a string
+      return how_to_match.to_s == chunk
+    end
+  end
+end
+module TagModule
+  include Enumerable
+  include PageElement
+  attr_accessor :name, :contents, :attrs, :string
+  #I tried to have Tag subclass Method, but it killed the
+  #whole thing. Maybe I should just leave well enough alone.
+  #
+  #def arity
+  #  return methods('find_all').arity
+  #end
+  #
+  #def call(*args)
+  #  return find_all(*args)
+  #end
+  #
+  #def to_proc
+  #  return methods('find_all').to_proc
+  #end
+  def initialize(parser, name, attrs=nil, parent=nil, previous=nil)
+    @hidden = false
+    @parser = parser
+    @name = name
+    attrs ||= {}
+    @attrs = attrs
+    @contents = []
+    setup(parent, previous)
+  end
+  #soup.title_tag or soup.title is the same as soup.find('title')
+  def method_missing(name, *args)
+    #puts "Missing method #{name}"
+    name = name.to_s
+    if name[-4...name.length] == '_tag'
+        name = name[0...name.length-4]
+    end
+    return find(name, *args)
+  end
+  #TODO: is there a mixin for Hash?
+  def [](k)
+    return @attrs[k]
+  end
+  def []=(k, v)
+    @attrs[k] = v
+  end
+  def delete(k)
+    @attrs.delete(k)
+  end
+  def has_key?(k)
+    return @attrs.has_key(k)
+  end
+  #End things that would go away if there was a mixin for Hash.
+  def each
+    @contents.each { |x| yield x }
+  end
+  def length
+    return contents.length
+  end
+  alias size length
+  def self_closing?
+    return @parser.self_closing_tag?(@name)
+  end
+  #Adds the given tag to the contents of this tag
+  def append(tag)
+    @contents.push(tag)
+  end
+  def to_str
+    return to_s
+  end
+  #Renders this tag and its contents as a pretty-printed string.
+  def prettify
+    return to_s(true)
+  end
+  def inspect
+    to_s
+  end
+  #Renders this tag and its contents as a string.  NOTE: since REXML
+  #consumes whitespace, this method is not certain to reproduce the
+  #whitespace present in the original string.
+  def to_s(show_structure_indent=nil)
+    attrs = []
+    @attrs.each { |k,v| attrs.push("#{k}=\"#{v}\"") if v }
+    if self_closing?
+      close = ' /'
+      closeTag = nil
+    else
+      close = nil
+      closeTag = "</#{name}>"
+    end
+    indent_increment = show_structure_indent==true ? 0 : show_structure_indent
+    if show_structure_indent
+      indent_increment += 1 unless @hidden
+    end
+    contents = render_contents(indent_increment)
+    space = "\n #{' ' * indent_increment}" if show_structure_indent
+    if @hidden
+      s = contents
+    else
+      s = []
+      attribute_string = ''
+      unless attrs.empty?
+        attribute_string = ' ' + attrs.join(' ')
+      end
+      s.push(space) if show_structure_indent
+      s.push("<#{@name}#{attribute_string}#{close}>")
+      s.push(contents)
+      s.push(space) if closeTag and show_structure_indent
+      s.push(closeTag)
+      s = s.join('')
+    end
+    return s
+  end
+  #Renders the contents of this tag as a string.
+  def render_contents(show_structure_indent=nil)
+    s=[]
+    @contents.each do |c|
+      text = nil
+      if c.is_a? Tag
+        text = c.to_s(show_structure_indent)
+      else
+        text = c.to_s
+      end
+      if text
+        if show_structure_indent
+          text.chomp!
+        end
+        s.push(text)
+      end
+    end
+    return s.join('')
+  end
+  def recursive_children
+    stack = [[self, 0]]
+    catch(:stop_iteration) do
+      until stack.empty?
+        tag, start = stack.pop
+        for i in start...tag.contents.length
+          a = tag.contents[i]
+          yield a
+          if a.is_a? TagModule and not tag.contents.empty? and i < tag.contents.length
+            stack.push([tag, i+1])
+            stack.push([a, 0])
+            break
+          end
+        end if tag.is_a? TagModule
+      end
+    end
+  end
+  #Iterates over the direct children of this Tag.
+  def children
+    catch(:stop_iteration) { @contents.each { |x| yield x } }
+  end
+  #Convenience method to retrieve the first piece of text matching the
+  #given criteria. 'text' can be a string, a regular expression object,
+  #a Proc that takes a string and returns whether or not the
+  #string 'matches', etc.
+  def find_text(text=nil, &block)
+    args = { :text => text, :limit => 1}
+    iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
+    fetch(iterator, nil, args, block)[0]
+  end
+  #Convenience method to retrieve all pieces of text matching the
+  #given criteria. 'text' can be a string, a regular expression object,
+  #a callable that takes a string and returns whether or not the
+  #string 'matches', etc.
+  #Args: :limit
+  def find_all_text(text=nil, args={}, &block)
+    args['text'] = text
+    iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
+    fetch(iterator, nil, args, block)
+  end
+  #Extracts a list of Tag objects that match the given criteria.  You
+  #can specify the name of the Tag and any attributes you want the Tag
+  #to have.
+  #
+  #The value of a key-value pair in the 'attrs' map can be a string, a
+  #list of strings, a regular expression object, or a Proc object that
+  #takes a string and returns whether or not the string matches for
+  #some custom definition of 'matches'. The same is true of the tag
+  #name, except that a Proc object will be passed the Tag object instead
+  #of just a string.
+  #Args: :attrs :text :limit :recursive
+  def find_all(name=nil, args={}, &block)
+    iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
+    fetch(iterator, name, args, block)
+  end
+  #Returns the first Tag or NavigableString object that matches the
+  #given criteria. Takes much the same arguments as fetch.
+  #args: :attrs :text :limit :recursive
+  def find(name=nil, args={}, &block)
+    args[:limit] = 1
+    iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
+    fetch(iterator, name, args, block)[0]
+  end
+end
+class Tag
+  include TagModule
+end
+class NavigableString < String
+  include PageElement
+end
+#This class contains the basic parser and fetch code. It defines
+#a parser that knows nothing about tag behavior except for the
+#following:
+#
+#You can't close a tag without closing all the tags it encloses.
+#That is, "<foo><bar></foo>" actually means
+#"<foo><bar></bar></foo>".
+#
+#[Another possible explanation is "<foo><bar /></foo>", but since
+# this class defines no self_closing_tags, it will never use that
+# explanation.]
+#
+#This class is useful for parsing XML or made-up markup languages,
+#or when BeautifulSoup makes an assumption counter to what you were
+#expecting.
+class BeautifulStoneSoup < SGMLParser
+  include TagModule
+  #As a public service we will by default silently replace MS smart quotes
+  #and similar characters with their HTML or ASCII equivalents.
+  #Maps each byte in the range 0x80-0x9f to its replacement text.
+  #Every key is double-quoted so that it is the single raw byte it names;
+  #a single-quoted '\x80' would be four literal characters and could
+  #never equal the one-byte match looked up by @@parser_massage.
+  @@ms_chars = { "\x80" => '&euro;',
+    "\x81" => ' ',
+    "\x82" => '&sbquo;',
+    "\x83" => '&fnof;',
+    "\x84" => '&bdquo;',
+    "\x85" => '&hellip;',
+    "\x86" => '&dagger;',
+    "\x87" => '&Dagger;',
+    "\x88" => '&caret;',
+    "\x89" => '%',
+    "\x8A" => '&Scaron;',
+    "\x8B" => '&lt;',
+    "\x8C" => '&OElig;',
+    "\x8D" => '?',
+    "\x8E" => 'Z',
+    "\x8F" => '?',
+    "\x90" => '?',
+    "\x91" => '&lsquo;',
+    "\x92" => '&rsquo;',
+    "\x93" => '&ldquo;',
+    "\x94" => '&rdquo;',
+    "\x95" => '&bull;',
+    "\x96" => '&ndash;',
+    "\x97" => '&mdash;',
+    "\x98" => '&tilde;',
+    "\x99" => '&trade;',
+    "\x9a" => '&scaron;',
+    "\x9b" => '&gt;',
+    "\x9c" => '&oelig;',
+    "\x9d" => '?',
+    "\x9e" => 'z',
+    "\x9f" => '&Yuml;'}
+  @@parser_massage = [[/<([^<>]*)\/>/, '<\1></\1>'],
+    [/<!\s+([^<>]*)>/, '<!\1>'],
+    [/([\x80-\x9f])/m, proc { |m| @@ms_chars[m]}]
+  ]
+  @@rootTagName = '[document]'
+  @@nestable_tags = {}
+  @@reset_nesting_tags = {}
+  @@quoteTags = {}
+  @@self_closing_tags = {}
+  attr_accessor :hidden
+  def self_closing_tag?(tag)
+    @@self_closing_tags.has_key?(tag)
+  end
+  #Args: :initial_text_is_everything, :avoid_parser_problems
+  def initialize(text, args={})
+    super(self, @@rootTagName)
+    @quote_stack = []
+    @hidden = 1
+    reset
+    @avoid_parser_problems = args[:avoid_parser_problems] || true
+    if @avoid_parser_problems and not @avoid_parser_problems.is_a? Enumerable
+      @avoid_parser_problems = @@parser_massage
+    end
+    feed(text) if text != nil
+    done if args[:initial_text_is_everything] != false
+  end
+  #Runs the configured clean-up substitutions over text and then passes
+  #it on to the superclass implementation of feed.
+  #NOTE: the substitutions use gsub!, so the string object passed in by
+  #the caller is modified in place.
+  def feed(text)
+    if @avoid_parser_problems
+      #before = text.clone
+      #Each entry pairs a regexp with either a replacement string or a
+      #callable that is given the matched text and returns its replacement.
+      @avoid_parser_problems.each do |re, fix|
+        if fix.is_a? String
+          text.gsub!(re, fix)
+        else
+          text.gsub!(re) { |x| fix.call(x) }
+        end
+      end
+      #if before != text
+      #  puts "Changed from #{before} to #{text}"
+      #end
+    end
+    super
+  end
+  def ==(anObject)
+    return anObject.to_s == to_s
+  end
+  def done
+    end_text
+    pop_tag while @currentTag.name != @@rootTagName
+  end
+  def reset
+    super
+    @currentText = []
+    @currentTag = nil
+    @tag_stack = []
+    push_tag(self)
+  end
+  def push_tag(tag)
+    #puts "Push #{ tag.name }"
+    @currentTag.append(tag) if @currentTag
+    @tag_stack.push(tag)
+    @currentTag = @tag_stack[-1]
+  end
+  def pop_tag
+    tag = @tag_stack.pop
+    #puts "Pop #{ tag.name }"
+    # Tags with just one string-owning child get the child as a
+    # 'string' property, so that soup.tag.string is shorthand for
+    # soup.tag.contents[0]
+    if @currentTag.contents.length == 1 and @currentTag.contents[0].is_a? NavigableString
+      @currentTag.string = @currentTag.contents[0]
+    end
+    @currentTag = @tag_stack[-1] unless @tag_stack.empty?
+    @currentTag
+  end
+  # StreamListener implementation
+  def unknown_starttag(name, attrs)
+    #puts "Starting tag #{name} #{attrs.inspect}"
+    attrs = attrs.inject({}) do |m,v|
+      if v[1][0] == ?" and v[1][-1] == ?":
+        v[1] = v[1][1..-2]
+      end
+      m[v[0]] = v[1]
+      m
+    end
+    unless @quote_stack.empty?
+      #This is not a real tag.
+      #puts "<#{name}> is not real!"
+      #TODO: find idiomatic way to do this
+      attrString = []
+      attrs.each { |k,v| attrString.push('#{k}="#{v}"') }
+      self.handle_data('<#{name} #{attrString.join(' ')}>')
+      return
+    end
+    end_text
+    self_closing = @@self_closing_tags.has_key?(name)
+    smart_pop(name) unless self_closing
+    tag = Tag.new(self, name, attrs, @currentTag, @previous_parsed)
+    @previous_parsed.next_parsed = tag if @previous_parsed
+    @previous_parsed = tag
+    push_tag(tag)
+    pop_tag if self_closing
+    if @@quoteTags.has_key?(name)
+      #puts "Beginning quote (#{name})"
+      @quote_stack.push(name)
+    end
+  end
+  def unknown_endtag(name)
+    #Ignore tag_end calls for self-closing tags; they were
+    #closed in the tag_start call.
+    #TODO: still neccessary?
+    #puts "Ending tag #{name}"
+    return if @@self_closing_tags.has_key?(name)
+    if not @quote_stack.empty? and @quote_stack[-1] != name
+      #This is not a real end tag.
+      #puts "</#{name}> is not real!"
+      handle_data('</#{name}>')
+      return
+    end
+    end_text
+    pop_to_tag(name)
+    @quote_stack.pop if not @quote_stack.empty? and @quote_stack[-1] == name
+  end
+  def handle_data(data)
+    @currentText.push(data)
+  end
+  #Propagate comments right through.
+  def handle_comment(data)
+    handle_data("<!--#{comment}-->")
+  end
+  def handle_special(data)
+    handle_data("<#{data}>")
+  end
+  def unknown_charref(ref)
+    handle_data("&#{ref};")
+  end
+  def unknown_entityref(ref)
+    handle_data("%#{content}")
+  end
+  def attlistdecl(element_name, attributes, raw_content)
+    handle_data("<!ATTLIST #{raw_content}>")
+  end
+  def cdata(content)
+    handle_data("<![CDATA[#{content}]]")
+  end
+  ###
+  def doctype(*args)
+    content = args.join(' ')
+    ##{name} #{pub_sys}#{long_name}#{url}
+    #long_name = ' "#{long_name}"' if long_name
+    #url = ' "#{url}"' if url
+    handle_data("<!DOCTYPE #{content}>")
+  end
+  def elementdecl(content)
+    handle_data("<!ELEMENT #{content}>")
+  end
+  def entity(content)
+  end
+  def entitydecl(content)
+    handle_data("<!ENTITY #{content.join(' ')}>")
+  end
+  def instruction(name, instruction)
+    handle_data("<?#{name} #{instruction}>")
+  end
+  def notationdecl(content)
+    handle_data("<!NOTATION #{content}>")
+  end
+  def xmldecl(version, encoding, standalone)
+    encoding = ' encoding="#{encoding}"' if encoding
+    handle_data('<?xml version="#{version}"#{encoding}#{standalone}>')
+  end
+  #Called when we're done collecting some text, declarations, etc.
+  def end_text
+    currentText = @currentText.join('')
+    unless currentText.empty?
+      if currentText.strip.empty?
+        if currentText =~ /\n/
+          currentText = "\n"
+        else
+          currentText = ' '
+        end
+      end
+      #puts "Setting up text #{currentText}"
+      currentText = NavigableString.new(currentText)
+      currentText.setup(@currentTag, @previous_parsed)
+      @previous_parsed.next_parsed = currentText if @previous_parsed
+      @previous_parsed = currentText
+      @currentTag.contents.push(currentText)
+    end
+    @currentText = []
+  end
+  # Helper methods
+  private
+  #Pops the tag stack up to and including the most recent
+  #instance of the given tag. If inclusivePop is false, pops the tag
+  #stack up to but *not* including the most recent instance of
+  #the given tag.
+  def pop_to_tag(name, inclusive_pop=true)
+    return if name == @@rootTagName
+    #puts "Pop to tag #{ name }. Inclusive? #{inclusive_pop}"
+    num_pops = 0
+    mostRecentTag = nil
+    (0...@tag_stack.length).to_a.reverse.each do |i|
+      if name == @tag_stack[i].name
+        #puts "Found at #{i}, #{@tag_stack.length-i}"
+        num_pops = @tag_stack.length-i
+        break
+      end
+    end
+    num_pops -= 1 if not inclusive_pop
+    #puts "Popping #{num_pops} times."
+    num_pops.times { mostRecentTag = pop_tag }
+    mostRecentTag
+  end
+  #We need to pop up to the previous tag of this type, unless
+  #one of this tag's nesting reset triggers comes between this
+  #tag and the previous tag of this type, OR unless this tag is a
+  #generic nesting trigger and another generic nesting trigger
+  #comes between this tag and the previous tag of this type.
+  #
+  #Examples:
+  # <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
+  # <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
+  # <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
+  # <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
+  #
+  # <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
+  # <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
+  # <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
+  def smart_pop(name)
+    #puts "Smart pop for #{name}"
+    nesting_reset_triggers = @@nestable_tags[name]
+    is_nestable = nesting_reset_triggers != nil
+    is_reset_nesting = @@reset_nesting_tags.has_key?(name)
+    popTo = nil
+    inclusive = true
+    for p in @tag_stack.reverse
+        if (p == nil or p.name == name) and not is_nestable
+            #Non-nestable tags get popped to the top or to their
+            #last occurance.
+            #puts "Non-nestable tag #{name} gets popped to its last occurance."
+            popTo = name
+            break
+        end
+        if (nesting_reset_triggers != nil and nesting_reset_triggers.include?(p.name)) or (nesting_reset_triggers == nil and is_reset_nesting and @@reset_nesting_tags.has_key?(p.name))
+          #If we encounter one of the nesting reset triggers
+          #peculiar to this tag, or we encounter another tag
+          #that causes nesting to reset, pop up to but not
+          #including that tag.
+          #puts "Nesting reset trigger encountered for #{name}: #{p.name}"
+          popTo = p.name
+          inclusive = false
+          break
+        end
+        p = p.parent
+    end
+    pop_to_tag(popTo, inclusive) if popTo
+  end
+  protected
+  #Turns a list of maps, lists, or scalars into a single map.
+  #Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
+  #of lists and partial maps.
+  def BeautifulStoneSoup.build_tag_map(default, *args)
+    built = args.inject({}) do |m, portion|
+      if portion.is_a? Hash
+        #It's a map. Merge it.
+        portion.each_pair { |k,v| m[k] = v }
+      elsif portion.is_a? Array
+        #It's a list. Map each item to the default.
+        portion.each { |k| m[k] = default }
+      else
+        #It's a scalar. Map it to the default.
+        m[portion] = default
+      end
+      m
+    end
+  end
+end
+#This parser knows the following facts about HTML:
+#
+#* Some tags have no closing tag and should be interpreted as being
+#  closed as soon as they are encountered.
+#
+#* The text inside some tags (ie. 'script') may contain tags which
+#  are not really part of the document and which should be parsed
+#  as text, not tags. If you want to parse the text as tags, you can
+#  always fetch it and parse it explicitly.
+#
+#* Tag nesting rules:
+#
+#  Most tags can't be nested at all. For instance, the occurrence of
+#  a <p> tag should implicitly close the previous <p> tag.
+#
+#   <p>Para1<p>Para2
+#    should be transformed into:
+#   <p>Para1</p><p>Para2
+#
+#  Some tags can be nested arbitrarily. For instance, the occurrence
+#  of a <blockquote> tag should _not_ implicitly close the previous
+#  <blockquote> tag.
+#
+#   Alice said: <blockquote>Bob said: <blockquote>Blah
+#    should NOT be transformed into:
+#   Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
+#
+#  Some tags can be nested, but the nesting is reset by the
+#  interposition of other tags. For instance, a <tr> tag should
+#  implicitly close the previous <tr> tag within the same <table>,
+#  but not close a <tr> tag in another table.
+#
+#   <table><tr>Blah<tr>Blah
+#    should be transformed into:
+#   <table><tr>Blah</tr><tr>Blah
+#    but,
+#   <tr>Blah<table><tr>Blah
+#    should NOT be transformed into
+#   <tr>Blah<table></tr><tr>Blah
+#
+#Differing assumptions about tag nesting rules are a major source
+#of problems with the BeautifulSoup class. If BeautifulSoup is not
+#treating as nestable a tag your page author treats as nestable,
+#try writing a subclass.
+#HTML-aware parser: fills in the tag tables declared by
+#BeautifulStoneSoup with the self-closing, nestable and
+#nesting-reset tags of HTML.
+class BeautifulSoup < BeautifulStoneSoup
+  #Tags that are closed as soon as they are opened.
+  @@self_closing_tags.replace(build_tag_map(nil, ['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame']))
+  #NOTE(review): the parser in BeautifulStoneSoup consults @@quoteTags,
+  #not @@quote_tags, so this assignment creates a separate variable and
+  #'script' does not appear to be treated as a quote tag - confirm
+  #before changing, since the quote-tag code paths are otherwise unused.
+  @@quote_tags = {'script' => nil}
+  #According to the HTML standard, each of these inline tags can
+  #contain another tag of the same type. Furthermore, it's common
+  #to actually use these tags this way.
+  @@nestable_inline_tags = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center']
+  #According to the HTML standard, these block tags can contain
+  #another tag of the same type. Furthermore, it's common
+  #to actually use these tags this way.
+  @@nestable_block_tags = ['blockquote', 'div', 'fieldset', 'ins', 'del']
+  #Lists can contain other lists, but there are restrictions.
+  #Each value lists the tags that reset nesting for the key tag.
+  @@nestable_list_tags = { 'ol' => [],
+    'ul' => [],
+    'li' => ['ul', 'ol'],
+    'dl' => [],
+    'dd' => ['dl'],
+    'dt' => ['dl'] }
+  #Tables can contain other tables, but there are restrictions.
+  @@nestable_table_tags = {'table' => ['tr', 'td'],
+    'tr' => ['table'],
+    'td' => ['tr'],
+    'th' => ['tr'],
+  }
+  @@non_nestable_block_tags = ['address', 'form', 'p', 'pre']
+  #If one of these tags is encountered, all tags up to the next tag of
+  #this type are popped.
+  @@reset_nesting_tags.replace(build_tag_map(nil, @@nestable_block_tags, 'noscript', @@non_nestable_block_tags,
+   @@nestable_list_tags, @@nestable_table_tags))
+  #Tags that may nest; list and table tags keep their own reset
+  #triggers, the rest map to an empty trigger list.
+  @@nestable_tags.replace(build_tag_map([], @@nestable_inline_tags, @@nestable_block_tags, @@nestable_list_tags, @@nestable_table_tags))
+end
+# This class will push a tag with only a single string child into
+# the tag's parent as an attribute. The attribute's name is the tag
+# name, and the value is the string child. An example should give
+# the flavor of the change:
+#
+# <foo><bar>baz</bar></foo>
+# =>
+# <foo bar="baz"><bar>baz</bar></foo>
+#
+# You can then access fooTag['bar'] instead of fooTag.barTag.string.
+#
+# This is, of course, useful for scraping structures that tend to
+# use subelements instead of attributes, such as SOAP messages. Note
+# that it modifies its input, so don't print the modified version
+# out.
+class BeautifulSOAP < BeautifulStoneSoup
+  def pop_tag
+    if @tag_stack.size > 1
+      tag = @tag_stack[-1]
+      parent = @tag_stack[-2]
+      if (tag.is_a?(Tag) && tag.contents.size == 1 && \
+          tag.contents[0].is_a?(NavigableString) && !parent[tag.name])
+          parent[tag.name] = tag.contents[0]
+      end
+      super
+    end
+  end
+end
+#Enterprise class names! It has come to our attention that some people
+#think the names of the Rubyful Soup parser classes are too silly
+#and "unprofessional" for use in enterprise screen-scraping. We feel
+#your pain! For such-minded folk, the Rubyful Soup Consortium And
+#Rootin' Tootin' Texas Delicatessen recommends renaming this file to
+#"RobustParser.rb" (or, in cases of extreme enterprisitude,
+#"RobustParserBeanInterface.class") and using the following
+#enterprise-friendly class aliases:
+class RobustXMLParser < BeautifulStoneSoup; end
+class RobustHTMLParser < BeautifulSoup; end
+class SimplifyingSOAPParser < BeautifulSOAP; end
+print BeautifulSoup.new(ARGF.read).prettify if $0 == __FILE__

data/tests/rubyful_soup_tests.rb ADDED Viewed

@@ -0,0 +1,431 @@
+#Unit tests for Rubyful Soup.
+#
+#These tests make sure that Rubyful Soup works as it should. If you
+#find a bug in Rubyful Soup, the best way to express it is as a
+#failing test case like the ones below.
+require 'test/unit'
+require 'rubygems'
+require 'rubyful_soup'
+class SoupTest < Test::Unit::TestCase
+  #Parse the given text and make sure its string rep is the other
+  #given text.
+  def assert_soup_equals(toParse, rep=nil, c=BeautifulStoneSoup)
+    if rep == nil
+      rep = toParse
+    end
+    assert_equal(c.new(toParse).to_s(false), rep)
+  end
+  #Null test to shut the compiler up.
+  def test_null
+  end
+end
+#Tests the various ways of fetching tags from a soup.
+class ToteThatTag < SoupTest
+  def setup
+    ml = %{
+      <a id="x">1</a>
+        <a id="a">2</a>
+        <b id="b">3</b>
+        <b id="x">4</b>
+        <abc:d width="100">5</abc:d>}
+    @soup = BeautifulStoneSoup.new(ml)
+  end
+  def test_fetch_by_name
+    matching = @soup.find_all('a')
+    assert_equal(matching.length, 2)
+    assert_equal(matching[0].name, 'a')
+    assert_equal(matching[0], @soup.find('a'))
+    assert_equal(@soup.find('abc:d').contents.length, 1)
+    firstB = @soup.find('b')
+    nextB = firstB.find_next('b')
+    assert_equal(nextB.contents[0], '4')
+    assert_equal(nextB['id'], 'x')
+  end
+  def test_fetch_by_block
+    a = @soup.find_all('a')
+    b = @soup.find_all do |x|
+      x.is_a? Tag and x.name == 'a'
+    end
+    assert_equal(a,b)
+    a = @soup.find_text('3')
+    b = @soup.find_text do |x|
+      x.is_a? NavigableString and x == '3'
+    end
+    assert_equal(a,b)
+    matching = @soup.find_all do |x|
+      x.respond_to?('name') and x.name == x['id']
+    end
+    assert_equal(matching.length, 2)
+    assert_equal(matching[0].name, 'a')
+  end
+  def test_fetch_by_attribute
+    matching = @soup.find_all(nil, :attrs=>{'id' => 'x'})
+    assert_equal(matching.length, 2)
+    assert_equal(matching[0].name, 'a')
+    assert_equal(matching[1].name, 'b')
+    assert_equal(@soup.find_all(nil, :attrs=>{'id' => nil}).length, 1)
+    assert_equal(@soup.find_all(nil, :attrs=>{'id' => nil}).length, 1)
+    assert_equal(@soup.find_all(nil, :attrs=>{'width' => 100}).length, 1)
+  end
+  def test_tag_name_as_method
+    firstB = @soup.find('b')
+    assert_equal(firstB, @soup.b)
+    assert_equal(firstB, @soup.b_tag)
+  end
+  def test_fetch_by_list
+    matching = @soup.find_all(['a', 'abc:d'])
+    assert_equal(matching.length, 3)
+  end
+  def test_fetch_by_hash
+    matching = @soup.find_all({'a' => true, 'b' => true})
+    assert_equal(matching.length, 4)
+  end
+  def test_fetch_by_re
+    r = /a.*/
+    assert_equal(@soup.find_all(r).length, 3)
+  end
+  def test_fetch_by_method
+    proc = Proc.new { |x| return x.name == x['id'] }
+    matching = @soup.find_all(proc)
+    assert_equal(matching.length, 2)
+    assert_equal(matching[0].name, 'a')
+  end
+end
+#Testing the integrity of the parse tree.
+class FollowThatTag < SoupTest
+  @@PROXIMITY_TEST = BeautifulStoneSoup.new('<b id="1"><b id="2"><b id="3"><b id="4">')
+  @@SIBLING_TEST = BeautifulStoneSoup.new('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')
+  def test_parents
+    soup = BeautifulSoup.new('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah</b></ul></ul></ul>')
+    b = soup.find('b')
+    assert_equal(b.find_parents('ul', :attrs=>{'id' => 'foo'}).length, 2)
+    assert_equal(b.find_parent('ul')['a'], 'b')
+  end
+  def test_next_sibling
+    soup = @@SIBLING_TEST
+    tag = 'blockquote'
+    b = soup.find(tag, :attrs=>{'id' => 2})
+    assert_equal(b.find_next(tag)['id'], '2.1')
+    assert_equal(b.find_next_sibling(tag)['id'], '3')
+    assert_equal(b.find_next_sibling(tag)['id'], '3')
+    assert_equal(b.find_next_siblings(tag).length, 2)
+    assert_equal(b.find_next_siblings(tag, :attrs=>{'id' => 4}).length, 1)
+  end
+  def test_previous_sibling
+    soup = @@SIBLING_TEST
+    tag = 'blockquote'
+    b = soup.find(tag, :attrs=>{'id' => 3})
+    assert_equal(b.find_previous(tag)['id'], '2.1')
+    assert_equal(b.find_previous_sibling(tag)['id'], '2')
+    assert_equal(b.find_previous_sibling(tag)['id'], '2')
+    assert_equal(b.find_previous_siblings(tag).length, 2)
+    assert_equal(b.find_previous_siblings(tag, :attrs=>{'id' => 1}).length, 1)
+  end
+  def test_text_navigation
+    soup = BeautifulSoup.new('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
+    baz = soup.find_text('Baz')
+    assert_equal(baz.find_parent("i")['id'], '1')
+    assert_equal(baz.find_next(nil, :text=> 'Blee'), 'Blee')
+    assert_equal(baz.find_next_sibling(nil, :text=>'Blee'), 'Blee')
+    assert_equal(baz.find_next_sibling(nil, :text=>'Blargh'), nil)
+    assert_equal(baz.find_next_sibling('hr')['id'], '1')
+  end
+end
+#Tests the next_sibling and previous_sibling navigation.
+class SiblingRivalry < SoupTest
+    def test_siblings
+      soup = BeautifulSoup.new("<ul><li>1<p>A</p>B</li><li>2</li><li>3</li></ul>")
+      second_li = soup.find('li').next_sibling
+      assert_equal(second_li.name, 'li')
+      assert_equal(second_li.string, '2')
+      assert_equal(soup.find_text('1').next_sibling.name, 'p')
+      assert_equal(soup.find('p').next_sibling, 'B')
+      assert_equal(soup.find('p').next_sibling.previous_sibling.next_sibling,
+                   'B')
+    end
+end
+#Tests the various built-in functions of Tag objects.
+class TagsAreObjectsToo < SoupTest
+  @@SOUP = BeautifulSoup.new('<top id="1">1<b>2</b>3</top>')
+  def test_length
+    assert_equal(@@SOUP.top.length, 3)
+  end
+  def test_hash_lookup
+    assert_equal(@@SOUP.top['id'], "1")
+  end
+  def test_iterator
+    bucket = []
+    @@SOUP.top.each do |x|
+      bucket << x
+    end
+    assert_equal(bucket.length, 3)
+    assert_equal(bucket[2], "3")
+  end
+end
+#Tests the use of 'string' as an alias for a tag's only content.
+class StringEmUp < SoupTest
+  def test_string
+    s = BeautifulSoup.new('<b>foo</b>')
+    assert_equal(s.b.string, 'foo')
+  end
+  def test_lack_of_string
+    s = BeautifulSoup.new("<b>f<i>e</i>o</b>")
+    self.assert_equal(s.b.string, nil)
+  end
+end
+#Tests the limit argument.
+class ThatsMyLimit < SoupTest
+  def test_basic_limits
+    s = BeautifulSoup.new('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
+    assert_equal(s.find_all('br').length, 4)
+    assert_equal(s.find_all('br', :limit=> 2).length, 2)
+  end
+end
+#Testing the modification of the tree.
+class WriteOnlyCode < SoupTest
+  def test_replace_contents
+    soup = BeautifulSoup.new('<a>foo</a>')
+    soup.a.contents[0] = (NavigableString.new('bar'))
+    assert_equal(soup.render_contents, '<a>bar</a>')
+  end
+  def test_modify_attributes
+    soup = BeautifulSoup.new('<a id="1"></a>')
+    first_a = soup.find('a')
+    first_a['id'] = 2
+    assert_equal(soup.render_contents, '<a id="2"></a>')
+    first_a['id'] = nil
+    assert_equal(soup.render_contents, '<a></a>')
+    first_a['id2'] = 'foo'
+    assert_equal(soup.render_contents, '<a id2="foo"></a>')
+    first_a.delete('id2')
+    assert_equal(soup.render_contents, '<a></a>')
+  end
+  #Makes sure tags don't step on each others' toes.
+  def test_new_tag_
+    soup = BeautifulSoup.new('')
+    a = Tag.new(soup, 'a')
+    ol = Tag.new(soup, 'ol')
+    a["href"] = "http://foo.com/"
+    assert_equal(ol["href"], nil)
+  end
+end
+#Our operators do it all! Call now!
+class OperatorOverload < SoupTest
+  def test_tag_name_as_find
+    # Tests that referencing a tag name as a member delegates to find.
+    soup = BeautifulSoup.new('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
+    assert_equal(soup.b.i, soup.find('b').find('i'))
+    assert_equal(soup.b.i.string, 'bar')
+    assert_equal(soup.b['id'], '1')
+    assert_equal(soup.b.contents[0], 'foo')
+    assert(soup.a == nil)
+    #Test the .foo_tag variant of .foo.
+    assert_equal(soup.b_tag.i_tag.string, 'bar')
+    assert_equal(soup.b.i_tag.string, 'bar')
+    assert_equal(soup.find('b').find('i'), soup.b_tag.i_tag)
+  end
+end
+#Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!
+class NestableEgg < SoupTest
+  def test_para_inside_blockquote
+    soup = BeautifulSoup.new('<blockquote><p><b>Foo</b></p></blockquote><p>Bar')
+    assert_equal(soup.blockquote.p.b.string, 'Foo')
+    assert_equal(soup.blockquote.b.string, 'Foo')
+    assert_equal(soup.find('p', :recursive=>false).string, 'Bar')
+  end
+  def test_nested_tables
+    text = %{<table id="1"><tr><td>Here's another table:
+        <table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>}
+    soup = BeautifulSoup.new(text)
+    assert_equal(soup.table.table.td.string, 'Juicy text')
+    assert_equal(soup.find_all('table').length, 2)
+    assert_equal(soup.table.find_all('table').length, 1)
+    assert_equal(soup.find('table', :attrs=>{'id' => 2}).parent.parent.parent.name,
+                 'table')
+  end
+  def test_bad_nested_tables
+    soup = BeautifulSoup.new("<table><tr><table><tr id='nested'></tr></table></tr></table>")
+    assert_equal(soup.table.tr.table.tr['id'], 'nested')
+  end
+end
+#Here we test cleanup of text that breaks an unaltered parser or is just
+#obnoxious.
+class CleanupOnAisleFour < SoupTest
+     def test_self_closing_tag
+       assert_equal(BeautifulStoneSoup.new("Foo<br/>Bar").find('br').to_s,
+                      '<br />')
+       assert_soup_equals('<p>test1<br/>test2</p>',
+                          '<p>test1<br />test2</p>')
+     end
+     def test_bad_closing_tags
+       BeautifulStoneSoup.new("<a>Foo<b>Bar</a>")
+     end
+     def test_premature_closing_tag
+       BeautifulStoneSoup.new("</b><a>Foo<b>Bar</a>")
+     end
+     def test_bad_doctype
+       assert_soup_equals("<!DOCTYPE foo='bar'>")
+     end
+     def test_whitespace_in_declaration
+       assert_soup_equals('<! DOCTYPE>', '<!DOCTYPE>')
+     end
+     def test_JunkInDeclaration
+       assert_soup_equals('<! Foo = -8>a', '<!Foo = -8>a')
+     end
+     def test_incomplete_declaration
+       assert_soup_equals('a<!b <p>c', 'a<!b <p>c</p>')
+     end
+     def test_valid_but_bogus_declaration
+       assert_soup_equals('<! Foo >a', '<!Foo >a')
+     end
+     #This fails for a totally bogus reason! I can't figure it out.
+     #def test_smart_quotes_not_so_smart_anymore_FAILS
+     #  assert_soup_equals("\x91Foo\x92", '&lsquo;Foo&rsquo;')
+     #end
+     #def test_incomplete_declaration_at_endFAILS
+     #  assert_soup_equals('a<!b')
+     #end
+end
+#Verifies that the parser treats multiple feed calls the same as one
+#big feed call only if constructed with
+#:initial_text_is_everything => false.
+class KeepOnParsing < SoupTest
+    def test_multiple_parse_calls
+      f1 = '<foo>bah<bar>'
+      f2 = 'blee</bar></foo>'
+      s1 = BeautifulSoup.new(f1+f2)
+      s2 = BeautifulSoup.new(f1)
+      s2.feed(f2)
+      s3 = BeautifulSoup.new(f1, :initial_text_is_everything => false)
+      s3.feed(f2)
+      assert_not_equal(s1, s2)
+      assert_equal(s1, s3)
+    end
+end
+#Verifies that BeautifulSOAP parser works.
+class SOAPMeUp < SoupTest
+  def test_basic_soap
+    s = "<foo><bar>baz</bar></foo>"
+    soup = BeautifulSOAP.new(s)
+    assert_equal(soup.to_s, %{<foo bar="baz"><bar>baz</bar></foo>})
+  end
+  def test_dont_overwrite_existing_attr
+    s = %{<foo bar="don't kill me!"><bar>baz</bar></foo>}
+    soup = BeautifulSOAP.new(s)
+    assert_equal(soup.to_s, s)
+  end
+end
+#The Unicode test suite has not yet been ported because I haven't
+#figured out how Ruby does Unicode.
+# class UnicodeRed < SoupTest
+#     "Makes sure Unicode works."
+#     def setUp
+#         text = 'foo<b>bar</b>'
+#         @soup = BeautifulStoneSoup
+#         @soup.feed(text)
+#     def test_BasicUnicode
+#         import types
+#         sType = types.StringType
+#         uType = types.UnicodeType
+#         u = u'\3100'
+#         #It starts out ASCII...
+#         assert_equal(type(@soup.renderContents), sType)
+#         assert_equal(type(@soup.prettify), sType)
+#         #But you can have unicode if you want.
+#         assert_equal(type(unicode(@soup)), uType)
+#         #Add a Unicode character and it's Unicode.
+#         @soup.feed(u)
+#         assert_equal(type(@soup.renderContents), uType)
+#         assert_equal(type(@soup.prettify), uType)
+#         #But you can have ASCII if you want.
+#         assert_equal(type(str(@soup)), sType)
+#         #The part without any Unicode is still ASCII.
+#         assert_equal(type(@soup.b.prettify), sType)
+#         #But if you add a Unicode character it'll become Unicode.
+#         @soup.b['foo'] = u'\3100'
+#         assert_equal(type(@soup.b.prettify), uType)

metadata ADDED Viewed

@@ -0,0 +1,52 @@
+--- !ruby/object:Gem::Specification
+rubygems_version: 0.8.4
+specification_version: 1
+name: rubyful_soup
+version: !ruby/object:Gem::Version
+  version: 1.0.1
+date: 2005-10-21
+summary: An HTML/XML parser that handles bad markup and provides tree traversal methods.
+require_paths:
+  - lib
+email: leonardr@segfault.org
+homepage: http://www.crummy.com/software/RubyfulSoup/
+rubyforge_project:
+description: "Rubyful Soup is a *ML parser that makes screen-scraping easy. It won't choke on
+  bad markup, and it's easy to locate the part of a document you want."
+autorequire:
+default_executable:
+bindir: bin
+has_rdoc: true
+required_ruby_version: !ruby/object:Gem::Version::Requirement
+  requirements:
+    -
+      - ">"
+      - !ruby/object:Gem::Version
+        version: 0.0.0
+  version:
+platform: ruby
+authors:
+  - Leonard Richardson
+files:
+  - lib/rubyful_soup.rb
+  - tests/rubyful_soup_tests.rb
+  - CHANGELOG
+test_files:
+  - tests/rubyful_soup_tests.rb
+rdoc_options: []
+extra_rdoc_files:
+  - CHANGELOG
+executables: []
+extensions: []
+requirements: []
+dependencies:
+  - !ruby/object:Gem::Dependency
+    name: htmltools
+    version_requirement:
+    version_requirements: !ruby/object:Gem::Version::Requirement
+      requirements:
+        -
+          - ">"
+          - !ruby/object:Gem::Version
+            version: 0.0.0
+      version: