RubyGems - rubyful_soup_2011 - Versions diffs - 0.1.5 - Mend

rubyful_soup_2011 0.1.5

Files changed (3) hide show

data/lib/rubyful_soup.rb +950 -0
data/tests/rubyful_soup_tests.rb +441 -0
metadata +57 -0

data/lib/rubyful_soup.rb ADDED Viewed

@@ -0,0 +1,950 @@
+#Rubyful Soup
+#Elixir and Tonic
+#"The Screen-Scraper's Friend"
+#v1.0.4
+#http://www.crummy.com/software/RubyfulSoup/
+#
+#Rubyful Soup is a port to the Ruby language and idiom of the Python
+#library Beautiful Soup.
+#See http://www.crummy.com/software/BeautifulSoup/ for details on the original.
+#This library requires the sgml-parser library, written by Takahiro
+#Maebashi. The easiest way to get it is to install the "htmltools"
+#gem.
+require 'html/sgml-parser'
+require 'set'
+#UTF-8 voodoo--does this really work?
+$KCODE = 'u'
+# require 'jcode'
+#This code makes SGMLParser able to parse XML with namespaces.
+class HTML::SGMLParser
+  if const_defined? :Tagfind
+    remove_const(:Tagfind)
+    Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/
+  end
+end
+module PageElement
+  attr_reader :parser
+  attr_accessor :parent, :previous_parsed, :next_parsed, :previous_sibling
+  attr_accessor :next_sibling
+  def setup(parent=nil, previous_parsed=nil)
+    @parent = parent
+    @previous_parsed = previous_parsed
+    @next_parsed = nil
+    @previous_sibling = nil
+    @next_sibling = nil
+    if @parent and not @parent.contents.empty?
+      @previous_sibling = @parent.contents[-1]
+      @previous_sibling.next_sibling = self
+    end
+  end
+  #A bunch of different iterators over a parsed document.
+  #
+  #Each entry below maps the name of a generated iterator method to the
+  #link attribute it follows. The generated method starts at self,
+  #repeatedly follows that link, and yields every non-nil element it
+  #reaches (self itself is never yielded). The methods are defined by
+  #class_eval on a source string and always yield, so they are meant to
+  #be called with a block.
+  {
+    #Iterates in parse order over the rest of the items in this document.
+    :next_parsed_items => :next_parsed,
+    #Iterates in reverse parse order over all previously parsed items in
+    #this document.
+    :previous_parsed_items => :previous_parsed,
+    #Iterates in parse order over all subsequent siblings of this item.
+    :next_siblings => :next_sibling,
+    #Iterates in reverse parse order over all prior siblings of this item.
+    :previous_siblings => :previous_sibling,
+    #Iterates upwards through the parentage of this item.
+    :parents => :parent
+  }.each do |k,v|
+    class_eval %{
+    def #{k}
+      i = self
+      while i
+        i = i.#{v}
+          yield i if i
+      end
+    end
+  }
+end
+[ #Returns first item/all items matching the given criteria and
+  #appearing after this PageElement in the document.
+  [:find_next, :find_all_next, 'next_parsed_items'],
+  #Returns first item/all items matching the given criteria and
+  #appearing before this PageElement in the document.
+  [:find_previous, :find_all_previous, 'previous_parsed_items'],
+  #Returns the nearest sibling/all siblings of this PageElement matching
+  #the given criteria and appearing before this PageElement in
+  #the document.
+  [:find_previous_sibling, :find_previous_siblings, 'previous_siblings'],
+  #Returns the nearest sibling/all siblings of this PageElement matching
+  #the given criteria and appearing after this PageElement in
+  #the document
+  [:find_next_sibling, :find_next_siblings, 'next_siblings'],
+  #Returns the nearest parent/all parents of this PageElement matching
+  #the given criteria.
+  [:find_parent, :find_parents, 'parents'],
+].each do |singular, plural, method_name|
+  class_eval %{
+    def #{singular}(name=nil, args={}, &block)
+      args['limit'] = 1
+      fetch(method('#{method_name}'), name, args, block)[0]
+    end
+    def #{plural}(name=nil, args={}, &block)
+      fetch(method('#{method_name}'), name, args, block)
+    end
+  }
+end
+  protected
+  #Returns a list of items matching the given criteria, obtained by
+  #iterating over the given iterator.
+  def fetch(iterator, name, args, block)
+    attrs = args[:attrs]
+    limit = args[:limit]
+    text = args[:text]
+    attrs ||= {}
+    if attrs != nil and not attrs.respond_to? :keys
+      attrs = {'class' => attrs}
+    end
+    bucket = []
+    catch(:stop_iteration) do
+      iterator.call do |item|
+        match = false
+        if block
+          match = true if block.call(item)
+        elsif item.is_a? Tag
+          #A tag matches if its name matches and its attributes line up.
+          if not text and (not name or PageElement.matches(item, name))
+            match = true
+            attrs.each_pair do |attr, matchAgainst|
+              check = item[attr]
+              unless PageElement.matches(check, matchAgainst)
+                match = false
+                break
+              end
+            end
+          end
+        elsif text
+          #A text matches if its string value matches the given text
+          #criterion.
+          match = PageElement.matches(item, text)
+        end
+        if match
+          bucket.push(item)
+          if limit and bucket.length >= limit
+            throw :stop_iteration
+          end
+        end
+      end
+    end
+    return bucket
+  end
+  #Used to tell whether a Tag or a NavigableString "matches" some data
+  #structure.
+  def PageElement.matches(chunk, how_to_match)
+    #puts "Seeing if #{chunk.class} #{chunk} matches #{how_to_match.class} #{how_to_match}."
+    #
+    # If given a list of items, return true if the list contains a
+    # text element that matches.
+    if chunk.is_a? Array
+      chunk.each do |tag|
+        return true if tag.is_a? NavigableString and matches(tag, how_to_match)
+      end
+      return false
+    elsif how_to_match.is_a? Proc
+      return how_to_match.call(chunk)
+    elsif chunk.is_a? Tag
+      #Custom match methods take the tag as an argument, but all other
+      #ways of matching match the tag name as a string
+      chunk = chunk.name
+    end
+    #At this point we know that chunk is a string
+    unless chunk.is_a? String
+      chunk = chunk.to_s
+    end
+    if how_to_match.is_a? Regexp
+      return how_to_match.match(chunk) != nil
+    elsif how_to_match.is_a? Array
+      return how_to_match.find {|x| x == chunk} != nil
+    elsif how_to_match.is_a? Hash
+      return how_to_match[chunk] != nil
+    else
+      #It's just a string
+      return how_to_match.to_s == chunk
+    end
+  end
+end
+module TagModule
+  include Enumerable
+  include PageElement
+  attr_accessor :name, :contents, :attrs, :string
+  #I tried to have Tag subclass Method, but it killed the
+  #whole thing. Maybe I should just leave well enough alone.
+  #
+  #def arity
+  #  return methods('find_all').arity
+  #end
+  #
+  #def call(*args)
+  #  return find_all(*args)
+  #end
+  #
+  #def to_proc
+  #  return methods('find_all').to_proc
+  #end
+  def initialize(parser, name, attr_list=[], parent=nil, previous=nil)
+    @hidden = false
+    @parser = parser
+    @name = name
+    @attr_list = attr_list
+    @attrs = nil
+    @contents = []
+    setup(parent, previous)
+  end
+  # Turn the list of attributes into a hash on demand, so we don't have
+  # to do it for every tag while parsing.
+  def attrs
+    unless @attrs
+      @attrs = @attr_list.inject({}) do |m,v|
+        if v[1][0] == ?" and v[1][-1] == ?"
+            v[1] = v[1][1..-2]
+        end
+        m[v[0]] = v[1]
+        m
+      end
+      @attr_list = nil
+    end
+    return @attrs
+  end
+  #soup.title_tag, or soup.title, is the same as soup.find('title')
+  def method_missing(name, *args)
+    #puts "Missing method #{name} for #{self.class.name}"
+    name = name.to_s
+    if name[-4...name.length] == '_tag'
+        name = name[0...name.length-4]
+    end
+    return find(name, *args)
+  end
+  #Hash-style access to this tag's attributes. All three methods go
+  #through attrs, so the attribute hash is built on first use.
+  #Returns the value stored for attribute k (nil when absent).
+  def [](k)
+    attrs[k]
+  end
+  #Sets attribute k to v.
+  def []=(k, v)
+    attrs[k] = v
+  end
+  #Removes attribute k and returns whatever Hash#delete returns for it.
+  def delete(k)
+    attrs.delete(k)
+  end
+  def has_key?(k)
+    attrs.has_key(k)
+  end
+  def each
+    @contents.each { |x| yield x }
+  end
+  def length
+    return contents.length
+  end
+  alias size length
+  def self_closing?
+    return @parser.self_closing_tag?(@name)
+  end
+  #Adds the given tag to the contents of this tag
+  def append(tag)
+    @contents.push(tag)
+  end
+  def to_str
+    return to_s
+  end
+  #Renders this tag and its contents as a pretty-printed string.
+  def prettify
+    return to_s(true)
+  end
+  def inspect
+    to_s
+  end
+  #Renders this tag and its contents as a string.  NOTE: since REXML
+  #consumes whitespace, this method is not certain to reproduce the
+  #whitespace present in the original string.
+  #
+  #show_structure_indent: nil for plain output; true (as passed by
+  #prettify) or an integer depth for indented output, where each visible
+  #tag is placed on its own line.
+  #Attributes whose value is nil or false are left out. A hidden tag
+  #renders only its contents, without its own start and end tags.
+  def to_s(show_structure_indent=nil)
+    attr_strings = []
+    attrs.each { |k,v| attr_strings << %{#{k}="#{v}"} if v }
+    if self_closing?
+      close = ' /'
+      closeTag = nil
+    else
+      close = nil
+      closeTag = "</#{name}>"
+    end
+    #true means "start pretty-printing at depth 0"; an integer is the
+    #depth handed down by the enclosing tag.
+    indent_increment = show_structure_indent==true ? 0 : show_structure_indent
+    if show_structure_indent
+      #Hidden tags print nothing themselves, so they add no depth.
+      indent_increment += 1 unless @hidden
+    end
+    contents = render_contents(indent_increment)
+    space = "\n #{' ' * indent_increment}" if show_structure_indent
+    if @hidden
+      s = contents
+    else
+      s = []
+      attribute_string = ''
+      unless attr_strings.empty?
+        attribute_string = ' ' + attr_strings.join(' ')
+      end
+      s.push(space) if show_structure_indent
+      s.push("<#{@name}#{attribute_string}#{close}>")
+      s.push(contents)
+      s.push(space) if closeTag and show_structure_indent
+      #closeTag is nil for self-closing tags; join('') renders nil as ''.
+      s.push(closeTag)
+      s = s.join('')
+    end
+    return s
+  end
+  #Renders the contents of this tag as a string.
+  def render_contents(show_structure_indent=nil)
+    s=[]
+    @contents.each do |c|
+      text = nil
+      if c.is_a? Tag
+        text = c.to_s(show_structure_indent)
+      else
+        text = c.to_s
+      end
+      if text
+        if show_structure_indent
+          text.chomp!
+        end
+        s.push(text)
+      end
+    end
+    return s.join('')
+  end
+  #Yields every descendant of this tag in document order (each element
+  #before its own children), using an explicit stack instead of
+  #recursion. Stack entries are [tag, index] pairs meaning "resume
+  #scanning tag.contents at index".
+  #A block may throw :stop_iteration to end the walk early.
+  def recursive_children
+    stack = [[self, 0]]
+    catch(:stop_iteration) do
+      until stack.empty?
+        tag, start = stack.pop
+        for i in start...tag.contents.length
+          a = tag.contents[i]
+          yield a
+          if a.is_a? TagModule and not tag.contents.empty? and i < tag.contents.length
+            #Descend: remember where to resume in the current tag, then
+            #handle the child's contents first (it is popped next).
+            stack.push([tag, i+1])
+            stack.push([a, 0])
+            break
+          end
+        end if tag.is_a? TagModule
+      end
+    end
+  end
+  #Iterates over the direct children of this Tag.
+  def children
+    catch(:stop_iteration) { @contents.each { |x| yield x } }
+  end
+  #Convenience method to retrieve the first piece of text matching the
+  #given criteria. 'text' can be a string, a regular expression object,
+  #a Proc that takes a string and returns whether or not the
+  #string 'matches', etc.
+  def find_text(text=nil, &block)
+    args = { :text => text, :limit => 1}
+    iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
+    fetch(iterator, nil, args, block)[0]
+  end
+  #Convenience method to retrieve all pieces of text matching the
+  #given criteria. 'text' can be a string, a regular expression object,
+  #a callable that takes a string and returns whether or not the
+  #string 'matches', etc.
+  #Args: :limit
+  def find_all_text(text=nil, args={}, &block)
+    args['text'] = text
+    iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
+    fetch(iterator, nil, args, block)
+  end
+  #Extracts a list of Tag objects that match the given criteria.  You
+  #can specify the name of the Tag and any attributes you want the Tag
+  #to have.
+  #
+  #The value of a key-value pair in the 'attrs' map can be a string, a
+  #list of strings, a regular expression object, or a Proc object that
+  #takes a string and returns whether or not the string matches for
+  #some custom definition of 'matches'. The same is true of the tag
+  #name, except that a Proc object will be passed the Tag object instead
+  #of just a string.
+  #Args: :attrs :text :limit :recursive
+  def find_all(name=nil, args={}, &block)
+    iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
+    fetch(iterator, name, args, block)
+  end
+  #Returns the first Tag or NavigableString object that matches the
+  #given criteria. Takes much the same arguments as fetch.
+  #args: :attrs :text :limit :recursive
+  def find(name=nil, args={}, &block)
+    args[:limit] = 1
+    iterator = method(args[:recursive] == false ? 'children' : 'recursive_children')
+    fetch(iterator, name, args, block)[0]
+  end
+end
+#A markup element in the parsed tree. All of its behaviour comes from
+#TagModule, which the parser classes mix in as well.
+class Tag
+  include TagModule
+end
+#A run of text in the parsed tree: a String that also carries the
+#PageElement navigation links and finder methods.
+class NavigableString < String
+  include PageElement
+end
+#This class contains the basic parser and fetch code. It defines
+#a parser that knows nothing about tag behavior except for the
+#following:
+#
+#You can't close a tag without closing all the tags it encloses.
+#That is, "<foo><bar></foo>" actually means
+#"<foo><bar></bar></foo>".
+#
+#[Another possible explanation is "<foo><bar /></foo>", but since
+# this class defines no self_closing_tags, it will never use that
+# explanation.]
+#
+#This class is useful for parsing XML or made-up markup languages,
+#or when BeautifulSoup makes an assumption counter to what you were
+#expecting.
+class BeautifulStoneSoup < HTML::SGMLParser
+  include TagModule
+  #As a public service we will by default silently replace MS smart quotes
+  #and similar characters with their HTML or ASCII equivalents.
+  #Keys are the single bytes 0x80-0x9f. Every key must be double-quoted:
+  #in a single-quoted literal '\x80' is four ordinary characters and
+  #would never match the byte it is meant to stand for.
+  @@ms_chars = { "\x80" => '&euro;',
+    "\x81" => ' ',
+    "\x82" => '&sbquo;',
+    "\x83" => '&fnof;',
+    "\x84" => '&bdquo;',
+    "\x85" => '&hellip;',
+    "\x86" => '&dagger;',
+    "\x87" => '&Dagger;',
+    "\x88" => '&caret;',
+    "\x89" => '%',
+    "\x8A" => '&Scaron;',
+    "\x8B" => '&lt;',
+    "\x8C" => '&OElig;',
+    "\x8D" => '?',
+    "\x8E" => 'Z',
+    "\x8F" => '?',
+    "\x90" => '?',
+    "\x91" => '&lsquo;',
+    "\x92" => '&rsquo;',
+    "\x93" => '&ldquo;',
+    "\x94" => '&rdquo;',
+    "\x95" => '&bull;',
+    "\x96" => '&ndash;',
+    "\x97" => '&mdash;',
+    "\x98" => '&tilde;',
+    "\x99" => '&trade;',
+    "\x9a" => '&scaron;',
+    "\x9b" => '&gt;',
+    "\x9c" => '&oelig;',
+    "\x9d" => '?',
+    "\x9e" => 'z',
+    "\x9f" => '&Yuml;'}
+  #Default clean-up rules that feed applies to incoming text, as
+  #[pattern, replacement] pairs; the replacement is either a substitution
+  #string or a Proc that receives the matched text:
+  # 1. rewrite "<x/>" as "<x></x>",
+  # 2. drop whitespace between "<!" and the declaration that follows,
+  # 3. map bytes 0x80-0x9f through @@ms_chars.
+  @@parser_massage = [[/<([^<>]*)\/>/, '<\1></\1>'],
+    [/<!\s+([^<>]*)>/, '<!\1>'],
+    [/([\x80-\x9f])/m, proc { |m| @@ms_chars[m]}]
+  ]
+  #Name given to the root of the tree (the parser object itself).
+  @@rootTagName = '[document]'
+  #Tag-behaviour tables. They are empty here, so this class treats all
+  #tags alike; BeautifulSoup fills some of them in.
+  @@nestable_tags = {}
+  @@reset_nesting_tags = {}
+  @@quoteTags = {}
+  @@self_closing_tags = {}
+  attr_accessor :hidden
+  def self_closing_tag?(tag)
+    @@self_closing_tags.has_key?(tag)
+  end
+  #Args: :initial_text_is_everything, :avoid_parser_problems, :parse_only_these
+  def initialize(text, args={})
+    super(self, @@rootTagName)
+    @quote_stack = []
+    @hidden = 1
+    if args[:parse_only_these]
+      @parse_only_these = Set.new
+      p = args[:parse_only_these]
+      if p.respond_to? :each
+        p.each { |x| @parse_only_these << x }
+      else
+        @parse_only_these << p
+      end
+    else
+      @parse_only_these = nil
+    end
+    reset
+    @avoid_parser_problems = args[:avoid_parser_problems] || true
+    if @avoid_parser_problems and not @avoid_parser_problems.is_a? Enumerable
+      @avoid_parser_problems = @@parser_massage
+    end
+    feed(text) if text != nil
+    done if args[:initial_text_is_everything] != false
+  end
+  def feed(text)
+    if @avoid_parser_problems
+      #before = text.clone
+      @avoid_parser_problems.each do |re, fix|
+        if fix.is_a? String
+          text.gsub!(re, fix)
+        else
+          text.gsub!(re) { |x| fix.call(x) }
+        end
+      end
+      #if before != text
+      #  puts "Changed from #{before} to #{text}"
+      #end
+    end
+    super
+  end
+  def ==(anObject)
+    return anObject != nil && anObject.to_s == to_s
+  end
+  def done
+    end_text
+    pop_tag while @currentTag.name != @@rootTagName
+  end
+  #Resets the underlying parser and this object's own parse state: the
+  #pending text buffer and the tag stack are emptied, and the parser
+  #itself is pushed as the root of a fresh stack.
+  def reset
+    super
+    @currentText = []
+    @currentTag = nil
+    @tag_stack = []
+    push_tag(self)
+  end
+  def push_tag(tag)
+    #puts "Push #{ tag.name }"
+    @currentTag.append(tag) if @currentTag
+    @tag_stack.push(tag)
+    @currentTag = @tag_stack[-1]
+  end
+  def pop_tag
+    tag = @tag_stack.pop
+    #puts "Pop #{ tag.name }"
+    # Tags with just one string-owning child get the child as a
+    # 'string' property, so that soup.tag.string is shorthand for
+    # soup.tag.contents[0]
+    if @currentTag.contents.length == 1 and @currentTag.contents[0].is_a? NavigableString
+      @currentTag.string = @currentTag.contents[0]
+    end
+    @currentTag = @tag_stack[-1] unless @tag_stack.empty?
+    @currentTag
+  end
+  # StreamListener implementation
+  def unknown_starttag(name, attrs)
+    #puts "Starting tag #{name} #{attrs.inspect}"
+    unless @quote_stack.empty?
+      #This is not a real tag.
+      #puts "<#{name}> is not real!"
+      #TODO: find idiomatic way to do this
+      attrString = []
+      attrs.each { |k,v| attrString.push('#{k}="#{v}"') }
+      self.handle_data('<#{name} #{attrString.join(' ')}>')
+      return
+    end
+    end_text
+    return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)
+    self_closing = @@self_closing_tags.has_key?(name)
+    smart_pop(name) unless self_closing
+    tag = Tag.new(self, name, attrs, @currentTag, @previous_parsed)
+    @previous_parsed.next_parsed = tag if @previous_parsed
+    @previous_parsed = tag
+    push_tag(tag)
+    pop_tag if self_closing
+    if @@quoteTags.has_key?(name)
+      #puts "Beginning quote (#{name})"
+      @quote_stack.push(name)
+    end
+  end
+  def unknown_endtag(name)
+    #Ignore tag_end calls for self-closing tags; they were
+    #closed in the tag_start call.
+    #TODO: still neccessary?
+    #puts "Ending tag #{name}"
+    return if @@self_closing_tags.has_key?(name)
+    if not @quote_stack.empty? and @quote_stack[-1] != name
+      #This is not a real end tag.
+      #puts "</#{name}> is not real!"
+      handle_data('</#{name}>')
+      return
+    end
+    return unless !@parse_only_these or @tag_stack.size > 1 or @parse_only_these.member?(name)
+    end_text
+    pop_to_tag(name)
+    @quote_stack.pop if not @quote_stack.empty? and @quote_stack[-1] == name
+  end
+  def handle_data(data)
+    return unless !@parse_only_these or @tag_stack.size > 1
+    @currentText.push(data)
+  end
+  #Propagate comments right through: the comment is re-wrapped in its
+  #delimiters and kept as ordinary text.
+  def handle_comment(data)
+    handle_data("<!--#{data}-->")
+  end
+  #Keeps other special markup as text, wrapped in angle brackets.
+  def handle_special(data)
+    handle_data("<#{data}>")
+  end
+  def unknown_charref(ref)
+    handle_data("&#{ref};")
+  end
+  def unknown_entityref(ref)
+    handle_data("%#{ref}")
+  end
+  #Keeps an ATTLIST declaration as text. Only raw_content is used; the
+  #other two arguments are ignored.
+  def attlistdecl(element_name, attributes, raw_content)
+    handle_data("<!ATTLIST #{raw_content}>")
+  end
+  def cdata(content)
+    handle_data("<![CDATA[#{content}]]")
+  end
+  ###
+  def doctype(*args)
+    content = args.join(' ')
+    ##{name} #{pub_sys}#{long_name}#{url}
+    #long_name = ' "#{long_name}"' if long_name
+    #url = ' "#{url}"' if url
+    handle_data("<!DOCTYPE #{content}>")
+  end
+  #The handlers below keep declarations and processing instructions in
+  #the document as plain text by re-wrapping what they are given.
+  def elementdecl(content)
+    handle_data("<!ELEMENT #{content}>")
+  end
+  #Deliberately empty: nothing is recorded for this callback.
+  def entity(content)
+  end
+  #content is joined with spaces, so it is expected to be a list.
+  def entitydecl(content)
+    handle_data("<!ENTITY #{content.join(' ')}>")
+  end
+  def instruction(name, instruction)
+    handle_data("<?#{name} #{instruction}>")
+  end
+  def notationdecl(content)
+    handle_data("<!NOTATION #{content}>")
+  end
+  def xmldecl(version, encoding, standalone)
+    encoding = ' encoding="#{encoding}"' if encoding
+    handle_data('<?xml version="#{version}"#{encoding}#{standalone}>')
+  end
+  #Called when we're done collecting some text, declarations, etc.
+  def end_text
+    currentText = @currentText.join('')
+    unless currentText.empty?
+      if currentText.strip.empty?
+        if currentText =~ /\n/
+          currentText = "\n"
+        else
+          currentText = ' '
+        end
+      end
+      #puts "Setting up text #{currentText}"
+      currentText = NavigableString.new(currentText)
+      currentText.setup(@currentTag, @previous_parsed)
+      @previous_parsed.next_parsed = currentText if @previous_parsed
+      @previous_parsed = currentText
+      @currentTag.contents.push(currentText)
+    end
+    @currentText = []
+  end
+  # Helper methods
+  private
+  #Pops the tag stack up to and including the most recent
+  #instance of the given tag. If inclusivePop is false, pops the tag
+  #stack up to but *not* including the most recent instance of
+  #the given tag.
+  def pop_to_tag(name, inclusive_pop=true)
+    return if name == @@rootTagName
+    #puts "Pop to tag #{ name }. Inclusive? #{inclusive_pop}"
+    num_pops = 0
+    mostRecentTag = nil
+    (@tag_stack.length-1).downto(0) do |i|
+      if name == @tag_stack[i].name
+        #puts "Found at #{i}, #{@tag_stack.length-i}"
+        num_pops = @tag_stack.length-i
+        break
+      end
+    end
+    num_pops -= 1 if not inclusive_pop
+    #puts "Popping #{num_pops} times."
+    num_pops.times { mostRecentTag = pop_tag }
+    mostRecentTag
+  end
+  #Called before a new start tag is pushed, to close implicitly any
+  #open tags the new tag cannot sit inside.
+  #
+  #We need to pop up to the previous tag of this type, unless
+  #one of this tag's nesting reset triggers comes between this
+  #tag and the previous tag of this type, OR unless this tag is a
+  #generic nesting trigger and another generic nesting trigger
+  #comes between this tag and the previous tag of this type.
+  #
+  #Examples:
+  # <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
+  # <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
+  # <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
+  # <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
+  #
+  # <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
+  # <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
+  # <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
+  def smart_pop(name)
+    #puts "Smart pop for #{name}"
+    #A tag is nestable when it has an entry in @@nestable_tags; the entry
+    #lists the tags that reset its nesting.
+    nesting_reset_triggers = @@nestable_tags[name]
+    is_nestable = nesting_reset_triggers != nil
+    is_reset_nesting = @@reset_nesting_tags.has_key?(name)
+    popTo = nil
+    inclusive = true
+    #Walk the open tags from the innermost outwards.
+    @tag_stack.reverse_each do |p|
+        if (p == nil or p.name == name) and not is_nestable
+            #Non-nestable tags get popped to the top or to their
+            #last occurrence.
+            #puts "Non-nestable tag #{name} gets popped to its last occurrence."
+            popTo = name
+            break
+        end
+        if (nesting_reset_triggers != nil and nesting_reset_triggers.include?(p.name)) or (nesting_reset_triggers == nil and is_reset_nesting and @@reset_nesting_tags.has_key?(p.name))
+          #If we encounter one of the nesting reset triggers
+          #peculiar to this tag, or we encounter another tag
+          #that causes nesting to reset, pop up to but not
+          #including that tag.
+          #puts "Nesting reset trigger encountered for #{name}: #{p.name}"
+          popTo = p.name
+          inclusive = false
+          break
+        end
+        #Reassigning the block variable does not affect the iteration;
+        #reverse_each supplies the next stack entry regardless.
+        p = p.parent
+    end
+    pop_to_tag(popTo, inclusive) if popTo
+  end
+  protected
+  #Turns a list of maps, lists, or scalars into a single map.
+  #Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
+  #of lists and partial maps.
+  def BeautifulStoneSoup.build_tag_map(default, *args)
+    built = args.inject({}) do |m, portion|
+      if portion.is_a? Hash
+        #It's a map. Merge it.
+        portion.each_pair { |k,v| m[k] = v }
+      elsif portion.is_a? Array
+        #It's a list. Map each item to the default.
+        portion.each { |k| m[k] = default }
+      else
+        #It's a scalar. Map it to the default.
+        m[portion] = default
+      end
+      m
+    end
+  end
+end
+#This parser knows the following facts about HTML:
+#
+#* Some tags have no closing tag and should be interpreted as being
+#  closed as soon as they are encountered.
+#
+#* The text inside some tags (ie. 'script') may contain tags which
+#  are not really part of the document and which should be parsed
+#  as text, not tags. If you want to parse the text as tags, you can
+#  always fetch it and parse it explicitly.
+#
+#* Tag nesting rules:
+#
+#  Most tags can't be nested at all. For instance, the occurrence of
+#  a <p> tag should implicitly close the previous <p> tag.
+#
+#   <p>Para1<p>Para2
+#    should be transformed into:
+#   <p>Para1</p><p>Para2
+#
+#  Some tags can be nested arbitrarily. For instance, the occurrence
+#  of a <blockquote> tag should _not_ implicitly close the previous
+#  <blockquote> tag.
+#
+#   Alice said: <blockquote>Bob said: <blockquote>Blah
+#    should NOT be transformed into:
+#   Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
+#
+#  Some tags can be nested, but the nesting is reset by the
+#  interposition of other tags. For instance, a <tr> tag should
+#  implicitly close the previous <tr> tag within the same <table>,
+#  but not close a <tr> tag in another table.
+#
+#   <table><tr>Blah<tr>Blah
+#    should be transformed into:
+#   <table><tr>Blah</tr><tr>Blah
+#    but,
+#   <tr>Blah<table><tr>Blah
+#    should NOT be transformed into
+#   <tr>Blah<table></tr><tr>Blah
+#
+#Differing assumptions about tag nesting rules are a major source
+#of problems with the BeautifulSoup class. If BeautifulSoup is not
+#treating as nestable a tag your page author treats as nestable,
+#try writing a subclass.
+class BeautifulSoup < BeautifulStoneSoup
+  @@self_closing_tags.replace(build_tag_map(nil, ['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame']))
+  @@quote_tags = {'script' => nil}
+  #According to the HTML standard, each of these inline tags can
+  #contain another tag of the same type. Furthermore, it's common
+  #to actually use these tags this way.
+  @@nestable_inline_tags = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center']
+  #According to the HTML standard, these block tags can contain
+  #another tag of the same type. Furthermore, it's common
+  #to actually use these tags this way.
+  @@nestable_block_tags = ['blockquote', 'div', 'fieldset', 'ins', 'del']
+  #Lists can contain other lists, but there are restrictions.
+  @@nestable_list_tags = { 'ol' => [],
+    'ul' => [],
+    'li' => ['ul', 'ol'],
+    'dl' => [],
+    'dd' => ['dl'],
+    'dt' => ['dl'] }
+  #Tables can contain other tables, but there are restrictions.
+  @@nestable_table_tags = {'table' => ['tr', 'td'],
+    'tr' => ['table'],
+    'td' => ['tr'],
+    'th' => ['tr'],
+  }
+  @@non_nestable_block_tags = ['address', 'form', 'p', 'pre']
+  #If one of these tags is encountered, all tags up to the next tag of
+  #this type are popped.
+  @@reset_nesting_tags.replace(build_tag_map(nil, @@nestable_block_tags, 'noscript', @@non_nestable_block_tags,
+   @@nestable_list_tags, @@nestable_table_tags))
+  @@nestable_tags.replace(build_tag_map([], @@nestable_inline_tags, @@nestable_block_tags, @@nestable_list_tags, @@nestable_table_tags))
+end
+# This class will push a tag with only a single string child into
+# the tag's parent as an attribute. The attribute's name is the tag
+# name, and the value is the string child. An example should give
+# the flavor of the change:
+#
+# <foo><bar>baz</bar></foo>
+# =>
+# <foo bar="baz"><bar>baz</bar></foo>
+#
+# You can then access fooTag['bar'] instead of fooTag.barTag.string.
+#
+# This is, of course, useful for scraping structures that tend to
+# use subelements instead of attributes, such as SOAP messages. Note
+# that it modifies its input, so don't print the modified version
+# out.
+class BeautifulSOAP < BeautifulStoneSoup
+  def pop_tag
+    if @tag_stack.size > 1
+      tag = @tag_stack[-1]
+      parent = @tag_stack[-2]
+      if (tag.is_a?(Tag) && tag.contents.size == 1 && \
+          tag.contents[0].is_a?(NavigableString) && !parent[tag.name])
+          parent[tag.name] = tag.contents[0]
+      end
+      super
+    end
+  end
+end
+#Enterprise class names! It has come to our attention that some people
+#think the names of the Rubyful Soup parser classes are too silly
+#and "unprofessional" for use in enterprise screen-scraping. We feel
+#your pain! For such-minded folk, the Rubyful Soup Consortium And
+#Rootin' Tootin' Texas Delicatessen recommends renaming this file to
+#"RobustParser.rb" (or, in cases of extreme enterprisitude,
+#"RobustParserBeanInterface.class") and using the following
+#enterprise-friendly class aliases:
+#Empty subclasses: each behaves exactly like the parser it inherits from.
+class RobustXMLParser < BeautifulStoneSoup; end
+class RobustHTMLParser < BeautifulSoup; end
+class SimplifyingSOAPParser < BeautifulSOAP; end
+#When this file is run as a script, parse the input named on the command
+#line (or standard input) and print it pretty-printed.
+print BeautifulSoup.new(ARGF.read).prettify if $0 == __FILE__