feedtools 0.2.22 → 0.2.23
- data/CHANGELOG +28 -0
- data/README +23 -2
- data/db/migration.rb +19 -0
- data/db/schema.mysql.sql +1 -1
- data/db/schema.postgresql.sql +1 -1
- data/db/schema.sqlite.sql +1 -1
- data/lib/feed_tools.rb +71 -388
- data/lib/feed_tools/database_feed_cache.rb +4 -3
- data/lib/feed_tools/feed.rb +809 -607
- data/lib/feed_tools/feed_item.rb +551 -574
- data/lib/feed_tools/feed_structures.rb +252 -0
- data/lib/feed_tools/helpers/feed_tools_helper.rb +6 -5
- data/lib/feed_tools/helpers/generic_helper.rb +16 -158
- data/lib/feed_tools/helpers/html_helper.rb +629 -0
- data/lib/feed_tools/helpers/retrieval_helper.rb +5 -0
- data/lib/feed_tools/helpers/uri_helper.rb +223 -0
- data/lib/feed_tools/helpers/xml_helper.rb +239 -0
- data/rakefile +10 -237
- data/test/unit/amp_test.rb +102 -94
- data/test/unit/atom_test.rb +239 -6
- data/test/unit/cache_test.rb +1 -1
- data/test/unit/encoding_test.rb +5 -5
- data/test/unit/generation_test.rb +34 -1
- data/test/unit/helper_test.rb +111 -17
- data/test/unit/rss_test.rb +21 -2
- metadata +7 -3
- data/lib/feed_tools/helpers/module_helper.rb +0 -27
data/CHANGELOG
CHANGED
@@ -1,3 +1,31 @@
+== FeedTools 0.2.23
+* autodiscovery implemented
+* now knows a title from a hole in the ground
+* now resolves relative urls when possible
+* changed default table name to "cached_feeds" to avoid name collisions
+* schema now uses "href" instead of "url"
+* feed cache is set to nil by default now
+* both summary and content elements are generated now
+* now supports proxies
+* now supports internationalized domain names if libidn is installed
+* fixed bug with feed merging referencing a method that was refactored
+* no longer dies if uuidtools gem is missing but the UUID class is defined
+* updated timestamp handling and generation
+* added support for entry sorting on any feed item field
+* added support for disabling entry sorting entirely
+* fixed issue with itunes categories
+* fixed itunes subtitle/summary
+* fixed entry assignment bug
+* fixed issued/published variable name mix-up
+* added support for the payload module
+* added support for xhtml:div elements
+* dc:date now preempts pubDate
+* added better support for the scriptingNews format
+* now correctly strips out wrapper div elements from text constructs
+* fixed issue with some atom links being incorrectly identified as images
+* reorganized some of the helper modules
+* made some portions of url normalization case insensitive
+* fixed issue with filename handling on Windows
 == FeedTools 0.2.22
 * fixed another atom generation error
 == FeedTools 0.2.21
data/README
CHANGED
@@ -1,7 +1,9 @@
-
-caching system.
+FeedTools was designed to be a simple XML feed parser, generator, and
+translator with a built-in caching system.
 
 == Example
+  require 'feed_tools'
+
   slashdot_feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
   slashdot_feed.title
   => "Slashdot"
@@ -11,3 +13,22 @@
   => "http://slashdot.org/"
   slashdot_feed.items.first.find_node("slash:hitparade/text()").to_s
   => "43,37,28,23,11,3,1"
+
+== Installation
+You can install FeedTools as a gem:
+  gem install feedtools
+
+Or you can install it from the tarball or zip packages on the download page
+and then extract it to your vendors directory as you would with any other
+Ruby library.
+
+After installation, you will either need to run in non-caching mode or set
+up a caching mechanism. The database feed cache system currently included
+with FeedTools is the most common caching method. To set up the database
+feed cache, you will first need to create the appropriate database schema.
+Schema files for MySQL, PostgreSQL, and SQLite have been included, but the
+preferred method of creating the schema within the Rails environment is with
+a migration file. A migration file has been supplied with FeedTools and can
+be found in the db directory. Run
+<tt>script/generate migration add_feed_tools_tables</tt> and then copy and
+paste the contents of db/migration.rb into your new migration file.
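For readers following the README changes above, a minimal non-caching quick start might look like the sketch below. It is not part of the gem diff: Feed.open, #title, #link and #items appear in the README example, while the each loop and item.title accessor are assumptions shown only for illustration.

  # Sketch based on the README example above (not from the diff itself).
  require 'feed_tools'

  # In 0.2.23 the feed cache defaults to nil, so no database setup is
  # needed just to try the parser.
  feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
  puts feed.title   # => "Slashdot"
  puts feed.link    # => "http://slashdot.org/"
  feed.items.each do |item|
    puts item.title # assumed accessor, for illustration only
  end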
data/db/migration.rb
ADDED
@@ -0,0 +1,19 @@
+class AddFeedToolsTables < ActiveRecord::Migration
+  def self.up
+    puts "Adding cached feeds table..."
+    create_table :cached_feeds do |t|
+      t.column :href, :string
+      t.column :title, :string
+      t.column :link, :string
+      t.column :feed_data, :text
+      t.column :feed_data_type, :string
+      t.column :http_headers, :text
+      t.column :last_retrieved, :datetime
+    end
+  end
+
+  def self.down
+    puts "Dropping cached feeds table..."
+    drop_table :cached_feeds
+  end
+end
data/db/schema.mysql.sql
CHANGED
@@ -1,7 +1,7 @@
 -- Example MySQL schema
 CREATE TABLE `feeds` (
   `id` int(10) unsigned NOT NULL auto_increment,
-  `url` varchar(255) default NULL,
+  `href` varchar(255) default NULL,
   `title` varchar(255) default NULL,
   `link` varchar(255) default NULL,
   `feed_data` longtext default NULL,
data/db/schema.postgresql.sql
CHANGED
data/db/schema.sqlite.sql
CHANGED
data/lib/feed_tools.rb
CHANGED
@@ -32,7 +32,7 @@ FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] ||
   ENV['RAILS_ENV'] ||
   'development' # :nodoc:
 
-FEED_TOOLS_VERSION = "0.2.22"
+FEED_TOOLS_VERSION = "0.2.23"
 
 FEED_TOOLS_NAMESPACES = {
   "admin" => "http://webns.net/mvcb/",
@@ -40,8 +40,9 @@ FEED_TOOLS_NAMESPACES = {
   "annotate" => "http://purl.org/rss/1.0/modules/annotate/",
   "atom10" => "http://www.w3.org/2005/Atom",
   "atom03" => "http://purl.org/atom/ns#",
-
+  "atom-blog" => "http://purl.org/atom-blog/ns#",
   "audio" => "http://media.tangent.org/rss/1.0/",
+  "bitTorrent" =>"http://www.reallysimplesyndication.com/bitTorrentRssModule",
   "blogChannel" => "http://backend.userland.com/blogChannelModule",
   "blogger" => "http://www.blogger.com/atom/ns#",
   "cc" => "http://web.resource.org/cc/",
@@ -61,20 +62,24 @@ FEED_TOOLS_NAMESPACES = {
   "itunes" => "http://www.itunes.com/dtds/podcast-1.0.dtd",
   "l" => "http://purl.org/rss/1.0/modules/link/",
   "media" => "http://search.yahoo.com/mrss",
+  "p" => "http://purl.org/net/rss1.1/payload#",
   "pingback" => "http://madskills.com/public/xml/rss/module/pingback/",
   "prism" => "http://prismstandard.org/namespaces/1.2/basic/",
   "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
   "rdfs" => "http://www.w3.org/2000/01/rdf-schema#",
   "ref" => "http://purl.org/rss/1.0/modules/reference/",
   "reqv" => "http://purl.org/rss/1.0/modules/richequiv/",
+  "rss09" => "http://my.netscape.com/rdf/simple/0.9/",
   "rss10" => "http://purl.org/rss/1.0/",
+  "rss11" => "http://purl.org/net/rss1.1#",
+  "rss20" => "http://backend.userland.com/rss2",
   "search" => "http://purl.org/rss/1.0/modules/search/",
   "slash" => "http://purl.org/rss/1.0/modules/slash/",
   "soap" => "http://schemas.xmlsoap.org/soap/envelope/",
   "ss" => "http://purl.org/rss/1.0/modules/servicestatus/",
   "str" => "http://hacks.benhammersley.com/rss/streaming/",
   "sub" => "http://purl.org/rss/1.0/modules/subscription/",
-  "
+  "syn" => "http://purl.org/rss/1.0/modules/syndication/",
   "taxo" => "http://purl.org/rss/1.0/modules/taxonomy/",
   "thr" => "http://purl.org/rss/1.0/modules/threading/",
   "ti" => "http://purl.org/rss/1.0/modules/textinput/",
@@ -91,7 +96,7 @@ $:.unshift(File.dirname(__FILE__) + "/feed_tools/vendor")
 begin
   begin
     require 'iconv'
-  rescue
+  rescue Object
     warn("The Iconv library does not appear to be installed properly. " +
       "FeedTools cannot function properly without it.")
     raise
@@ -101,11 +106,15 @@ begin
 
   require_gem('builder', '>= 1.2.4')
 
+  # Preload optional libraries.
   begin
     require 'tidy'
-  rescue
-    # Ignore the error for now.
+  rescue Object
   end
+  begin
+    require 'idn'
+  rescue Object
+  end
 
   require 'feed_tools/vendor/htree'
 
@@ -126,11 +135,25 @@ begin
 
   require_gem('activesupport', '>= 1.1.1')
   require_gem('activerecord', '>= 1.11.1')
-  require_gem('uuidtools', '>= 0.1.2')
 
+  begin
+    require_gem('uuidtools', '>= 0.1.2')
+  rescue Gem::LoadError
+    begin
+      require 'uuidtools'
+    rescue Object
+      raise unless defined? UUID
+    end
+  end
+
   require 'feed_tools/feed'
   require 'feed_tools/feed_item'
+  require 'feed_tools/feed_structures'
   require 'feed_tools/database_feed_cache'
+
+  require 'feed_tools/helpers/html_helper'
+  require 'feed_tools/helpers/xml_helper'
+  require 'feed_tools/helpers/uri_helper'
 rescue LoadError
   # ActiveSupport will very likely mess this up. So drop a warn so that the
   # programmer can figure it out if things get wierd and unpredictable.
@@ -159,19 +182,27 @@ module FeedTools
 
   def FeedTools.load_configurations
     if @configurations.blank?
+      # TODO: Load this from a config file.
       config_hash = {}
       @configurations = {
-        :feed_cache =>
+        :feed_cache => nil,
+        :proxy_address => nil,
+        :proxy_port => nil,
         :user_agent => "FeedTools/#{FEED_TOOLS_VERSION} " +
           "+http://www.sporkmonger.com/projects/feedtools/",
         :generator_name => "FeedTools/#{FEED_TOOLS_VERSION}",
         :generator_href => "http://www.sporkmonger.com/projects/feedtools/",
-        :tidy_enabled =>
+        :tidy_enabled => true,
         :tidy_options => {},
+        :idn_enabled => true,
+        :sanitization_enabled => true,
         :sanitize_with_nofollow => true,
+        :always_strip_wrapper_elements => true,
         :timestamp_estimation_enabled => true,
         :url_normalization_enabled => true,
+        :entry_sorting_property => "time",
         :strip_comment_count => false,
+        :tab_spaces => 2,
        :max_ttl => 3.days.to_s,
        :output_encoding => "utf-8"
      }.merge(config_hash)
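The defaults above can be overridden before any feeds are fetched. A hedged sketch follows: the key names come straight from the hunk above, but writing into FeedTools.configurations this way is an assumption, since this diff only shows the hash being read.

  require 'feed_tools'

  # Route retrieval through an HTTP proxy (hypothetical host and port).
  FeedTools.configurations[:proxy_address] = "proxy.example.com"
  FeedTools.configurations[:proxy_port] = 8080

  # Sort entries by title instead of the default "time" property.
  FeedTools.configurations[:entry_sorting_property] = "title"

  # Skip internationalized domain name handling if libidn is unavailable.
  FeedTools.configurations[:idn_enabled] = false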
@@ -236,6 +267,9 @@ module FeedTools
       cache_class = eval(class_name)
       if cache_class.kind_of?(Class)
         @feed_cache = cache_class
+        if @feed_cache.respond_to? :initialize_cache
+          @feed_cache.initialize_cache
+        end
         return cache_class
       else
         return nil
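The new initialize_cache hook above is duck-typed: the resolved cache class is only asked to set itself up if it responds to that method. Below is a stub cache that exercises just this hook; the class name and its in-memory storage are hypothetical, and the rest of the cache contract FeedTools expects (lookup and save methods used elsewhere in the library) is not shown in this hunk.

  # Hypothetical stand-in cache class, illustrating only the hook above.
  class InMemoryFeedCache
    # Called once by FeedTools.feed_cache because the class responds
    # to :initialize_cache.
    def self.initialize_cache
      @entries = {}
    end
  end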
@@ -258,376 +292,7 @@ module FeedTools
     rescue
       return false
     end
-  end
-
-  # Returns true if the html tidy module can be used.
-  #
-  # Obviously, you need the tidy gem installed in order to run with html
-  # tidy features turned on.
-  #
-  # This method does a fairly complicated, and probably unnecessarily
-  # desperate search for the libtidy library. If you want this thing to
-  # execute fast, the best thing to do is to set Tidy.path ahead of time.
-  # If Tidy.path is set, this method doesn't do much. If it's not set,
-  # it will do it's darnedest to find the libtidy library. If you set
-  # the LIBTIDYPATH environment variable to the libtidy library, it should
-  # be able to find it.
-  #
-  # Once the library is located, this method will run much faster.
-  def FeedTools.tidy_enabled?
-    # This is an override variable to keep tidy from being used even if it
-    # is available.
-    if FeedTools.configurations[:tidy_enabled] == false
-      return false
-    end
-    if @tidy_enabled.nil? || @tidy_enabled == false
-      @tidy_enabled = false
-      begin
-        require 'tidy'
-        if Tidy.path.nil?
-          # *Shrug*, just brute force it, I guess. There's a lot of places
-          # this thing might be hiding in, depending on platform and general
-          # sanity of the person who installed the thing. Most of these are
-          # probably unlikely, but it's not like checking unlikely locations
-          # hurts. Much. Especially if you actually find it.
-          libtidy_locations = [
-            '/usr/local/lib/libtidy.dylib',
-            '/opt/local/lib/libtidy.dylib',
-            '/usr/lib/libtidy.dylib',
-            '/usr/local/lib/tidylib.dylib',
-            '/opt/local/lib/tidylib.dylib',
-            '/usr/lib/tidylib.dylib',
-            '/usr/local/lib/tidy.dylib',
-            '/opt/local/lib/tidy.dylib',
-            '/usr/lib/tidy.dylib',
-            '/usr/local/lib/libtidy.so',
-            '/opt/local/lib/libtidy.so',
-            '/usr/lib/libtidy.so',
-            '/usr/local/lib/tidylib.so',
-            '/opt/local/lib/tidylib.so',
-            '/usr/lib/tidylib.so',
-            '/usr/local/lib/tidy.so',
-            '/opt/local/lib/tidy.so',
-            '/usr/lib/tidy.so',
-            'C:\Program Files\Tidy\tidy.dll',
-            'C:\Tidy\tidy.dll',
-            'C:\Ruby\bin\tidy.dll',
-            'C:\Ruby\tidy.dll',
-            '/usr/local/lib',
-            '/opt/local/lib',
-            '/usr/lib'
-          ]
-          # We just made this thing up, but if someone sets it, we'll
-          # go ahead and check it
-          unless ENV['LIBTIDYPATH'].nil?
-            libtidy_locations =
-              libtidy_locations.reverse.push(ENV['LIBTIDYPATH'])
-          end
-          for path in libtidy_locations
-            if File.exists? path
-              if File.ftype(path) == "file"
-                Tidy.path = path
-                @tidy_enabled = true
-                break
-              elsif File.ftype(path) == "directory"
-                # Ok, now perhaps we're getting a bit more desperate
-                lib_paths =
-                  `find #{path} -name '*tidy*' | grep '\\.\\(so\\|dylib\\)$'`
-                # If there's more than one, grab the first one and
-                # hope for the best, and if it doesn't work, then blame the
-                # user for not specifying more accurately.
-                tidy_path = lib_paths.split("\n").first
-                unless tidy_path.nil?
-                  Tidy.path = tidy_path
-                  @tidy_enabled = true
-                  break
-                end
-              end
-            end
-          end
-          # Still couldn't find it.
-          unless @tidy_enabled
-            @tidy_enabled = false
-          end
-        else
-          @tidy_enabled = true
-        end
-      rescue LoadError
-        # Tidy not installed, disable features that rely on tidy.
-        @tidy_enabled = false
-      end
-    end
-    return @tidy_enabled
-  end
-
-  # Attempts to ensures that the passed url is valid and sane. Accepts very, very ugly urls
-  # and makes every effort to figure out what it was supposed to be. Also translates from
-  # the feed: and rss: pseudo-protocols to the http: protocol.
-  def FeedTools.normalize_url(url)
-    if url.nil? || url == ""
-      return nil
-    end
-    normalized_url = url.strip
-
-    # if a url begins with the '/' character, it only makes sense that they
-    # meant to be using a file:// url. Fix it for them.
-    if normalized_url.length > 0 && normalized_url[0..0] == "/"
-      normalized_url = "file://" + normalized_url
-    end
-
-    # if a url begins with a drive letter followed by a colon, we're looking at
-    # a file:// url. Fix it for them.
-    if normalized_url.length > 0 &&
-        normalized_url.scan(/^[a-zA-Z]:[\\\/]/).size > 0
-      normalized_url = "file:///" + normalized_url
-    end
-
-    # if a url begins with javascript:, it's quite possibly an attempt at
-    # doing something malicious. Let's keep that from getting anywhere,
-    # shall we?
-    if (normalized_url.downcase =~ /javascript:/) != nil
-      return "#"
-    end
-
-    # deal with all of the many ugly possibilities involved in the rss:
-    # and feed: pseudo-protocols (incidentally, whose crazy idea was this
-    # mess?)
-    normalized_url.gsub!(/^http:\/*(feed:\/*)?/, "http://")
-    normalized_url.gsub!(/^http:\/*(rss:\/*)?/, "http://")
-    normalized_url.gsub!(/^feed:\/*(http:\/*)?/, "http://")
-    normalized_url.gsub!(/^rss:\/*(http:\/*)?/, "http://")
-    normalized_url.gsub!(/^file:\/*/, "file:///")
-    normalized_url.gsub!(/^https:\/*/, "https://")
-    # fix (very) bad urls (usually of the user-entered sort)
-    normalized_url.gsub!(/^http:\/*(http:\/*)*/, "http://")
-
-    if (normalized_url =~ /^file:/) == 0
-      # Adjust windows-style urls
-      normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/, 'file:///\1:')
-      normalized_url.gsub!(/\\/, '/')
-    else
-      if (normalized_url =~ /https?:\/\//) == nil
-        normalized_url = "http://" + normalized_url
-      end
-      if normalized_url == "http://"
-        return nil
-      end
-      begin
-        feed_uri = URI.parse(normalized_url)
-        if feed_uri.scheme == nil
-          feed_uri.scheme = "http"
-        end
-        if feed_uri.path == nil || feed_uri.path == ""
-          feed_uri.path = "/"
-        end
-        if (feed_uri.path =~ /^[\/]+/) == 0
-          feed_uri.path.gsub!(/^[\/]+/, "/")
-        end
-        feed_uri.host.downcase!
-        normalized_url = feed_uri.to_s
-      rescue URI::InvalidURIError
-      end
-    end
-
-    # We can't do a proper set of escaping, so this will
-    # have to do.
-    normalized_url.gsub!(/%20/, " ")
-    normalized_url.gsub!(/ /, "%20")
-
-    return normalized_url
-  end
-
-  # Converts a url into a tag uri
-  def FeedTools.build_tag_uri(url, date)
-    unless url.kind_of? String
-      raise ArgumentError, "Expected String, got #{url.class.name}"
-    end
-    unless date.kind_of? Time
-      raise ArgumentError, "Expected Time, got #{date.class.name}"
-    end
-    tag_uri = normalize_url(url)
-    unless FeedTools.is_uri?(tag_uri)
-      raise ArgumentError, "Must supply a valid URL."
-    end
-    host = URI.parse(tag_uri).host
-    tag_uri.gsub!(/^(http|ftp|file):\/*/, "")
-    tag_uri.gsub!(/#/, "/")
-    tag_uri = "tag:#{host},#{date.strftime('%Y-%m-%d')}:" +
-      "#{tag_uri[(tag_uri.index(host) + host.size)..-1]}"
-    return tag_uri
-  end
-
-  # Converts a url into a urn:uuid: uri
-  def FeedTools.build_urn_uri(url)
-    unless url.kind_of? String
-      raise ArgumentError, "Expected String, got #{url.class.name}"
-    end
-    normalized_url = normalize_url(url)
-    require 'uuidtools'
-    return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri_string
-  end
-
-  # Returns true if the parameter appears to be a valid uri
-  def FeedTools.is_uri?(url)
-    return false if url.nil?
-    begin
-      uri = URI.parse(url)
-      if uri.scheme.nil? || uri.scheme == ""
-        return false
-      end
-    rescue URI::InvalidURIError
-      return false
-    end
-    return true
-  end
-
-  # Escapes all html entities
-  def FeedTools.escape_entities(html)
-    return nil if html.nil?
-    escaped_html = CGI.escapeHTML(html)
-    escaped_html.gsub!(/'/, "&apos;")
-    escaped_html.gsub!(/"/, "&quot;")
-    return escaped_html
-  end
-
-  # Unescapes all html entities
-  def FeedTools.unescape_entities(html)
-    return nil if html.nil?
-    unescaped_html = html
-    unescaped_html.gsub!(/&/, "&")
-    unescaped_html.gsub!(/&/, "&")
-    unescaped_html = CGI.unescapeHTML(unescaped_html)
-    unescaped_html.gsub!(/&apos;/, "'")
-    unescaped_html.gsub!(/&quot;/, "\"")
-    return unescaped_html
-  end
-
-  # Removes all html tags from the html formatted text.
-  def FeedTools.strip_html(html)
-    return nil if html.nil?
-    # TODO: do this properly
-    # ======================
-    stripped_html = html.gsub(/<\/?[^>]+>/, "")
-    return stripped_html
-  end
-
-  # Tidys up the html
-  def FeedTools.tidy_html(html, options = {})
-    return nil if html.nil?
-    if FeedTools.tidy_enabled?
-      is_fragment = true
-      html.gsub!(/<!'/, "&lt;!'")
-      if (html.strip =~ /<html>(.|\n)*<body>/) != nil ||
-          (html.strip =~ /<\/body>(.|\n)*<\/html>$/) != nil
-        is_fragment = false
-      end
-      if (html.strip =~ /<\?xml(.|\n)*\?>/) != nil
-        is_fragment = false
-      end
-      tidy_html = Tidy.open(:show_warnings=>false) do |tidy|
-        tidy.options.output_xml = true
-        tidy.options.numeric_entities = true
-        tidy.options.markup = true
-        tidy.options.indent = false
-        tidy.options.wrap = 0
-        tidy.options.logical_emphasis = true
-        # TODO: Make this match the actual encoding of the feed
-        # =====================================================
-        tidy.options.input_encoding = "utf8"
-        tidy.options.output_encoding = "ascii"
-        tidy.options.ascii_chars = false
-        tidy.options.doctype = "omit"
-        xml = tidy.clean(html)
-        xml
-      end
-      if is_fragment
-        # Tidy sticks <html>...<body>[our html]</body>...</html> in.
-        # We don't want this.
-        tidy_html.strip!
-        tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
-        tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
-        tidy_html.strip!
-      end
-      tidy_html.gsub!(/&/, "&")
-      tidy_html.gsub!(/&/, "&")
-      tidy_html.gsub!(/\320\262\320\202\342\204\242/, "\342\200\231")
-
-    else
-      tidy_html = html
-    end
-    if tidy_html.blank? && !html.blank?
-      tidy_html = html.strip
-    end
-    return tidy_html
-  end
-
-  # Removes all dangerous html tags from the html formatted text.
-  # If mode is set to :escape, dangerous and unknown elements will
-  # be escaped. If mode is set to :strip, dangerous and unknown
-  # elements and all children will be removed entirely.
-  # Dangerous or unknown attributes are always removed.
-  def FeedTools.sanitize_html(html, mode=:strip)
-    return nil if html.nil?
-
-    # Lists borrowed from Mark Pilgrim's feedparser
-    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
-      'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
-      'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
-      'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4',
-      'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
-      'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's',
-      'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
-      'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt',
-      'u', 'ul', 'var']
-
-    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
-      'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
-      'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
-      'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
-      'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
-      'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
-      'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
-      'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
-      'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
-      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
-      'type', 'usemap', 'valign', 'value', 'vspace', 'width']
-
-    # Replace with appropriate named entities
-    html.gsub!(/&/, "&")
-    html.gsub!(/&/, "&")
-    html.gsub!(/<!'/, "&lt;!'")
-
-    # Hackity hack. But it works, and it seems plenty fast enough.
-    html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
-
-    sanitize_node = lambda do |html_node|
-      if html_node.respond_to? :children
-        for child in html_node.children
-          if child.kind_of? REXML::Element
-            unless acceptable_elements.include? child.name.downcase
-              if mode == :strip
-                html_node.delete_element(child)
-              else
-                new_child = REXML::Text.new(CGI.escapeHTML(child.to_s))
-                html_node.insert_after(child, new_child)
-                html_node.delete_element(child)
-              end
-            end
-            for attribute in child.attributes.keys
-              unless acceptable_attributes.include? attribute.downcase
-                child.delete_attribute(attribute)
-              end
-            end
-          end
-          sanitize_node.call(child)
-        end
-      end
-      html_node
-    end
-    sanitize_node.call(html_doc.root)
-    html = html_doc.root.inner_xml
-    return html
-  end
+  end
 
   # Creates a merged "planet" feed from a set of urls.
   #
@@ -637,7 +302,7 @@ module FeedTools
   # in conjunction with the DatabaseFeedCache as it will
   # open multiple connections to the database.
   def FeedTools.build_merged_feed(url_array, options = {})
-    validate_options([ :multi_threaded ],
+    FeedTools::GenericHelper.validate_options([ :multi_threaded ],
       options.keys)
     options = { :multi_threaded => false }.merge(options)
     return nil if url_array.nil?
@@ -930,19 +595,37 @@ module REXML # :nodoc:
         result << child.to_s
       end
     end
-    return result
+    return result.strip
   end
+  else
+    warn("inner_xml method already exists.")
  end
 
-
-
-
+  def base_uri # :nodoc:
+    begin
+      base_attribute = FeedTools::XmlHelper.try_xpaths(self, [
+        '@xml:base'
+      ])
+      if parent == nil || parent.kind_of?(REXML::Document)
+        return nil if base_attribute == nil
+        return base_attribute.value
+      end
+      if base_attribute != nil && parent == nil
+        return base_attribute.value
+      elsif parent != nil && base_attribute == nil
         return parent.base_uri
-      elsif parent
-
-
-
+      elsif parent != nil && base_attribute != nil
+        parent_base_uri = parent.base_uri
+        if parent_base_uri != nil
+          uri = URI.parse(parent_base_uri)
+          return (uri + base_attribute.value).to_s
+        else
+          return base_attribute.value
+        end
      end
+      return nil
+    rescue
+      return nil
    end
  end
 end