RubyGems - openlogic-feed-normalizer - Versions diffs - 1.5.3 - Mend

openlogic-feed-normalizer 1.5.3

Files changed (20) hide show

data/.gemtest +0 -0
data/History.txt +62 -0
data/License.txt +27 -0
data/Manifest.txt +18 -0
data/README.txt +63 -0
data/Rakefile +30 -0
data/lib/feed-normalizer.rb +149 -0
data/lib/html-cleaner.rb +181 -0
data/lib/parsers/rss.rb +117 -0
data/lib/parsers/simple-rss.rb +142 -0
data/lib/structures.rb +262 -0
data/test/data/atom03.xml +128 -0
data/test/data/atom10.xml +114 -0
data/test/data/rdf10.xml +1498 -0
data/test/data/rss20.xml +65 -0
data/test/data/rss20diff.xml +59 -0
data/test/data/rss20diff_short.xml +51 -0
data/test/test_feednormalizer.rb +277 -0
data/test/test_htmlcleaner.rb +156 -0
metadata +123 -0

data/lib/parsers/rss.rb ADDED Viewed

@@ -0,0 +1,117 @@
+require 'rss'
# As of rss 0.1.6 only the RDF item class mixes in RSS::ContentModel by
# default. Add it to the RSS 2.0 item class when content_encoded is missing.
RSS::Rss::Channel::Item.send(:include, RSS::ContentModel) unless RSS::Rss::Channel::Item.new.respond_to?(:content_encoded)
# Give enclosures value equality: two enclosures match when every instance
# variable set on the receiver has an equal counterpart on the other object.
class RSS::Rss::Channel::Item::Enclosure
  def eql?(enc)
    instance_variables.all? { |name| instance_variable_get(name) == enc.instance_variable_get(name) }
  end

  alias_method :==, :eql?
end
+module FeedNormalizer
+  class RubyRssParser < Parser
+    def self.parser
+      RSS::Parser
+    end
+    def self.parse(xml, loose)
+      begin
+        rss = parser.parse(xml)
+      rescue Exception => e
+        #puts "Parser #{parser} failed because #{e.message.gsub("\n",', ')}"
+        return nil
+      end
+      # check for channel to make sure we're only dealing with RSS.
+      rss && rss.respond_to?(:channel) ? package(rss, loose) : nil
+    end
+    # Fairly high priority; a fast and strict parser.
+    def self.priority
+      100
+    end
+    protected
+    def self.package(rss, loose)
+      feed = Feed.new(self)
+      # channel elements
+      feed_mapping = {
+        :generator => :generator,
+        :title => :title,
+        :urls => :link,
+        :description => :description,
+        :copyright => :copyright,
+        :authors => :managingEditor,
+        :last_updated => [:lastBuildDate, :pubDate, :dc_date],
+        :id => :guid,
+        :ttl => :ttl
+      }
+      # make two passes, to catch all possible root elements
+      map_functions!(feed_mapping, rss, feed)
+      map_functions!(feed_mapping, rss.channel, feed)
+      # custom channel elements
+      feed.image = rss.image ? rss.image.url : nil
+      feed.skip_hours = skip(rss, :skipHours)
+      feed.skip_days = skip(rss, :skipDays)
+      # item elements
+      item_mapping = {
+        :date_published => [:pubDate, :dc_date],
+        :urls => :link,
+        :enclosures => :enclosure,
+        :description => :description,
+        :content => [:content_encoded, :description],
+        :title => :title,
+        :authors => [:author, :dc_creator],
+        :last_updated => [:pubDate, :dc_date] # This is effectively an alias for date_published for this parser.
+      }
+      rss.items.each do |rss_item|
+        unless rss_item.title.nil? && rss_item.description.nil? # some feeds return empty items
+          feed_entry = Entry.new
+          map_functions!(item_mapping, rss_item, feed_entry)
+          # custom item elements
+          feed_entry.id = rss_item.guid.content if rss_item.respond_to?(:guid) && rss_item.guid
+          # fall back to link for ID
+          feed_entry.id ||= rss_item.link if rss_item.respond_to?(:link) && rss_item.link
+          feed_entry.copyright = rss.copyright if rss_item.respond_to? :copyright
+          feed_entry.categories = loose ?
+                                    rss_item.categories.collect{|c|c.content} :
+                                    [rss_item.categories.first.content] rescue []
+          feed.entries << feed_entry
+        end
+      end
+      feed
+    end
+    def self.skip(parser, attribute)
+      case attribute
+        when :skipHours then attributes = :hours
+        when :skipDays then attributes = :days
+      end
+      channel = parser.channel
+      return nil unless channel.respond_to?(attribute) && a = channel.send(attribute)
+      a.send(attributes).collect{|e| e.content}
+    end
+  end
+end

data/lib/parsers/simple-rss.rb ADDED Viewed

@@ -0,0 +1,142 @@
+require 'simple-rss'
+# Monkey patches for outstanding issues logged in the simple-rss project.
+#   * Add support for issued time field:
+#     http://rubyforge.org/tracker/index.php?func=detail&aid=13980&group_id=893&atid=3517
+#   * The '+' symbol is lost when escaping fields.
+#     http://rubyforge.org/tracker/index.php?func=detail&aid=10852&group_id=893&atid=3517
+#
class SimpleRSS
  @@item_tags << :issued

  undef clean_content
  # Converts the raw text captured for +tag+ into its final value: a Time for
  # date-like tags (falling back to the unescaped text when it cannot be
  # parsed), tag-stripped text for people / skip fields, and for everything
  # else either the href attribute (when there is no content) or the
  # unescaped content.
  def clean_content(tag, attrs, content)
    content = content.to_s
    case tag
    when :pubDate, :lastBuildDate, :published, :updated, :expirationDate, :modified, :'dc:date', :issued
      begin
        Time.parse(content)
      rescue StandardError
        unescape(content)
      end
    when :author, :contributor, :skipHours, :skipDays
      unescape(content.gsub(/<.*?>/,''))
    else
      if content.empty? && "#{attrs} " =~ /href=['"]?([^'"]*)['" ]/mi
        $1.strip
      else
        unescape(content)
      end
    end
  end

  undef unescape
  # Returns +s+ as HTML: CDATA wrappers are removed, text that already
  # contains angle brackets is returned untouched, and anything else is
  # passed through HtmlCleaner.unescapeHTML.
  def unescape(s)
    # Raw HTML is inside the CDATA, so just remove the CDATA wrapper.
    return s.gsub(/(<!\[CDATA\[|\]\]>)/,'') if s =~ /^\s*(<!\[CDATA\[|\]\]>)/
    # Already looks like HTML.
    return s if s =~ /[<>]/
    # Make it HTML.
    FeedNormalizer::HtmlCleaner.unescapeHTML(s)
  end
end
+module FeedNormalizer
+  # The SimpleRSS parser can handle both RSS and Atom feeds.
+  class SimpleRssParser < Parser
+    def self.parser
+      SimpleRSS
+    end
+    def self.parse(xml, loose)
+      begin
+        atomrss = parser.parse(xml)
+      rescue Exception => e
+        #puts "Parser #{parser} failed because #{e.message.gsub("\n",', ')}"
+        return nil
+      end
+      package(atomrss)
+    end
+    # Fairly low priority; a slower, liberal parser.
+    def self.priority
+      900
+    end
+    protected
+    def self.package(atomrss)
+      feed = Feed.new(self)
+      # root elements
+      feed_mapping = {
+        :generator => :generator,
+        :title => :title,
+        :last_updated => [:updated, :lastBuildDate, :pubDate, :dc_date],
+        :copyright => [:copyright, :rights],
+        :authors => [:author, :webMaster, :managingEditor, :contributor],
+        :urls => [:'link+alternate', :link],
+        :description => [:description, :subtitle],
+        :ttl => :ttl
+      }
+      map_functions!(feed_mapping, atomrss, feed)
+      # custom channel elements
+      feed.id = feed_id(atomrss)
+      feed.image = image(atomrss)
+      # entry elements
+      entry_mapping = {
+        :date_published => [:pubDate, :published, :dc_date, :issued],
+        :urls => [:'link+alternate', :link],
+        :enclosures => :enclosure,
+        :description => [:description, :summary],
+        :content => [:content, :content_encoded, :description],
+        :title => :title,
+        :authors => [:author, :contributor, :dc_creator],
+        :categories => :category,
+        :last_updated => [:updated, :dc_date, :pubDate]
+      }
+      atomrss.entries.each do |atomrss_entry|
+        unless atomrss_entry.title.nil? && atomrss_entry.description.nil? # some feeds return empty items
+          feed_entry = Entry.new
+          map_functions!(entry_mapping, atomrss_entry, feed_entry)
+          # custom entry elements
+          feed_entry.id = atomrss_entry.guid || atomrss_entry[:id] # entries are a Hash..
+          # fall back to link for ID
+          feed_entry.id ||= atomrss_entry.link if atomrss_entry.respond_to?(:link) && atomrss_entry.link
+          feed_entry.copyright = atomrss_entry.copyright || (atomrss.respond_to?(:copyright) ? atomrss.copyright : nil)
+          feed.entries << feed_entry
+        end
+      end
+      feed
+    end
+    def self.image(parser)
+      if parser.respond_to?(:image) && parser.image
+        if parser.image =~ /<url>/ # RSS image contains an <url> spec
+          parser.image.scan(/<url>(.*?)<\/url>/).to_s
+        else
+          parser.image # Atom contains just the url
+        end
+      elsif parser.respond_to?(:logo) && parser.logo
+        parser.logo
+      end
+    end
+    def self.feed_id(parser)
+      overridden_value(parser, :id) || ("#{parser.link}" if parser.respond_to?(:link))
+    end
+    # gets the value returned from the method if it overriden, otherwise nil.
+    def self.overridden_value(object, method)
+      object.class.public_instance_methods(false).include? method
+    end
+  end
+end

data/lib/structures.rb ADDED Viewed

@@ -0,0 +1,262 @@
+module FeedNormalizer
+  module Singular
+    # If the method being called is a singular (in this simple case, does not
+    # end with an 's'), then it calls the plural method, and calls the first
+    # element. We're assuming that plural methods provide an array.
+    #
+    # Example:
+    # Object contains an array called 'alphas', which looks like [:a, :b, :c].
+    # Call object.alpha and :a is returned.
+    def method_missing(name, *args)
+      plural_name = :"#{name}s"
+      return self.send(plural_name).first if respond_to?(plural_name)
+      super(name, *args)
+    end
+    def respond_to?(x, y=false)
+      self.class::ELEMENTS.include?(x) || self.class::ELEMENTS.include?(:"#{x}s") || super(x, y)
+    end
+  end
+  module ElementEquality
+    def eql?(other)
+      self == (other)
+    end
+    def ==(other)
+      other.equal?(self) ||
+        (other.instance_of?(self.class) &&
+          self.class::ELEMENTS.all?{ |el| self.send(el) == other.send(el)} )
+    end
+    # Returns the difference between two Feed instances as a hash.
+    # Any top-level differences in the Feed object as presented as:
+    #
+    #  { :title => [content, other_content] }
+    #
+    # For differences at the items level, an array of hashes shows the diffs
+    # on a per-entry basis. Only entries that differ will contain a hash:
+    #
+    #  { :items => [
+    #     {:title => ["An article tile", "A new article title"]},
+    #     {:title => ["one title", "a different title"]} ]}
+    #
+    # If the number of items in each feed are different, then the count of each
+    # is provided instead:
+    #
+    #  { :items => [4,5] }
+    #
+    # This method can also be useful for human-readable feed comparison if
+    # its output is dumped to YAML.
+    def diff(other, elements = self.class::ELEMENTS)
+      diffs = {}
+      elements.each do |element|
+        if other.respond_to?(element)
+          self_value = self.send(element)
+          other_value = other.send(element)
+          next if self_value == other_value
+          diffs[element] = if other_value.respond_to?(:diff)
+            self_value.diff(other_value)
+          elsif other_value.is_a?(Enumerable) && other_value.all?{|v| v.respond_to?(:diff)}
+            if self_value.size != other_value.size
+              [self_value.size, other_value.size]
+            else
+              enum_diffs = []
+              self_value.each_with_index do |val, index|
+                enum_diffs << val.diff(other_value[index], val.class::ELEMENTS)
+              end
+              enum_diffs.reject{|h| h.empty?}
+            end
+          else
+            [other_value, self_value] unless other_value == self_value
+          end
+        end
+      end
+      diffs
+    end
+  end
+  module ElementCleaner
+    # Recursively cleans all elements in place.
+    #
+    # Only allow tags in whitelist. Always parse the html with a parser and delete
+    # all tags that arent on the list.
+    #
+    # For feed elements that can contain HTML:
+    # - feed.(title|description)
+    # - feed.entries[n].(title|description|content)
+    #
+    def clean!
+      self.class::SIMPLE_ELEMENTS.each do |element|
+        val = self.send(element)
+        send("#{element}=", (val.is_a?(Array) ?
+          val.collect{|v| HtmlCleaner.flatten(v.to_s)} : HtmlCleaner.flatten(val.to_s)))
+      end
+      self.class::HTML_ELEMENTS.each do |element|
+        send("#{element}=", HtmlCleaner.clean(self.send(element).to_s))
+      end
+      self.class::BLENDED_ELEMENTS.each do |element|
+        self.send(element).collect{|v| v.clean!}
+      end
+    end
+  end
+  module TimeFix
+    # Reparse any Time instances, due to RSS::Parser's redefinition of
+    # certain aspects of the Time class that creates unexpected behaviour
+    # when extending the Time class, as some common third party libraries do.
+    # See http://code.google.com/p/feed-normalizer/issues/detail?id=13.
+    def reparse(obj)
+      @parsed ||= false
+      if obj.is_a?(String)
+        @parsed = true
+        begin
+          Time.at(obj) rescue Time.rfc2822(obj) rescue Time.parse(obj)
+        rescue
+          @parsed = false
+          obj
+        end
+      else
+        return obj if @parsed
+        if obj.is_a?(Time)
+          @parsed = true
+          Time.at(obj) rescue obj
+        end
+      end
+    end
+  end
+  module RewriteRelativeLinks
+    def rewrite_relative_links(text, url)
+      if host = url_host(url)
+        text.to_s.gsub(/(href|src)=('|")\//, '\1=\2http://' + host + '/')
+      else
+        text
+      end
+    end
+    private
+      def url_host(url)
+        URI.parse(url).host rescue nil
+      end
+  end
+  # Represents a feed item entry.
+  # Available fields are:
+  #  * content
+  #  * description
+  #  * title
+  #  * date_published
+  #  * urls / url
+  #  * id
+  #  * authors / author
+  #  * copyright
+  #  * categories
+  class Entry
+    include Singular, ElementEquality, ElementCleaner, TimeFix, RewriteRelativeLinks
+    HTML_ELEMENTS = [:content, :description, :title]
+    SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories, :last_updated, :enclosures]
+    BLENDED_ELEMENTS = []
+    ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
+    attr_accessor(*ELEMENTS)
+    def initialize
+      @urls = []
+      @authors = []
+      @categories = []
+      @enclosures = []
+      @date_published, @content, @last_updated = nil
+    end
+    undef date_published
+    def date_published
+      @date_published = reparse(@date_published)
+    end
+    undef last_updated
+    def last_updated
+      @last_updated = reparse(@last_updated)
+    end
+    undef content
+    def content
+      @content = rewrite_relative_links(@content, url)
+    end
+  end
+  # Represents the root element of a feed.
+  # Available fields are:
+  #  * title
+  #  * description
+  #  * id
+  #  * last_updated
+  #  * copyright
+  #  * authors / author
+  #  * urls / url
+  #  * image
+  #  * generator
+  #  * items / channel
+  class Feed
+    include Singular, ElementEquality, ElementCleaner, TimeFix
+    # Elements that can contain HTML fragments.
+    HTML_ELEMENTS = [:title, :description]
+    # Elements that contain 'plain' Strings, with HTML escaped.
+    SIMPLE_ELEMENTS = [:id, :last_updated, :copyright, :authors, :urls, :image, :generator, :ttl, :skip_hours, :skip_days]
+    # Elements that contain both HTML and escaped HTML.
+    BLENDED_ELEMENTS = [:items]
+    ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
+    attr_accessor(*ELEMENTS)
+    attr_accessor(:parser)
+    alias :entries :items
+    def initialize(wrapper)
+      # set up associations (i.e. arrays where needed)
+      @urls = []
+      @authors = []
+      @skip_hours = []
+      @skip_days = []
+      @items = []
+      @parser = wrapper.parser.to_s
+      @last_updated = nil
+    end
+    undef last_updated
+    def last_updated
+      @last_updated = reparse(@last_updated)
+    end
+    def channel() self end
+  end
+end