RubyGems - feed-normalizer - Versions diffs - 1.5.1 → 1.5.2 - Mend

feed-normalizer 1.5.1 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

data/History.txt +48 -48
data/License.txt +27 -27
data/Manifest.txt +18 -19
data/README.txt +63 -63
data/Rakefile +29 -25
data/lib/feed-normalizer.rb +149 -149
data/lib/html-cleaner.rb +181 -190
data/lib/parsers/rss.rb +110 -95
data/lib/parsers/simple-rss.rb +138 -137
data/lib/structures.rb +245 -244
data/test/data/atom03.xml +128 -127
data/test/data/atom10.xml +114 -112
data/test/data/rdf10.xml +1498 -1498
data/test/data/rss20.xml +64 -63
data/test/data/rss20diff.xml +59 -59
data/test/data/rss20diff_short.xml +51 -51
data/test/test_feednormalizer.rb +265 -267
data/test/test_htmlcleaner.rb +156 -155
metadata +99 -63
data/test/test_all.rb +0 -6

data/lib/parsers/simple-rss.rb CHANGED

@@ -1,137 +1,138 @@
-require 'simple-rss'
-# Monkey patches for outstanding issues logged in the simple-rss project.
-#   * Add support for issued time field:
-#     http://rubyforge.org/tracker/index.php?func=detail&aid=13980&group_id=893&atid=3517
-#   * The '+' symbol is lost when escaping fields.
-#     http://rubyforge.org/tracker/index.php?func=detail&aid=10852&group_id=893&atid=3517
-#
-class SimpleRSS
-  @@item_tags << :issued
-  undef clean_content
-  def clean_content(tag, attrs, content)
-    content = content.to_s
-    case tag
-      when :pubDate, :lastBuildDate, :published, :updated, :expirationDate, :modified, :'dc:date', :issued
-        Time.parse(content) rescue unescape(content)
-      when :author, :contributor, :skipHours, :skipDays
-        unescape(content.gsub(/<.*?>/,''))
-      else
-        content.empty? && "#{attrs} " =~ /href=['"]?([^'"]*)['" ]/mi ? $1.strip : unescape(content)
-    end
-  end
-  undef unescape
-  def unescape(s)
-   if s =~ /^(<!\[CDATA\[|\]\]>)/
-     # Raw HTML is inside the CDATA, so just remove the CDATA wrapper.
-     s.gsub(/(<!\[CDATA\[|\]\]>)/,'').strip
-   elsif s =~ /[<>]/
-     # Already looks like HTML.
-     s
-   else
-     # Make it HTML.
-     FeedNormalizer::HtmlCleaner.unescapeHTML(s)
-   end
- end
-end
-module FeedNormalizer
-  # The SimpleRSS parser can handle both RSS and Atom feeds.
-  class SimpleRssParser < Parser
-    def self.parser
-      SimpleRSS
-    end
-    def self.parse(xml, loose)
-      begin
-        atomrss = parser.parse(xml)
-      rescue Exception => e
-        #puts "Parser #{parser} failed because #{e.message.gsub("\n",', ')}"
-        return nil
-      end
-      package(atomrss)
-    end
-    # Fairly low priority; a slower, liberal parser.
-    def self.priority
-      900
-    end
-    protected
-    def self.package(atomrss)
-      feed = Feed.new(self)
-      # root elements
-      feed_mapping = {
-        :generator => :generator,
-        :title => :title,
-        :last_updated => [:updated, :lastBuildDate, :pubDate, :dc_date],
-        :copyright => [:copyright, :rights],
-        :authors => [:author, :webMaster, :managingEditor, :contributor],
-        :urls => :link,
-        :description => [:description, :subtitle],
-        :ttl => :ttl
-      }
-      map_functions!(feed_mapping, atomrss, feed)
-      # custom channel elements
-      feed.id = feed_id(atomrss)
-      feed.image = image(atomrss)
-      # entry elements
-      entry_mapping = {
-        :date_published => [:pubDate, :published, :dc_date, :issued],
-        :urls => :link,
-        :description => [:description, :summary],
-        :content => [:content, :content_encoded, :description],
-        :title => :title,
-        :authors => [:author, :contributor, :dc_creator],
-        :categories => :category,
-        :last_updated => [:updated, :dc_date, :pubDate]
-      }
-      atomrss.entries.each do |atomrss_entry|
-        feed_entry = Entry.new
-        map_functions!(entry_mapping, atomrss_entry, feed_entry)
-        # custom entry elements
-        feed_entry.id = atomrss_entry.guid || atomrss_entry[:id] # entries are a Hash..
-        feed_entry.copyright = atomrss_entry.copyright || (atomrss.respond_to?(:copyright) ? atomrss.copyright : nil)
-        feed.entries << feed_entry
-      end
-      feed
-    end
-    def self.image(parser)
-      if parser.respond_to?(:image) && parser.image
-        if parser.image =~ /<url>/ # RSS image contains an <url> spec
-          parser.image.scan(/<url>(.*?)<\/url>/).to_s
-        else
-          parser.image # Atom contains just the url
-        end
-      elsif parser.respond_to?(:logo) && parser.logo
-        parser.logo
-      end
-    end
-    def self.feed_id(parser)
-      overridden_value(parser, :id) || ("#{parser.link}" if parser.respond_to?(:link))
-    end
-    # Intended to return the method's value if it is overridden, otherwise nil; as written it returns the result of include? (true/false).
-    def self.overridden_value(object, method)
-      object.class.public_instance_methods(false).include? method
-    end
-  end
-end
+require 'simple-rss'
+# Monkey patches for outstanding issues logged in the simple-rss project.
+#   * Add support for issued time field:
+#     http://rubyforge.org/tracker/index.php?func=detail&aid=13980&group_id=893&atid=3517
+#   * The '+' symbol is lost when escaping fields.
+#     http://rubyforge.org/tracker/index.php?func=detail&aid=10852&group_id=893&atid=3517
+#
+class SimpleRSS
+  @@item_tags << :issued
+  undef clean_content
+  def clean_content(tag, attrs, content)
+    content = content.to_s
+    case tag
+      when :pubDate, :lastBuildDate, :published, :updated, :expirationDate, :modified, :'dc:date', :issued
+        Time.parse(content) rescue unescape(content)
+      when :author, :contributor, :skipHours, :skipDays
+        unescape(content.gsub(/<.*?>/,''))
+      else
+        content.empty? && "#{attrs} " =~ /href=['"]?([^'"]*)['" ]/mi ? $1.strip : unescape(content)
+    end
+  end
+  undef unescape
+  def unescape(s)
+   if s =~ /^\s*(<!\[CDATA\[|\]\]>)/
+     # Raw HTML is inside the CDATA, so just remove the CDATA wrapper.
+     s.gsub(/(<!\[CDATA\[|\]\]>)/,'')
+   elsif s =~ /[<>]/
+     # Already looks like HTML.
+     s
+   else
+     # Make it HTML.
+     FeedNormalizer::HtmlCleaner.unescapeHTML(s)
+   end
+ end
+end
+module FeedNormalizer
+  # The SimpleRSS parser can handle both RSS and Atom feeds.
+  class SimpleRssParser < Parser
+    def self.parser
+      SimpleRSS
+    end
+    def self.parse(xml, loose)
+      begin
+        atomrss = parser.parse(xml)
+      rescue Exception => e
+        #puts "Parser #{parser} failed because #{e.message.gsub("\n",', ')}"
+        return nil
+      end
+      package(atomrss)
+    end
+    # Fairly low priority; a slower, liberal parser.
+    def self.priority
+      900
+    end
+    protected
+    def self.package(atomrss)
+      feed = Feed.new(self)
+      # root elements
+      feed_mapping = {
+        :generator => :generator,
+        :title => :title,
+        :last_updated => [:updated, :lastBuildDate, :pubDate, :dc_date],
+        :copyright => [:copyright, :rights],
+        :authors => [:author, :webMaster, :managingEditor, :contributor],
+        :urls => :link,
+        :description => [:description, :subtitle],
+        :ttl => :ttl
+      }
+      map_functions!(feed_mapping, atomrss, feed)
+      # custom channel elements
+      feed.id = feed_id(atomrss)
+      feed.image = image(atomrss)
+      # entry elements
+      entry_mapping = {
+        :date_published => [:pubDate, :published, :dc_date, :issued],
+        :urls => :link,
+        :enclosures => :enclosure,
+        :description => [:description, :summary],
+        :content => [:content, :content_encoded, :description],
+        :title => :title,
+        :authors => [:author, :contributor, :dc_creator],
+        :categories => :category,
+        :last_updated => [:updated, :dc_date, :pubDate]
+      }
+      atomrss.entries.each do |atomrss_entry|
+        feed_entry = Entry.new
+        map_functions!(entry_mapping, atomrss_entry, feed_entry)
+        # custom entry elements
+        feed_entry.id = atomrss_entry.guid || atomrss_entry[:id] # entries are a Hash..
+        feed_entry.copyright = atomrss_entry.copyright || (atomrss.respond_to?(:copyright) ? atomrss.copyright : nil)
+        feed.entries << feed_entry
+      end
+      feed
+    end
+    def self.image(parser)
+      if parser.respond_to?(:image) && parser.image
+        if parser.image =~ /<url>/ # RSS image contains an <url> spec
+          parser.image.scan(/<url>(.*?)<\/url>/).to_s
+        else
+          parser.image # Atom contains just the url
+        end
+      elsif parser.respond_to?(:logo) && parser.logo
+        parser.logo
+      end
+    end
+    def self.feed_id(parser)
+      overridden_value(parser, :id) || ("#{parser.link}" if parser.respond_to?(:link))
+    end
+    # Intended to return the method's value if it is overridden, otherwise nil; as written it returns the result of include? (true/false).
+    def self.overridden_value(object, method)
+      object.class.public_instance_methods(false).include? method
+    end
+  end
+end

data/lib/structures.rb CHANGED

@@ -1,244 +1,245 @@
-module FeedNormalizer
-  module Singular
-    # If the method being called is a singular (in this simple case, does not
-    # end with an 's'), then it calls the plural method, and returns the first
-    # element. We're assuming that plural methods provide an array.
-    #
-    # Example:
-    # Object contains an array called 'alphas', which looks like [:a, :b, :c].
-    # Call object.alpha and :a is returned.
-    def method_missing(name, *args)
-      return self.send(:"#{name}s").first rescue super(name, *args)
-    end
-    def respond_to?(x, y=false)
-      self.class::ELEMENTS.include?(x) || self.class::ELEMENTS.include?(:"#{x}s") || super(x, y)
-    end
-  end
-  module ElementEquality
-    def eql?(other)
-      self == (other)
-    end
-    def ==(other)
-      other.equal?(self) ||
-        (other.instance_of?(self.class) &&
-          self.class::ELEMENTS.all?{ |el| self.send(el) == other.send(el)} )
-    end
-    # Returns the difference between two Feed instances as a hash.
-    # Any top-level differences in the Feed object are presented as:
-    #
-    #  { :title => [content, other_content] }
-    #
-    # For differences at the items level, an array of hashes shows the diffs
-    # on a per-entry basis. Only entries that differ will contain a hash:
-    #
-    #  { :items => [
-    #     {:title => ["An article title", "A new article title"]},
-    #     {:title => ["one title", "a different title"]} ]}
-    #
-    # If the number of items in each feed is different, then the count of each
-    # is provided instead:
-    #
-    #  { :items => [4,5] }
-    #
-    # This method can also be useful for human-readable feed comparison if
-    # its output is dumped to YAML.
-    def diff(other, elements = self.class::ELEMENTS)
-      diffs = {}
-      elements.each do |element|
-        if other.respond_to?(element)
-          self_value = self.send(element)
-          other_value = other.send(element)
-          next if self_value == other_value
-          diffs[element] = if other_value.respond_to?(:diff)
-            self_value.diff(other_value)
-          elsif other_value.is_a?(Enumerable) && other_value.all?{|v| v.respond_to?(:diff)}
-            if self_value.size != other_value.size
-              [self_value.size, other_value.size]
-            else
-              enum_diffs = []
-              self_value.each_with_index do |val, index|
-                enum_diffs << val.diff(other_value[index], val.class::ELEMENTS)
-              end
-              enum_diffs.reject{|h| h.empty?}
-            end
-          else
-            [other_value, self_value] unless other_value == self_value
-          end
-        end
-      end
-      diffs
-    end
-  end
-  module ElementCleaner
-    # Recursively cleans all elements in place.
-    #
-    # Only allow tags in whitelist. Always parse the html with a parser and delete
-    # all tags that aren't on the list.
-    #
-    # For feed elements that can contain HTML:
-    # - feed.(title|description)
-    # - feed.entries[n].(title|description|content)
-    #
-    def clean!
-      self.class::SIMPLE_ELEMENTS.each do |element|
-        val = self.send(element)
-        send("#{element}=", (val.is_a?(Array) ?
-          val.collect{|v| HtmlCleaner.flatten(v.to_s)} : HtmlCleaner.flatten(val.to_s)))
-      end
-      self.class::HTML_ELEMENTS.each do |element|
-        send("#{element}=", HtmlCleaner.clean(self.send(element).to_s))
-      end
-      self.class::BLENDED_ELEMENTS.each do |element|
-        self.send(element).collect{|v| v.clean!}
-      end
-    end
-  end
-  module TimeFix
-    # Reparse any Time instances, due to RSS::Parser's redefinition of
-    # certain aspects of the Time class that creates unexpected behaviour
-    # when extending the Time class, as some common third party libraries do.
-    # See http://code.google.com/p/feed-normalizer/issues/detail?id=13.
-    def reparse(obj)
-      @parsed ||= false
-      return obj if @parsed
-      if obj.is_a?(Time)
-        @parsed = true
-        Time.at(obj) rescue obj
-      end
-    end
-  end
-  module RewriteRelativeLinks
-    def rewrite_relative_links(text, url)
-      if host = url_host(url)
-        text.to_s.gsub(/(href|src)=('|")\//, '\1=\2http://' + host + '/')
-      else
-        text
-      end
-    end
-    private
-      def url_host(url)
-        URI.parse(url).host rescue nil
-      end
-  end
-  # Represents a feed item entry.
-  # Available fields are:
-  #  * content
-  #  * description
-  #  * title
-  #  * date_published
-  #  * urls / url
-  #  * id
-  #  * authors / author
-  #  * copyright
-  #  * categories
-  class Entry
-    include Singular, ElementEquality, ElementCleaner, TimeFix, RewriteRelativeLinks
-    HTML_ELEMENTS = [:content, :description, :title]
-    SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories, :last_updated]
-    BLENDED_ELEMENTS = []
-    ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
-    attr_accessor(*ELEMENTS)
-    def initialize
-      @urls = []
-      @authors = []
-      @categories = []
-      @date_published, @content = nil
-    end
-    undef date_published
-    def date_published
-      @date_published = reparse(@date_published)
-    end
-    undef content
-    def content
-      @content = rewrite_relative_links(@content, url)
-    end
-  end
-  # Represents the root element of a feed.
-  # Available fields are:
-  #  * title
-  #  * description
-  #  * id
-  #  * last_updated
-  #  * copyright
-  #  * authors / author
-  #  * urls / url
-  #  * image
-  #  * generator
-  #  * items / channel
-  class Feed
-    include Singular, ElementEquality, ElementCleaner, TimeFix
-    # Elements that can contain HTML fragments.
-    HTML_ELEMENTS = [:title, :description]
-    # Elements that contain 'plain' Strings, with HTML escaped.
-    SIMPLE_ELEMENTS = [:id, :last_updated, :copyright, :authors, :urls, :image, :generator, :ttl, :skip_hours, :skip_days]
-    # Elements that contain both HTML and escaped HTML.
-    BLENDED_ELEMENTS = [:items]
-    ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
-    attr_accessor(*ELEMENTS)
-    attr_accessor(:parser)
-    alias :entries :items
-    def initialize(wrapper)
-      # set up associations (i.e. arrays where needed)
-      @urls = []
-      @authors = []
-      @skip_hours = []
-      @skip_days = []
-      @items = []
-      @parser = wrapper.parser.to_s
-      @last_updated = nil
-    end
-    undef last_updated
-    def last_updated
-      @last_updated = reparse(@last_updated)
-    end
-    def channel() self end
-  end
-end
+module FeedNormalizer
+  module Singular
+    # If the method being called is a singular (in this simple case, does not
+    # end with an 's'), then it calls the plural method, and returns the first
+    # element. We're assuming that plural methods provide an array.
+    #
+    # Example:
+    # Object contains an array called 'alphas', which looks like [:a, :b, :c].
+    # Call object.alpha and :a is returned.
+    def method_missing(name, *args)
+      return self.send(:"#{name}s").first rescue super(name, *args)
+    end
+    def respond_to?(x, y=false)
+      self.class::ELEMENTS.include?(x) || self.class::ELEMENTS.include?(:"#{x}s") || super(x, y)
+    end
+  end
+  module ElementEquality
+    def eql?(other)
+      self == (other)
+    end
+    def ==(other)
+      other.equal?(self) ||
+        (other.instance_of?(self.class) &&
+          self.class::ELEMENTS.all?{ |el| self.send(el) == other.send(el)} )
+    end
+    # Returns the difference between two Feed instances as a hash.
+    # Any top-level differences in the Feed object are presented as:
+    #
+    #  { :title => [content, other_content] }
+    #
+    # For differences at the items level, an array of hashes shows the diffs
+    # on a per-entry basis. Only entries that differ will contain a hash:
+    #
+    #  { :items => [
+    #     {:title => ["An article title", "A new article title"]},
+    #     {:title => ["one title", "a different title"]} ]}
+    #
+    # If the number of items in each feed is different, then the count of each
+    # is provided instead:
+    #
+    #  { :items => [4,5] }
+    #
+    # This method can also be useful for human-readable feed comparison if
+    # its output is dumped to YAML.
+    def diff(other, elements = self.class::ELEMENTS)
+      diffs = {}
+      elements.each do |element|
+        if other.respond_to?(element)
+          self_value = self.send(element)
+          other_value = other.send(element)
+          next if self_value == other_value
+          diffs[element] = if other_value.respond_to?(:diff)
+            self_value.diff(other_value)
+          elsif other_value.is_a?(Enumerable) && other_value.all?{|v| v.respond_to?(:diff)}
+            if self_value.size != other_value.size
+              [self_value.size, other_value.size]
+            else
+              enum_diffs = []
+              self_value.each_with_index do |val, index|
+                enum_diffs << val.diff(other_value[index], val.class::ELEMENTS)
+              end
+              enum_diffs.reject{|h| h.empty?}
+            end
+          else
+            [other_value, self_value] unless other_value == self_value
+          end
+        end
+      end
+      diffs
+    end
+  end
+  module ElementCleaner
+    # Recursively cleans all elements in place.
+    #
+    # Only allow tags in whitelist. Always parse the html with a parser and delete
+    # all tags that aren't on the list.
+    #
+    # For feed elements that can contain HTML:
+    # - feed.(title|description)
+    # - feed.entries[n].(title|description|content)
+    #
+    def clean!
+      self.class::SIMPLE_ELEMENTS.each do |element|
+        val = self.send(element)
+        send("#{element}=", (val.is_a?(Array) ?
+          val.collect{|v| HtmlCleaner.flatten(v.to_s)} : HtmlCleaner.flatten(val.to_s)))
+      end
+      self.class::HTML_ELEMENTS.each do |element|
+        send("#{element}=", HtmlCleaner.clean(self.send(element).to_s))
+      end
+      self.class::BLENDED_ELEMENTS.each do |element|
+        self.send(element).collect{|v| v.clean!}
+      end
+    end
+  end
+  module TimeFix
+    # Reparse any Time instances, due to RSS::Parser's redefinition of
+    # certain aspects of the Time class that creates unexpected behaviour
+    # when extending the Time class, as some common third party libraries do.
+    # See http://code.google.com/p/feed-normalizer/issues/detail?id=13.
+    def reparse(obj)
+      @parsed ||= false
+      return obj if @parsed
+      if obj.is_a?(Time)
+        @parsed = true
+        Time.at(obj) rescue obj
+      end
+    end
+  end
+  module RewriteRelativeLinks
+    def rewrite_relative_links(text, url)
+      if host = url_host(url)
+        text.to_s.gsub(/(href|src)=('|")\//, '\1=\2http://' + host + '/')
+      else
+        text
+      end
+    end
+    private
+      def url_host(url)
+        URI.parse(url).host rescue nil
+      end
+  end
+  # Represents a feed item entry.
+  # Available fields are:
+  #  * content
+  #  * description
+  #  * title
+  #  * date_published
+  #  * urls / url
+  #  * id
+  #  * authors / author
+  #  * copyright
+  #  * categories
+  class Entry
+    include Singular, ElementEquality, ElementCleaner, TimeFix, RewriteRelativeLinks
+    HTML_ELEMENTS = [:content, :description, :title]
+    SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories, :last_updated, :enclosures]
+    BLENDED_ELEMENTS = []
+    ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
+    attr_accessor(*ELEMENTS)
+    def initialize
+      @urls = []
+      @authors = []
+      @categories = []
+      @enclosures = []
+      @date_published, @content = nil
+    end
+    undef date_published
+    def date_published
+      @date_published = reparse(@date_published)
+    end
+    undef content
+    def content
+      @content = rewrite_relative_links(@content, url)
+    end
+  end
+  # Represents the root element of a feed.
+  # Available fields are:
+  #  * title
+  #  * description
+  #  * id
+  #  * last_updated
+  #  * copyright
+  #  * authors / author
+  #  * urls / url
+  #  * image
+  #  * generator
+  #  * items / channel
+  class Feed
+    include Singular, ElementEquality, ElementCleaner, TimeFix
+    # Elements that can contain HTML fragments.
+    HTML_ELEMENTS = [:title, :description]
+    # Elements that contain 'plain' Strings, with HTML escaped.
+    SIMPLE_ELEMENTS = [:id, :last_updated, :copyright, :authors, :urls, :image, :generator, :ttl, :skip_hours, :skip_days]
+    # Elements that contain both HTML and escaped HTML.
+    BLENDED_ELEMENTS = [:items]
+    ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
+    attr_accessor(*ELEMENTS)
+    attr_accessor(:parser)
+    alias :entries :items
+    def initialize(wrapper)
+      # set up associations (i.e. arrays where needed)
+      @urls = []
+      @authors = []
+      @skip_hours = []
+      @skip_days = []
+      @items = []
+      @parser = wrapper.parser.to_s
+      @last_updated = nil
+    end
+    undef last_updated
+    def last_updated
+      @last_updated = reparse(@last_updated)
+    end
+    def channel() self end
+  end
+end