RubyGems - fblee-feedzirra - Versions diffs - 0.0.17 - Mend

fblee-feedzirra 0.0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

data/README.textile +196 -0
data/Rakefile +56 -0
data/lib/core_ext/date.rb +21 -0
data/lib/core_ext/string.rb +9 -0
data/lib/feedzirra/feed.rb +357 -0
data/lib/feedzirra/feed_entry_utilities.rb +81 -0
data/lib/feedzirra/feed_utilities.rb +71 -0
data/lib/feedzirra/parser/atom.rb +36 -0
data/lib/feedzirra/parser/atom_entry.rb +41 -0
data/lib/feedzirra/parser/atom_feed_burner.rb +28 -0
data/lib/feedzirra/parser/atom_feed_burner_entry.rb +37 -0
data/lib/feedzirra/parser/itunes_rss.rb +50 -0
data/lib/feedzirra/parser/itunes_rss_item.rb +31 -0
data/lib/feedzirra/parser/itunes_rss_owner.rb +12 -0
data/lib/feedzirra/parser/rss.rb +36 -0
data/lib/feedzirra/parser/rss_entry.rb +45 -0
data/lib/feedzirra/web_page.rb +8 -0
data/lib/feedzirra.rb +35 -0
data/spec/feedzirra/feed_entry_utilities_spec.rb +62 -0
data/spec/feedzirra/feed_spec.rb +595 -0
data/spec/feedzirra/feed_utilities_spec.rb +149 -0
data/spec/feedzirra/parser/atom_entry_spec.rb +49 -0
data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +42 -0
data/spec/feedzirra/parser/atom_feed_burner_spec.rb +39 -0
data/spec/feedzirra/parser/atom_spec.rb +43 -0
data/spec/feedzirra/parser/itunes_rss_item_spec.rb +48 -0
data/spec/feedzirra/parser/itunes_rss_owner_spec.rb +18 -0
data/spec/feedzirra/parser/itunes_rss_spec.rb +50 -0
data/spec/feedzirra/parser/rss_entry_spec.rb +41 -0
data/spec/feedzirra/parser/rss_spec.rb +45 -0
data/spec/spec.opts +2 -0
data/spec/spec_helper.rb +70 -0
metadata +177 -0

data/lib/feedzirra/feed_entry_utilities.rb ADDED Viewed

@@ -0,0 +1,81 @@
+module Feedzirra
+  module FeedEntryUtilities
+    def published
+      @published || @updated
+    end
+    def parse_datetime(string)
+      begin
+        return DateTime.parse(string).feed_utils_to_gm_time
+      rescue
+        # This means the date was not in an expected format. Often publishers use bizarre character encodings
+        # and/or foreign languages in day or month names, so we try and ignore bogus data and see if it's possible
+        # to make a sensible extraction of the data that is parseable.
+        # This reg exp matches string like: ?���, 1 ???��� 2009 14:26 -0400
+        # which we take to mean the 1st of the current month, at 14:26 in Eastern Summer Time.
+        date_matcher = /.+?\s+(\d?\d)\s+.+?(\d\d\d\d)\s+(\d?\d):(\d\d):?(\d?\d)?\s+([-+]\d\d\d\d|\w\w\w)/
+        processed_date = date_matcher.match(string)
+        if processed_date.nil?
+            puts "DATE CAN'T BE PARSED: #{string}"
+            return nil
+        else
+            day_number = processed_date[1].to_i
+            year = processed_date[2].to_i
+            # sanitize future years to be this year
+            today = DateTime.now
+            current_year = today.year
+            year = current_year if year > current_year
+            # Guess the month: if the day number is less than today
+            if day_number > today.day
+                month = (today << 1).month
+            else
+                month = today.month
+            end
+            hours = processed_date[3].to_i
+            minutes = processed_date[4].to_i
+            seconds = processed_date[5] || "00"
+            timezone_indicator = processed_date[6]
+            parsed_datetime_string = "#{year}-#{month}-#{day_number} #{hours}:#{minutes}:#{seconds} #{timezone_indicator}"
+#            puts parsed_datetime_string
+            correct_date = DateTime.parse(parsed_datetime_string)
+            gmt_date = correct_date.feed_utils_to_gm_time
+            puts "Correctly sanitized a date string after initial parse failed. Went from: [#{string}] to [#{gmt_date.to_s}] via [#{correct_date.to_s}]"
+            return gmt_date
+        end
+      end
+    end
+    ##
+    # Returns the id of the entry or its url if not id is present, as some formats don't support it
+    def id
+      @id || @url
+    end
+    ##
+    # Writter for published. By default, we keep the "oldest" publish time found.
+    def published=(val)
+      parsed = parse_datetime(val)
+      @published = parsed if !@published || parsed < @published
+    end
+    ##
+    # Writter for udapted. By default, we keep the most recenet update time found.
+    def updated=(val)
+      parsed = parse_datetime(val)
+      @updated = parsed if !@updated || parsed > @updated
+    end
+    def sanitize!
+      self.title.sanitize!   if self.title
+      self.author.sanitize!  if self.author
+      self.summary.sanitize! if self.summary
+      self.content.sanitize! if self.content
+    end
+    alias_method :last_modified, :published
+  end
+end

data/lib/feedzirra/feed_utilities.rb ADDED Viewed

@@ -0,0 +1,71 @@
+module Feedzirra
+  module FeedUtilities
+    UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified)
+    attr_writer   :new_entries, :updated, :last_modified
+    attr_accessor :etag
+    def last_modified
+      @last_modified ||= begin
+        entry = entries.reject {|e| e.published.nil? }.sort_by { |entry| entry.published if entry.published }.last
+        entry ? entry.published : nil
+      end
+    end
+    def updated?
+      @updated
+    end
+    def new_entries
+      @new_entries ||= []
+    end
+    def has_new_entries?
+      new_entries.size > 0
+    end
+    def update_from_feed(feed)
+      self.new_entries += find_new_entries_for(feed)
+      self.entries.unshift(*self.new_entries)
+      updated! if UPDATABLE_ATTRIBUTES.any? { |name| update_attribute(feed, name) }
+    end
+    def update_attribute(feed, name)
+      old_value, new_value = send(name), feed.send(name)
+      if old_value != new_value
+        send("#{name}=", new_value)
+      end
+    end
+    def sanitize_entries!
+      entries.each {|entry| entry.sanitize!}
+    end
+    private
+    def updated!
+      @updated = true
+    end
+    def find_new_entries_for(feed)
+      # this implementation is a hack, which is why it's so ugly.
+      # it's to get around the fact that not all feeds have a published date.
+      # however, they're always ordered with the newest one first.
+      # So we go through the entries just parsed and insert each one as a new entry
+      # until we get to one that has the same url as the the newest for the feed
+      latest_entry = self.entries.first
+      found_new_entries = []
+      feed.entries.each do |entry|
+        break if entry.url == latest_entry.url
+        found_new_entries << entry
+      end
+      found_new_entries
+    end
+    def existing_entry?(test_entry)
+      entries.any? { |entry| entry.url == test_entry.url }
+    end
+  end
+end

data/lib/feedzirra/parser/atom.rb ADDED Viewed

@@ -0,0 +1,36 @@
+module Feedzirra
+  module Parser
+    # == Summary
+    # Parser for dealing with Atom feeds.
+    #
+    # == Attributes
+    # * title
+    # * feed_url
+    # * url
+    # * description (read from the subtitle element)
+    # * links (href of every link element)
+    # * entries (parsed with AtomEntry)
+    class Atom
+      include SAXMachine
+      include FeedUtilities
+      element :title
+      # url and feed_url are read from the href of link elements, told apart
+      # by the type attribute given in :with.
+      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
+      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
+      elements :link, :as => :links, :value => :href
+      element :subtitle, :as => :description
+      elements :entry, :as => :entries, :class => AtomEntry
+      # Truthy when the xml mentions "Atom" or one of the two Atom namespace URIs.
+      def self.able_to_parse?(xml) #:nodoc:
+        xml =~ /(Atom)|(#{Regexp.escape("http://purl.org/atom")})|(#{Regexp.escape("http://www.w3.org/2005/Atom")})/
+      end
+      # The text/html link when one was found, otherwise the last link href collected.
+      def url
+        @url || links.last
+      end
+      # The application/atom+xml link when one was found, otherwise the first link href collected.
+      def feed_url
+        @feed_url || links.first
+      end
+    end
+  end
+end

data/lib/feedzirra/parser/atom_entry.rb ADDED Viewed

@@ -0,0 +1,41 @@
+module Feedzirra
+  module Parser
+    # == Summary
+    # Parser for dealing with Atom feed entries.
+    #
+    # == Attributes
+    # * title
+    # * url
+    # * author
+    # * content
+    # * summary
+    # * published
+    # * updated
+    # * image
+    # * categories
+    # * links
+    class AtomEntry
+      include SAXMachine
+      include FeedEntryUtilities
+      element :title
+      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
+      element :name, :as => :author
+      element :guid
+      element :content
+      element :summary
+      # published, created and issued all feed the published= writer; updated
+      # and modified both feed updated= (both writers come from FeedEntryUtilities).
+      element :published
+      element :id
+      element :created, :as => :published
+      element :"media:content", :as => :image, :value => :url
+      element :issued, :as => :published
+      element :updated
+      element :modified, :as => :updated
+      elements :category, :as => :categories, :value => :term
+      elements :link, :as => :links, :value => :href
+      # The text/html alternate link when one was found, otherwise the first link href collected.
+      def url
+        @url || links.first
+      end
+    end
+  end
+end

data/lib/feedzirra/parser/atom_feed_burner.rb ADDED Viewed

@@ -0,0 +1,28 @@
+module Feedzirra
+  module Parser
+    # == Summary
+    # Parser for dealing with Feedburner Atom feeds.
+    #
+    # == Attributes
+    # * title
+    # * feed_url
+    # * url
+    # * description (read from the subtitle element)
+    # * entries (parsed with AtomFeedBurnerEntry)
+    class AtomFeedBurner
+      include SAXMachine
+      include FeedUtilities
+      element :title
+      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
+      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
+      element :subtitle, :as => :description
+      elements :entry, :as => :entries, :class => AtomFeedBurnerEntry
+      # Truthy only when the xml mentions both "Atom" and "feedburner";
+      # returns false (never nil) otherwise.
+      def self.able_to_parse?(xml) #:nodoc:
+        (xml =~ /Atom/ && xml =~ /feedburner/) || false
+      end
+    end
+  end
+end

data/lib/feedzirra/parser/atom_feed_burner_entry.rb ADDED Viewed

@@ -0,0 +1,37 @@
+module Feedzirra
+  module Parser
+    # == Summary
+    # Parser for dealing with Feedburner Atom feed entries.
+    #
+    # == Attributes
+    # * title
+    # * url
+    # * author
+    # * content
+    # * summary
+    # * published
+    # * updated
+    # * image
+    # * categories
+    class AtomFeedBurnerEntry
+      include SAXMachine
+      include FeedEntryUtilities
+      element :title
+      element :name, :as => :author
+      # Two sources are declared for url: the text/html alternate link and
+      # feedburner:origLink.
+      # NOTE(review): which one wins when both are present depends on
+      # SAXMachine's handling of repeated :as targets — confirm.
+      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
+      element :"feedburner:origLink", :as => :url
+      element :summary
+      element :guid
+      element :content
+      # published, issued and created all feed the published= writer; updated
+      # and modified both feed updated= (both writers come from FeedEntryUtilities).
+      element :published
+      element :id
+      element :issued, :as => :published
+      element :created, :as => :published
+      element :"media:content", :as => :image, :value => :url
+      element :updated
+      element :modified, :as => :updated
+      elements :category, :as => :categories, :value => :term
+    end
+  end
+end

data/lib/feedzirra/parser/itunes_rss.rb ADDED Viewed

@@ -0,0 +1,50 @@
+module Feedzirra
+  module Parser
+    # iTunes is RSS 2.0 + some apple extensions
+    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
+    class ITunesRSS
+      include SAXMachine
+      include FeedUtilities
+      attr_accessor :feed_url
+      # RSS 2.0 elements that need including
+      element :copyright
+      element :description
+      element :language
+      element :managingEditor
+      element :title
+      element :link, :as => :url
+      # If author is not present use managingEditor on the channel
+      element :"itunes:author", :as => :itunes_author
+      element :"itunes:block", :as => :itunes_block
+      element :"itunes:image", :value => :href, :as => :itunes_image
+      element :"itunes:explicit", :as => :itunes_explicit
+      element :"itunes:keywords", :as => :itunes_keywords
+      # New URL for the podcast feed
+      element :"itunes:new-feed-url", :as => :itunes_new_feed_url
+      element :"itunes:subtitle", :as => :itunes_subtitle
+      # If summary is not present, use the description tag
+      element :"itunes:summary", :as => :itunes_summary
+      # iTunes RSS feeds can have multiple main categories...
+      # ...and multiple sub-categories per category
+      # TODO subcategories not supported correctly - they are at the same level
+      #   as the main categories
+      elements :"itunes:category", :as => :itunes_categories, :value => :text
+      elements :"itunes:owner", :as => :itunes_owners, :class => ITunesRSSOwner
+      elements :item, :as => :entries, :class => ITunesRSSItem
+      def self.able_to_parse?(xml)
+        xml =~ /xmlns:itunes=\"http:\/\/www.itunes.com\/dtds\/podcast-1.0.dtd\"/
+      end
+    end
+  end
+end

data/lib/feedzirra/parser/itunes_rss_item.rb ADDED Viewed

@@ -0,0 +1,31 @@
+module Feedzirra
+  module Parser
+    # iTunes extensions to the standard RSS2.0 item
+    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
+    #
+    # NOTE(review): this item class mixes in FeedUtilities (the feed-level
+    # helpers) while the other entry classes in this package mix in
+    # FeedEntryUtilities, so published is not routed through
+    # FeedEntryUtilities#published= here — confirm this is intended.
+    class ITunesRSSItem
+      include SAXMachine
+      include FeedUtilities
+      element :author
+      element :guid
+      element :title
+      element :link, :as => :url
+      element :description, :as => :summary
+      element :pubDate, :as => :published
+      # If author is not present use author tag on the item
+      element :"itunes:author", :as => :itunes_author
+      element :"itunes:block", :as => :itunes_block
+      element :"itunes:duration", :as => :itunes_duration
+      element :"itunes:explicit", :as => :itunes_explicit
+      element :"itunes:keywords", :as => :itunes_keywords
+      element :"itunes:subtitle", :as => :itunes_subtitle
+      # If summary is not present, use the description tag
+      element :"itunes:summary", :as => :itunes_summary
+      # The enclosure element is read three times, once per attribute of interest.
+      element :enclosure, :value => :length, :as => :enclosure_length
+      element :enclosure, :value => :type, :as => :enclosure_type
+      element :enclosure, :value => :url, :as => :enclosure_url
+    end
+  end
+end

data/lib/feedzirra/parser/itunes_rss_owner.rb ADDED Viewed

@@ -0,0 +1,12 @@
+module Feedzirra
+  module Parser
+    # Owner block of an iTunes RSS feed (used by ITunesRSS for itunes:owner);
+    # exposes the owner's name and email.
+    # NOTE(review): mixes in FeedUtilities although it declares no entries —
+    # confirm the feed-level helpers are needed here.
+    class ITunesRSSOwner
+      include SAXMachine
+      include FeedUtilities
+      element :"itunes:name", :as => :name
+      element :"itunes:email", :as => :email
+    end
+  end
+end

data/lib/feedzirra/parser/rss.rb ADDED Viewed

@@ -0,0 +1,36 @@
+module Feedzirra
+  module Parser
+    # == Summary
+    # Parser for dealing with RSS feeds.
+    #
+    # == Attributes
+    # * title
+    # * feed_url (plain accessor, not read from the xml)
+    # * url
+    # * description
+    # * entries (parsed with RSSEntry)
+    class RSS
+      include SAXMachine
+      include FeedUtilities
+      element :title
+      element :link, :as => :url
+      elements :item, :as => :entries, :class => RSSEntry
+      # parse the subtitle and description, so we can use whatever we have!
+      element :subtitle
+      element :description, :as => :feed_description
+      attr_accessor :feed_url
+      # The description element when present, otherwise the subtitle.
+      def description
+          self.feed_description || self.subtitle
+      end
+      # Truthy when the xml contains an opening "<rss" or "<rdf" tag.
+      def self.able_to_parse?(xml) #:nodoc:
+        xml =~ /\<rss|\<rdf/
+      end
+    end
+  end
+end

data/lib/feedzirra/parser/rss_entry.rb ADDED Viewed

@@ -0,0 +1,45 @@
+module Feedzirra
+  module Parser
+    # == Summary
+    # Parser for dealing with RSS (and RDF) feed entries.
+    #
+    # == Attributes
+    # * title
+    # * url
+    # * author
+    # * content
+    # * summary
+    # * published
+    # * updated
+    # * image
+    # * categories
+    class RSSEntry
+      include SAXMachine
+      include FeedEntryUtilities
+      element :title
+      element :link, :as => :url
+      # author may come from either dc:creator or author.
+      element :"dc:creator", :as => :author
+      element :author, :as => :author
+      element :"content:encoded", :as => :content
+      element :description, :as => :summary
+      # Several date element spellings all feed the published= writer from
+      # FeedEntryUtilities; dcterms:modified feeds updated=.
+      element :pubDate, :as => :published
+      element :pubdate, :as => :published
+      element :"dc:date", :as => :published
+      element :"dc:Date", :as => :published
+      element :"dcterms:created", :as => :published
+      element :"dcterms:modified", :as => :updated
+      element :issued, :as => :published
+      elements :category, :as => :categories
+      element :guid
+      # TODO: the image is sometimes marked with type="image/jpeg" and sometimes with medium="image"; neither is checked here.
+      element :"media:content", :as => :image, :value => :url
+    end
+  end
+end

data/lib/feedzirra/web_page.rb ADDED Viewed

@@ -0,0 +1,8 @@
+module Feedzirra
+  # Parser for an HTML page that advertises a feed: feed_url is read from the
+  # href of a link element with rel="alternate" and type="application/rss+xml".
+  # Only the RSS type is declared here; no Atom alternate link is matched.
+  class WebPage
+    include SAXMachine
+    include FeedUtilities
+    element :title # not essential; helpful for debugging
+    element :link, :as => :feed_url, :value => :href, :with => {:rel => 'alternate', :type => "application/rss+xml"}
+  end
+end

data/lib/feedzirra.rb ADDED Viewed

@@ -0,0 +1,35 @@
+# Entry point of the gem: puts this directory on the load path, loads the
+# external dependencies, then the library's own files.
+$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))
+gem 'activesupport'
+require 'zlib'
+require 'curb_core'
+require 'sax-machine'
+require 'dryopteris'
+require 'uri'
+require 'active_support/basic_object'
+require 'active_support/core_ext/object'
+require 'active_support/core_ext/time'
+require 'core_ext/date'
+require 'core_ext/string'
+# The utility mixins are loaded before the classes that include them.
+require 'feedzirra/feed_utilities'
+require 'feedzirra/feed_entry_utilities'
+require 'feedzirra/web_page'
+require 'feedzirra/feed'
+# Entry/item/owner classes are loaded before the feed classes, whose class
+# bodies reference them by constant (e.g. :class => RSSEntry).
+require 'feedzirra/parser/rss_entry'
+require 'feedzirra/parser/itunes_rss_owner'
+require 'feedzirra/parser/itunes_rss_item'
+require 'feedzirra/parser/atom_entry'
+require 'feedzirra/parser/atom_feed_burner_entry'
+require 'feedzirra/parser/rss'
+require 'feedzirra/parser/itunes_rss'
+require 'feedzirra/parser/atom'
+require 'feedzirra/parser/atom_feed_burner'
+module Feedzirra
+  # Version of this gem release.
+  VERSION = "0.0.17"
+end

data/spec/feedzirra/feed_entry_utilities_spec.rb ADDED Viewed

@@ -0,0 +1,62 @@
+require File.dirname(__FILE__) + '/../spec_helper'
+describe Feedzirra::FeedUtilities do
+  before(:each) do
+    @klass = Class.new do
+      include Feedzirra::FeedEntryUtilities
+    end
+  end
+  describe "handling dates" do
+    it "should parse an ISO 8601 formatted datetime into Time" do
+      time = @klass.new.parse_datetime("2008-02-20T8:05:00-010:00")
+      time.class.should == Time
+      time.to_s.should == "Wed Feb 20 18:05:00 UTC 2008"
+    end
+    it "should parse a bunch of strangely encoded stuff into Time" do
+      time = @klass.new.parse_datetime("Mon, 4 Jan 7010 13:51:39 EST")
+      time.class.should == Time
+      time.to_s.should == "Mon Jan 04 18:51:39 UTC 2010"
+      time = @klass.new.parse_datetime("���, 5 1��� 2010 10:37 -0500")
+      time.class.should == Time
+      time.to_s.should == "Tue Jan 05 15:37:00 UTC 2010"
+    end
+  end
+  describe "sanitizing" do
+    before(:each) do
+      @feed = Feedzirra::Feed.parse(sample_atom_feed)
+      @entry = @feed.entries.first
+    end
+    it "should provide a sanitized title" do
+      new_title = "<script>" + @entry.title
+      @entry.title = new_title
+      @entry.title.sanitize.should == Dryopteris.sanitize(new_title)
+    end
+    it "should sanitize content in place" do
+      new_content = "<script>" + @entry.content
+      @entry.content = new_content.dup
+      @entry.content.sanitize!.should == Dryopteris.sanitize(new_content)
+      @entry.content.should == Dryopteris.sanitize(new_content)
+    end
+    it "should sanitize things in place" do
+      @entry.title   += "<script>"
+      @entry.author  += "<script>"
+      @entry.content += "<script>"
+      cleaned_title   = Dryopteris.sanitize(@entry.title)
+      cleaned_author  = Dryopteris.sanitize(@entry.author)
+      cleaned_content = Dryopteris.sanitize(@entry.content)
+      @entry.sanitize!
+      @entry.title.should   == cleaned_title
+      @entry.author.should  == cleaned_author
+      @entry.content.should == cleaned_content
+    end
+  end
+end