Spectives-logophobia-feedzirra 0.0.31
- data/README.rdoc +169 -0
- data/README.textile +205 -0
- data/Rakefile +56 -0
- data/lib/core_ext/date.rb +21 -0
- data/lib/core_ext/string.rb +9 -0
- data/lib/feedzirra.rb +44 -0
- data/lib/feedzirra/feed.rb +333 -0
- data/lib/feedzirra/feed_entry_utilities.rb +45 -0
- data/lib/feedzirra/feed_utilities.rb +71 -0
- data/lib/feedzirra/parser/atom.rb +35 -0
- data/lib/feedzirra/parser/atom_entry.rb +41 -0
- data/lib/feedzirra/parser/itunes_category.rb +12 -0
- data/lib/feedzirra/parser/mrss_category.rb +11 -0
- data/lib/feedzirra/parser/mrss_content.rb +48 -0
- data/lib/feedzirra/parser/mrss_copyright.rb +10 -0
- data/lib/feedzirra/parser/mrss_credit.rb +11 -0
- data/lib/feedzirra/parser/mrss_group.rb +37 -0
- data/lib/feedzirra/parser/mrss_hash.rb +10 -0
- data/lib/feedzirra/parser/mrss_player.rb +11 -0
- data/lib/feedzirra/parser/mrss_rating.rb +10 -0
- data/lib/feedzirra/parser/mrss_restriction.rb +11 -0
- data/lib/feedzirra/parser/mrss_text.rb +13 -0
- data/lib/feedzirra/parser/mrss_thumbnail.rb +11 -0
- data/lib/feedzirra/parser/rss.rb +83 -0
- data/lib/feedzirra/parser/rss_entry.rb +83 -0
- data/lib/feedzirra/parser/rss_image.rb +15 -0
- data/spec/benchmarks/feed_benchmarks.rb +98 -0
- data/spec/benchmarks/feedzirra_benchmarks.rb +40 -0
- data/spec/benchmarks/fetching_benchmarks.rb +28 -0
- data/spec/benchmarks/parsing_benchmark.rb +30 -0
- data/spec/benchmarks/updating_benchmarks.rb +33 -0
- data/spec/feedzirra/feed_entry_utilities_spec.rb +52 -0
- data/spec/feedzirra/feed_spec.rb +546 -0
- data/spec/feedzirra/feed_utilities_spec.rb +149 -0
- data/spec/feedzirra/parser/atom_entry_spec.rb +49 -0
- data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +42 -0
- data/spec/feedzirra/parser/atom_feed_burner_spec.rb +39 -0
- data/spec/feedzirra/parser/atom_spec.rb +43 -0
- data/spec/feedzirra/parser/mrss_content_spec.rb +32 -0
- data/spec/feedzirra/parser/rss_entry_spec.rb +154 -0
- data/spec/feedzirra/parser/rss_spec.rb +93 -0
- data/spec/sample_feeds/run_against_sample.rb +20 -0
- data/spec/spec_helper.rb +62 -0
- metadata +155 -0
data/lib/feedzirra.rb
ADDED
@@ -0,0 +1,44 @@
$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))

gem 'activesupport'

require 'zlib'
require 'curb'
require 'sax-machine'
require 'dryopteris'
require 'uri'
require 'active_support/basic_object'
require 'active_support/core_ext/object'
require 'active_support/core_ext/time'


require 'core_ext/date'
require 'core_ext/string'

require 'feedzirra/feed_utilities'
require 'feedzirra/feed_entry_utilities'
require 'feedzirra/feed'

require 'feedzirra/parser/mrss_content'
require 'feedzirra/parser/mrss_credit'
require 'feedzirra/parser/mrss_restriction'
require 'feedzirra/parser/mrss_group'
require 'feedzirra/parser/mrss_category'
require 'feedzirra/parser/mrss_copyright'
require 'feedzirra/parser/mrss_hash'
require 'feedzirra/parser/mrss_player'
require 'feedzirra/parser/mrss_rating'
require 'feedzirra/parser/mrss_restriction'
require 'feedzirra/parser/mrss_text'
require 'feedzirra/parser/mrss_thumbnail'
require 'feedzirra/parser/rss_entry'
require 'feedzirra/parser/rss_image'
require 'feedzirra/parser/itunes_category'
require 'feedzirra/parser/atom_entry'

require 'feedzirra/parser/rss'
require 'feedzirra/parser/atom'

module Feedzirra
  VERSION = "0.0.31"
end
data/lib/feedzirra/feed.rb
ADDED
@@ -0,0 +1,333 @@
module Feedzirra
  class NoParserAvailable < StandardError; end

  class Feed
    USER_AGENT = "feedzirra http://github.com/pauldix/feedzirra/tree/master"

    # Takes a raw XML feed and attempts to parse it. If no parser is available a Feedzirra::NoParserAvailable exception is raised.
    #
    # === Parameters
    # [xml<String>] The XML that you would like parsed.
    # === Returns
    # An instance of the determined feed type. By default a Feedzirra::Atom, Feedzirra::AtomFeedBurner, Feedzirra::RDF, or Feedzirra::RSS object.
    # === Raises
    # Feedzirra::NoParserAvailable : If no valid parser classes could be found for the feed.
    def self.parse(xml)
      if parser = determine_feed_parser_for_xml(xml)
        parser.parse(xml)
      else
        raise NoParserAvailable.new("No valid parser for XML.")
      end
    end

    # Determines the correct parser class to use for parsing the feed.
    #
    # === Parameters
    # [xml<String>] The XML that you would like to determine the parser for.
    # === Returns
    # The class name of the parser that can handle the XML.
    def self.determine_feed_parser_for_xml(xml)
      start_of_doc = xml.slice(0, 2000)
      feed_classes.detect {|klass| klass.able_to_parse?(start_of_doc)}
    end

    # Adds a new feed parsing class that will be used for parsing.
    #
    # === Parameters
    # [klass<Constant>] The class/constant that you want to register.
    # === Returns
    # An updated array of feed parser class names.
    def self.add_feed_class(klass)
      feed_classes.unshift klass
    end

    # Provides a list of registered feed parsing classes.
    #
    # === Returns
    # An array of class names.
    def self.feed_classes
      @feed_classes ||= [
        Feedzirra::Parser::RSS,
        Feedzirra::Parser::Atom
      ]
    end

    # Makes all entry types look for the passed in element to parse. This is actually just a call to
    # element (a SAXMachine call) in the class.
    #
    # === Parameters
    # [element_tag<String>]
    # [options<Hash>] Valid keys are the same as with SAXMachine.
    def self.add_common_feed_entry_element(element_tag, options = {})
      # need to think of a better way to do this. will break for people who want this behavior
      # across their added classes
      feed_classes.map{|k| eval("#{k}Entry") }.each do |klass|
        klass.send(:element, element_tag, options)
      end
    end

    # Fetches and returns the raw XML for each URL provided.
    #
    # === Parameters
    # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
    # [options<Hash>] Valid keys for this argument are as follows:
    #   :user_agent - String that overrides the default user agent.
    #   :if_modified_since - Time object representing when the feed was last updated.
    #   :if_none_match - String that's normally an etag for the request that was stored previously.
    #   :on_success - Block that gets executed after a successful request.
    #   :on_failure - Block that gets executed after a failed request.
    # === Returns
    # A String of XML if a single URL is passed.
    #
    # A Hash if multiple URLs are passed. The key will be the URL, and the value the XML.
    def self.fetch_raw(urls, options = {})
      url_queue = [*urls]
      multi = Curl::Multi.new
      responses = {}
      url_queue.each do |url|
        easy = Curl::Easy.new(url) do |curl|
          curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
          curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
          curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
          curl.headers["Accept-encoding"] = 'gzip, deflate' if options.has_key?(:compress)
          curl.follow_location = true
          curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)

          curl.max_redirects = options[:max_redirects] if options[:max_redirects]
          curl.timeout = options[:timeout] if options[:timeout]

          curl.on_success do |c|
            c = c.select { |e| e.kind_of? Curl::Easy }.first if c.kind_of?(Array)
            responses[url] = decode_content(c)
          end
          curl.on_failure do |c|
            c = c.select { |e| e.kind_of? Curl::Easy }.first if c.kind_of?(Array)
            responses[url] = c.response_code
          end
        end
        multi.add(easy)
      end

      multi.perform
      urls.is_a?(String) ? responses.values.first : responses
    end

    # Fetches and returns the parsed XML for each URL provided.
    #
    # === Parameters
    # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
    # [options<Hash>] Valid keys for this argument are as follows:
    # * :user_agent - String that overrides the default user agent.
    # * :if_modified_since - Time object representing when the feed was last updated.
    # * :if_none_match - String, an etag for the request that was stored previously.
    # * :on_success - Block that gets executed after a successful request.
    # * :on_failure - Block that gets executed after a failed request.
    # === Returns
    # A Feed object if a single URL is passed.
    #
    # A Hash if multiple URLs are passed. The key will be the URL, and the value the Feed object.
    def self.fetch_and_parse(urls, options = {})
      url_queue = [*urls]
      multi = Curl::Multi.new
      responses = {}

      # I broke these down so I would only try to do 30 simultaneously because
      # I was getting weird errors when doing a lot. As one finishes it pops another off the queue.
      url_queue.slice!(0, 30).each do |url|
        add_url_to_multi(multi, url, url_queue, responses, options)
      end

      multi.perform
      return urls.is_a?(String) ? responses.values.first : responses
    end

    # Decodes the XML document if it was compressed.
    #
    # === Parameters
    # [curl_request<Curl::Easy>] The Curl::Easy response object from the request.
    # === Returns
    # A decoded string of XML.
    def self.decode_content(c)
      if c.header_str.match(/Content-Encoding: gzip/)
        begin
          gz = Zlib::GzipReader.new(StringIO.new(c.body_str))
          xml = gz.read
          gz.close
        rescue Zlib::GzipFile::Error
          # Maybe this is not gzipped?
          xml = c.body_str
        end
      elsif c.header_str.match(/Content-Encoding: deflate/)
        xml = Zlib::Inflate.inflate(c.body_str)
      else
        xml = c.body_str
      end

      xml
    end

    # Updates each feed for each Feed object provided.
    #
    # === Parameters
    # [feeds<Feed> or <Array>] A single feed object, or an array of feed objects.
    # [options<Hash>] Valid keys for this argument are as follows:
    # * :user_agent - String that overrides the default user agent.
    # * :on_success - Block that gets executed after a successful request.
    # * :on_failure - Block that gets executed after a failed request.
    # === Returns
    # An updated Feed object if a single URL is passed.
    #
    # A Hash if multiple Feeds are passed. The key will be the URL, and the value the updated Feed object.
    def self.update(feeds, options = {})
      feed_queue = [*feeds]
      multi = Curl::Multi.new
      responses = {}

      feed_queue.slice!(0, 30).each do |feed|
        add_feed_to_multi(multi, feed, feed_queue, responses, options)
      end

      multi.perform
      responses.size == 1 ? responses.values.first : responses.values
    end

    # An abstraction for adding a feed by URL to the passed Curb::multi stack.
    #
    # === Parameters
    # [multi<Curl::Multi>] The Curl::Multi object that the request should be added to.
    # [url<String>] The URL of the feed that you would like to be fetched.
    # [url_queue<Array>] An array of URLs that are queued for request.
    # [responses<Hash>] Existing responses that you want the response from the request added to.
    # [feeds<String> or <Array>] A single feed object, or an array of feed objects.
    # [options<Hash>] Valid keys for this argument are as follows:
    # * :user_agent - String that overrides the default user agent.
    # * :on_success - Block that gets executed after a successful request.
    # * :on_failure - Block that gets executed after a failed request.
    # === Returns
    # The updated Curl::Multi object with the request details added to its stack.
    def self.add_url_to_multi(multi, url, url_queue, responses, options)
      easy = Curl::Easy.new(url) do |curl|
        curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
        curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
        curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
        curl.headers["Accept-encoding"] = 'gzip, deflate' if options.has_key?(:compress)
        curl.follow_location = true
        curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)

        curl.max_redirects = options[:max_redirects] if options[:max_redirects]
        curl.timeout = options[:timeout] if options[:timeout]

        curl.on_success do |c|
          c = c.select { |e| e.kind_of? Curl::Easy }.first if c.kind_of?(Array)
          add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
          xml = decode_content(c)
          klass = determine_feed_parser_for_xml(xml)

          if klass
            begin
              feed = klass.parse(xml)
              feed.feed_url = c.last_effective_url
              feed.etag = etag_from_header(c.header_str)
              feed.last_modified = last_modified_from_header(c.header_str)
              responses[url] = feed
              options[:on_success].call(url, feed) if options.has_key?(:on_success)
            rescue Exception => e
              options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
            end
          else
            # puts "Error determining parser for #{url} - #{c.last_effective_url}"
            # raise NoParserAvailable.new("no valid parser for content.") (this would unfortunately fail the whole 'multi', so it's not really usable)
            options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
          end
        end

        curl.on_failure do |c|
          c = c.select { |e| e.kind_of? Curl::Easy }.first if c.kind_of?(Array)
          add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
          responses[url] = c.response_code
          options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
        end
      end
      multi.add(easy)
    end

    # An abstraction for adding a feed by a Feed object to the passed Curb::multi stack.
    #
    # === Parameters
    # [multi<Curl::Multi>] The Curl::Multi object that the request should be added to.
    # [feed<Feed>] A feed object that you would like to be fetched.
    # [feed_queue<Array>] An array of feed objects that are queued for request.
    # [responses<Hash>] Existing responses that you want the response from the request added to.
    # [feeds<String> or <Array>] A single feed object, or an array of feed objects.
    # [options<Hash>] Valid keys for this argument are as follows:
    # * :user_agent - String that overrides the default user agent.
    # * :on_success - Block that gets executed after a successful request.
    # * :on_failure - Block that gets executed after a failed request.
    # === Returns
    # The updated Curl::Multi object with the request details added to its stack.
    def self.add_feed_to_multi(multi, feed, feed_queue, responses, options)
      easy = Curl::Easy.new(feed.feed_url) do |curl|
        curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
        curl.headers["If-Modified-Since"] = feed.last_modified.httpdate if feed.last_modified
        curl.headers["If-None-Match"] = feed.etag if feed.etag
        curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
        curl.follow_location = true

        curl.max_redirects = options[:max_redirects] if options[:max_redirects]
        curl.timeout = options[:timeout] if options[:timeout]

        curl.on_success do |c|
          c = c.select { |e| e.kind_of? Curl::Easy }.first if c.kind_of?(Array)
          begin
            add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
            updated_feed = Feed.parse(c.body_str)
            updated_feed.feed_url = c.last_effective_url
            updated_feed.etag = etag_from_header(c.header_str)
            updated_feed.last_modified = last_modified_from_header(c.header_str)
            feed.update_from_feed(updated_feed)
            responses[feed.feed_url] = feed
            options[:on_success].call(feed) if options.has_key?(:on_success)
          rescue Exception => e
            options[:on_failure].call(feed, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
          end
        end

        curl.on_failure do |c|
          c = c.select { |e| e.kind_of? Curl::Easy }.first if c.kind_of?(Array)
          add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
          response_code = c.response_code
          if response_code == 304 # it's not modified. this isn't an error condition
            responses[feed.feed_url] = feed
            options[:on_success].call(feed) if options.has_key?(:on_success)
          else
            responses[feed.feed_url] = c.response_code
            options[:on_failure].call(feed, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
          end
        end
      end
      multi.add(easy)
    end

    # Determines the etag from the request headers.
    #
    # === Parameters
    # [header<String>] Raw request header returned from the request
    # === Returns
    # A string of the etag or nil if it cannot be found in the headers.
    def self.etag_from_header(header)
      header =~ /.*ETag:\s(.*)\r/
      $1
    end

    # Determines the last modified date from the request headers.
    #
    # === Parameters
    # [header<String>] Raw request header returned from the request
    # === Returns
    # A Time object of the last modified date or nil if it cannot be found in the headers.
    def self.last_modified_from_header(header)
      header =~ /.*Last-Modified:\s(.*)\r/
      Time.parse($1) if $1
    end
  end
end
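Usage sketch (illustrative, not part of the gem source): the calls below exercise the Feed class defined above, using the options its RDoc documents; the URLs and file path are placeholders.

require 'feedzirra'

# Parse XML you already have; raises Feedzirra::NoParserAvailable if neither
# the RSS nor the Atom parser claims the document.
feed = Feedzirra::Feed.parse(File.read("some_feed.xml"))

# Fetch and parse several URLs at once. With an Array the result is a Hash
# keyed by URL; failed requests map to the HTTP response code instead.
feeds = Feedzirra::Feed.fetch_and_parse(
  ["http://example.com/a.xml", "http://example.com/b.xml"],
  :user_agent => "my-reader/1.0",
  :on_success => lambda { |url, parsed| puts "#{url}: #{parsed.title}" },
  :on_failure => lambda { |url, code, headers, body| warn "#{url} failed with #{code}" }
)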
data/lib/feedzirra/feed_entry_utilities.rb
ADDED
@@ -0,0 +1,45 @@
module Feedzirra
  module FeedEntryUtilities
    def published
      @published || @updated
    end

    def parse_datetime(string)
      begin
        DateTime.parse(string).feed_utils_to_gm_time
      rescue
        puts "DATE CAN'T BE PARSED: #{string}"
        nil
      end
    end

    ##
    # Returns the id of the entry, or its url if no id is present, as some formats don't support it.
    def id
      @id || @url
    end

    ##
    # Writer for published. By default, we keep the "oldest" publish time found.
    def published=(val)
      parsed = parse_datetime(val)
      @published = parsed if !@published || parsed < @published
    end

    ##
    # Writer for updated. By default, we keep the most recent update time found.
    def updated=(val)
      parsed = parse_datetime(val)
      @updated = parsed if !@updated || parsed > @updated
    end

    def sanitize!
      self.title.sanitize! if self.title
      self.author.sanitize! if self.author
      self.summary.sanitize! if self.summary
      self.content.sanitize! if self.content
    end

    alias_method :last_modified, :published
  end
end
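Illustrative sketch (DemoEntry is not a gem class): FeedEntryUtilities is mixed into the SAX-backed entry classes such as Feedzirra::Parser::RSSEntry; a bare class is enough to show the fallback and "oldest published / newest updated" behaviour defined above.

require 'feedzirra'

class DemoEntry
  include Feedzirra::FeedEntryUtilities
  attr_accessor :url, :title, :author, :summary, :content
end

entry = DemoEntry.new
entry.url = "http://example.com/post/1"
entry.published = "Mon, 05 Jan 2009 12:00:00 GMT"
entry.published = "Tue, 06 Jan 2009 12:00:00 GMT" # ignored: the oldest publish time is kept
entry.updated   = "Wed, 07 Jan 2009 12:00:00 GMT" # kept: the newest update time wins

entry.id        # => "http://example.com/post/1", falling back to url when no id was parsed
entry.published # => Time for 2009-01-05 12:00:00 GMT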
data/lib/feedzirra/feed_utilities.rb
ADDED
@@ -0,0 +1,71 @@
module Feedzirra
  module FeedUtilities
    UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified)

    attr_writer :new_entries, :updated, :last_modified
    attr_accessor :etag

    def last_modified
      @last_modified ||= begin
        entry = entries.reject {|e| e.published.nil? }.sort_by { |entry| entry.published if entry.published }.last
        entry ? entry.published : nil
      end
    end

    def updated?
      @updated
    end

    def new_entries
      @new_entries ||= []
    end

    def has_new_entries?
      new_entries.size > 0
    end

    def update_from_feed(feed)
      self.new_entries += find_new_entries_for(feed)
      self.entries.unshift(*self.new_entries)

      updated! if UPDATABLE_ATTRIBUTES.any? { |name| update_attribute(feed, name) }
    end

    def update_attribute(feed, name)
      old_value, new_value = send(name), feed.send(name)

      if old_value != new_value
        send("#{name}=", new_value)
      end
    end

    def sanitize_entries!
      entries.each {|entry| entry.sanitize!}
    end

    private

    def updated!
      @updated = true
    end

    def find_new_entries_for(feed)
      # this implementation is a hack, which is why it's so ugly.
      # it's to get around the fact that not all feeds have a published date.
      # however, they're always ordered with the newest one first.
      # So we go through the entries just parsed and insert each one as a new entry
      # until we get to one that has the same url as the newest for the feed
      latest_entry = self.entries.first
      found_new_entries = []
      feed.entries.each do |entry|
        break if entry.url == latest_entry.url
        found_new_entries << entry
      end
      found_new_entries
    end

    def existing_entry?(test_entry)
      entries.any? { |entry| entry.url == test_entry.url }
    end
  end
end
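Usage sketch (illustrative; Feedzirra::Feed.update does essentially this internally): refreshing a previously parsed feed through FeedUtilities#update_from_feed; the URL is a placeholder.

require 'feedzirra'

feed = Feedzirra::Feed.fetch_and_parse("http://example.com/feed.xml")

# Later, fetch the same feed again and merge the newer copy into the old object.
newer = Feedzirra::Feed.fetch_and_parse("http://example.com/feed.xml")
feed.update_from_feed(newer)

feed.updated?         # => true if title, url, feed_url or last_modified changed
feed.has_new_entries? # => true if entries were published since the first fetch
feed.new_entries      # => just the entries that update_from_feed prepended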