spix_parser 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ module Config
5
+ ENCODING = "UTF-8"
6
+
7
+ BASE_TIMESTAMP = Time.mktime("1970").utc
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,117 @@
1
+ # encoding: utf-8
2
+ class Time
3
+ def to_db_format(format=:default)
4
+ format == :default ? to_s_default : strftime("%Y-%m-%d %H:%M:%S").strip
5
+ end
6
+ alias_method :to_s_default, :to_s
7
+ alias_method :to_s, :to_db_format unless method_defined?(:to_formatted_s)
8
+
9
+ def self.can_parse?(date)
10
+ begin
11
+ Time.parse(date)
12
+ rescue StandardError => e
13
+ return false
14
+ end
15
+ true
16
+ end
17
+
18
+ # We have a lot of timestamps in Portuguese and Ruby can't parse them
19
+ # So, this function will translate Portuguese date-related terms to English for correct parsing,
20
+ # because we can't afford so many feed entries with wrong timestamps
21
+ def self.translate_for_parsing(date_as_string)
22
+ # First, add leading zero to days of month below 10
23
+ formatted_date = date_as_string.sub(/\A[a-zA-Z]+\,\s{1}(\d)[^\d]/, '0\1 ')
24
+
25
+ day_names = {"Domingo" => "Sunday", "Segunda" => "Monday", "Terça" => "Tuesday", "Quarta" => "Wednesday",
26
+ "Quinta" => "Thursday", "Sexta" => "Friday", "Sábado" => "Saturday", "Sabado" => "Saturday"}
27
+ abbr_day_names = {"Dom" => "Sun", "Seg" => "Mon", "Ter" => "Tue", "Qua" => "Wed",
28
+ "Qui" => "Thu", "Sex" => "Fri", "Sáb" => "Sat", "Sab" => "Sat"}
29
+ month_names = {"Janeiro" => "January", "Fevereiro" => "February", "Março" => "March", "Marco" => "March",
30
+ "Abril" => "April", "Maio" => "May", "Junho" => "June", "Julho" => "July",
31
+ "Agosto" => "August", "Setembro" => "September", "Outubro" => "October",
32
+ "Novembro" => "November", "Dezembro" => "December"}
33
+ abbr_month_names = {"Jan" => "Jan", "Fev" => "Feb", "Abr" => "Apr", "Mai" => "May",
34
+ "Ago" => "Aug", "Set" => "Sep", "Out" => "Oct", "Dez" => "Dec"}
35
+
36
+ day_names.each do |key, value|
37
+ formatted_date.sub!(key, value)
38
+ end
39
+
40
+ abbr_day_names.each do |key, value|
41
+ formatted_date.sub!(key, value)
42
+ end
43
+
44
+ month_names.each do |key, value|
45
+ formatted_date.sub!(key, value)
46
+ end
47
+
48
+ abbr_month_names.each do |key, value|
49
+ formatted_date.sub!(key, value)
50
+ end
51
+
52
+ formatted_date
53
+ end
54
+ end
55
+
56
+ class Object
57
+ def blank?
58
+ respond_to?(:empty?) ? empty? : !self
59
+ end
60
+
61
+ def silence_warnings
62
+ old_verbose, $VERBOSE = $VERBOSE, nil
63
+ yield
64
+ ensure
65
+ $VERBOSE = old_verbose
66
+ end
67
+ end
68
+
69
+ class NilClass
70
+ def blank?
71
+ true
72
+ end
73
+ end
74
+
75
+ class FalseClass
76
+ def blank?
77
+ true
78
+ end
79
+ end
80
+
81
+ class TrueClass
82
+ def blank?
83
+ false
84
+ end
85
+ end
86
+
87
+ class Array
88
+ alias_method :blank?, :empty?
89
+ end
90
+
91
+ class Hash
92
+ alias_method :blank?, :empty?
93
+ end
94
+
95
+ class Numeric
96
+ def blank?
97
+ false
98
+ end
99
+ end
100
+
101
+ class String
102
+ def to_sha1
103
+ Digest::SHA1.hexdigest "--17f7e62310d5a2bbb9bfc535b95134ece1cb474d--#{self}"
104
+ end
105
+
106
+ def blank?
107
+ self !~ /\S/
108
+ end
109
+
110
+ def busk_normalize
111
+ if RUBY_VERSION >= '1.9'
112
+ self.force_encoding(Spix::Parser::Config::ENCODING)
113
+ else
114
+ self
115
+ end
116
+ end
117
+ end
@@ -0,0 +1,36 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class Atom
5
+ include SAXMachine
6
+ include Feedzirra::FeedUtilities
7
+
8
+ element :title, :as => :feed_title
9
+ element :subtitle, :as => :feed_subtitle
10
+ element :language, :as => :feed_language
11
+ element :updated, :as => :last_modified
12
+ element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
13
+ element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
14
+ elements :link, :as => :links, :value => :href
15
+ elements :entry, :as => :feed_entries, :class => Spix::Parser::AtomEntry
16
+
17
+ alias_method :entries, :feed_entries
18
+
19
+ def self.able_to_parse?(xml) #:nodoc:
20
+ (xml =~ /application\/atom\+xml|(#{Regexp.escape("http://www.w3.org/2005/Atom")})|(#{Regexp.escape("http://purl.org/atom")})/) && (xml =~ /\<feed\s/)
21
+ end
22
+
23
+ def url
24
+ @url || links.last
25
+ end
26
+
27
+ def feed_url
28
+ @feed_url || links.first
29
+ end
30
+
31
+ def last_modified
32
+ @last_modified.present? ? @last_modified : super
33
+ end
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,28 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class AtomEntry
5
+ include SAXMachine
6
+ include Feedzirra::FeedEntryUtilities
7
+
8
+ element :title, :as => :entry_title
9
+ element :link, :as => :entry_url, :value => :href, :with => {:rel => "alternate"}
10
+ element :name, :as => :entry_author
11
+ element :content, :as => :entry_content
12
+ element :summary, :as => :entry_summary
13
+ element :published
14
+ element :id
15
+ element :created, :as => :published
16
+ element :issued, :as => :published
17
+ element :updated
18
+ element :modified, :as => :updated
19
+ elements :category, :as => :entry_categories, :value => :term
20
+
21
+ elements :enclosure, :as => :entry_enclosures, :class => Spix::Parser::Enclosure
22
+
23
+ element :"media:content", :as => :media_content, :value => :url
24
+ element :"media:description", :as => :media_description
25
+ element :"media:thumbnail", :as => :media_thumbnail, :value => :url
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,13 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class Enclosure
5
+ include SAXMachine
6
+ include Spix::Parser::EnclosureInterface
7
+
8
+ element :enclosure, :value => :length, :as => :enclosure_length
9
+ element :enclosure, :value => :type, :as => :enclosure_type
10
+ element :enclosure, :value => :url, :as => :enclosure_url
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,28 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class RSS
5
+ include SAXMachine
6
+ include Feedzirra::FeedUtilities
7
+
8
+ element :title, :as => :feed_title
9
+ element :description, :as => :feed_subtitle
10
+ element :language, :as => :feed_language
11
+ element :link, :as => :url
12
+ element :pubDate, :as => :last_modified
13
+ elements :item, :as => :feed_entries, :class => Spix::Parser::RSSEntry
14
+
15
+ alias_method :entries, :feed_entries
16
+
17
+ attr_accessor :feed_url
18
+
19
+ def self.able_to_parse?(xml) #:nodoc:
20
+ (xml =~ /\<rss|rdf/) && (xml =~ /\<channel/)
21
+ end
22
+
23
+ def last_modified
24
+ @last_modified.present? ? @last_modified : super
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,36 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class RSSEntry
5
+ include SAXMachine
6
+ include Feedzirra::FeedEntryUtilities
7
+
8
+ element :title, :as => :entry_title
9
+ element :link, :as => :entry_url
10
+
11
+ element :author, :as => :entry_author
12
+ element :"dc:creator", :as => :entry_author
13
+
14
+ element :"content:encoded", :as => :entry_content
15
+ element :description, :as => :entry_summary
16
+ element :summary, :as => :entry_summary
17
+
18
+ element :pubDate, :as => :published
19
+ element :"dc:date", :as => :published
20
+ element :"dc:Date", :as => :published
21
+ element :"dcterms:created", :as => :published
22
+
23
+ element :"dcterms:modified", :as => :updated
24
+ element :issued, :as => :published
25
+ elements :category, :as => :entry_categories
26
+
27
+ element :guid, :as => :id
28
+
29
+ elements :enclosure, :as => :entry_enclosures, :class => Spix::Parser::Enclosure
30
+
31
+ element :"media:content", :as => :media_content, :value => :url
32
+ element :"media:description", :as => :media_description
33
+ element :"media:thumbnail", :as => :media_thumbnail, :value => :url
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,34 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ module DateTimeUtilities
5
+ def build_datetime_object(timestamp)
6
+ timestamp = normalize_timestamp(timestamp)
7
+
8
+ if Time.can_parse?(timestamp)
9
+ #if the timestamp is a non-date string, it will be Time.mktime("1970").utc
10
+ timestamp = Time.parse(timestamp, Spix::Parser::Config::BASE_TIMESTAMP).utc
11
+
12
+ # non-English dates are sometimes parsed to "future" dates by Ruby
13
+ # we also cover the case where the timestamp is Time.mktime("1970").utc as explained above
14
+ if (timestamp > Time.now.utc) || (timestamp == Spix::Parser::Config::BASE_TIMESTAMP)
15
+ timestamp = nil
16
+ end
17
+ else
18
+ timestamp = nil
19
+ end
20
+
21
+ timestamp
22
+ end
23
+
24
+ private
25
+ def normalize_timestamp(timestamp)
26
+ # In Ruby 1.9 the date is returned as String
27
+ # In Ruby 1.8 it is returned as Time
28
+ timestamp_string = timestamp.to_s
29
+ Time.translate_for_parsing(timestamp_string.busk_normalize)
30
+ end
31
+ end
32
+ end
33
+ end
34
+
@@ -0,0 +1,124 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ def self.parse(url, options = {})
5
+ feed = case options.delete(:mode)
6
+ when :local
7
+ Feedzirra::Feed.parse(url)
8
+ when :file
9
+ Feedzirra::Feed.parse(File.read(url))
10
+ else
11
+ Feedzirra::Feed.fetch_and_parse(url, options)
12
+ end
13
+
14
+ # Feedzirra has some issues with failure cases:
15
+ # If the failure occurs on the parsing phase, then the on_failure callback is triggered
16
+ # If the failure occurs on the fetching phase (i.e., a network error), then a number is returned
17
+ # That number may represent an HTTP status code or be 0 in case of other errors.
18
+ # Also, we can't raise an exception on the on_failure callback, because it will be raised even on success - that's really odd
19
+ # So we need this 'safety net' here until we patch it to use a uniform error architecture
20
+ if feed.nil? || (feed.is_a?(Fixnum) && feed == 0)
21
+ Log.error("The parser couldn't fetch the feed at #{url}")
22
+ return nil
23
+ elsif feed.is_a?(Fixnum)
24
+ feed
25
+ else
26
+ Spix::Parser::Feed.new(feed)
27
+ end
28
+ end
29
+ end
30
+
31
+ module Utils
32
+ extend self
33
+
34
+ def format_links(options)
35
+ text = options[:text]
36
+ site_url = options[:site_url]
37
+
38
+ parse_links(text)
39
+ parse_images(text, site_url)
40
+
41
+ text
42
+ end
43
+
44
+ private
45
+
46
+ def join_attributes(attrs)
47
+ attrs.map do |attr, value|
48
+ %Q[#{attr}="#{value.to_s.gsub(/"/, "&quot;")}"] unless value.blank?
49
+ end.compact.join(" ")
50
+ end
51
+
52
+ def parse_attrs(str)
53
+ attrs = {}
54
+ return attrs unless str || str.respond_to?(:scan)
55
+
56
+ match_by_spaces = str !~ /'|"/
57
+ if match_by_spaces
58
+ # Make sure to match the last html attribute.
59
+ str += " "
60
+ value_regexp = /\s*(.*?)\s/
61
+ else
62
+ value_regexp = /\s*["'](.*?)["']/
63
+ end
64
+ attribute_regexp = /\b([a-zA-Z0-9:]+)\s*/
65
+
66
+ str.scan(/#{attribute_regexp}=#{value_regexp}/im) do
67
+ attrs[$1.to_s.downcase] = $2
68
+ end
69
+
70
+ attrs
71
+ end
72
+
73
+ def parse_links(text)
74
+ text.gsub!(/(<a\s+([^>]+)>)/uim) do |match|
75
+ attrs = parse_attrs($2.to_s)
76
+
77
+ # just parse these attributes
78
+ attrs = {
79
+ :href => attrs["href"],
80
+ :title => attrs["title"],
81
+ :target => "_blank",
82
+ :rel => "external nofollow"
83
+ }
84
+
85
+ "<a #{join_attributes(attrs)}>"
86
+ end
87
+ end
88
+
89
+ def parse_images(text, site_url)
90
+ text.gsub!(/(<img(.*?)\/?>)/uim) do |match|
91
+ attrs = parse_attrs($2.to_s)
92
+
93
+ # just parse these attributes
94
+ attrs = {
95
+ :src => parse_relative_image_source(attrs["src"], site_url),
96
+ :alt => attrs["alt"],
97
+ :title => attrs["title"],
98
+ :style => attrs["style"],
99
+ :width => attrs["width"],
100
+ :height => attrs["height"]
101
+ }
102
+
103
+ "<img #{join_attributes(attrs)} />" if attrs[:src].present?
104
+ end
105
+ end
106
+
107
+ def parse_relative_image_source(src, site_url)
108
+ if src.present? && site_url
109
+ begin
110
+ src = URI.parse(src)
111
+ src = URI.parse(site_url).merge(src) if src.relative?
112
+ rescue URI::InvalidURIError
113
+ # Manually concatenating if it is "relative uri", stripping slashes.
114
+ if src !~ /\A(https?|ftp):\/\//
115
+ site_url = site_url[0..-2] if site_url[-1] == ?/
116
+ src = src[1..-1] if src[0] == ?/
117
+ src = "#{site_url}/#{src}"
118
+ end
119
+ end
120
+ end
121
+ src
122
+ end
123
+ end
124
+ end
@@ -0,0 +1,94 @@
1
+ gem "feedzirra", ">=0.0.24"
2
+ require "feedzirra"
3
+ require "nokogiri"
4
+ require "uri"
5
+ require "open-uri"
6
+
7
+ module Spix
8
+ class FeedDiscovery
9
+
10
+ # HTTP "User-Agent" header to send to servers when downloading feeds.
11
+ USER_AGENT = "SpixParser"
12
+
13
+ def self.feed?(uri)
14
+ Spix::Parser.parse(uri, :mode => :fetch) ? true : false
15
+ end
16
+
17
+ def self.list(uri)
18
+
19
+ content = self.read(uri)
20
+
21
+ doc = Nokogiri::HTML(content)
22
+
23
+ # get page title
24
+ title = doc.search('title')[0].content
25
+
26
+ items = doc.search("//link[@type='application/atom+xml']", "//link[@type='application/rss+xml']").collect do |link|
27
+ url_object = URI::parse(uri).normalize
28
+
29
+ href = link.get_attribute(:href).to_s
30
+
31
+ feed_url_object = URI::parse(href)
32
+
33
+ if feed_url_object.relative?
34
+
35
+ # there's 2 types of relative URIs
36
+ # the ones based on a path (base: http://sitewithfeed.com/foo/, relative: feed.xml, feed: http://sitewithfeed.com/foo/feed.xml)
37
+ # and the ones based on the top domain (base: http://sitewithfeed.com/foo/, relative: /feed.xml, feed: http://sitewithfeed.com/feed.xml)
38
+ if feed_url_object.path.match(/^\//)
39
+ # when the feed_url_object is relative and starts with a "/" we should ignore the domain path
40
+ path = nil
41
+ else
42
+ # when the feed_url_object is relative and does not start with a "/" we should use the domain path
43
+
44
+ if url_object.path.match(/\/$/)
45
+ # when the url_object ends with a "/" we should use it
46
+ path = url_object.path
47
+ else
48
+ # when the url_object does not end with a "/" we should add it
49
+ path = url_object.path + "/"
50
+ end
51
+ end
52
+
53
+ href = "#{url_object.scheme}://" +
54
+ "#{url_object.host}" +
55
+ "#{path}" +
56
+ "#{url_object.query}" +
57
+ href
58
+ end
59
+
60
+ item = {
61
+ :title => link.get_attribute(:title) || title,
62
+ :url => href
63
+ }
64
+
65
+ end
66
+
67
+ if items.size == 0
68
+ # if there's no item found at the given URI, maybe it's a feed URI
69
+ if self.feed?(uri)
70
+ items = [
71
+ {
72
+ :title => title,
73
+ :url => uri
74
+ }
75
+ ]
76
+ end
77
+ end
78
+
79
+ items
80
+ rescue
81
+ nil
82
+ end
83
+
84
+ def self.read(uri)
85
+ if uri.respond_to?(:read)
86
+ content = uri.read
87
+ else
88
+ req_headers = {}
89
+ req_headers["User-Agent"] = USER_AGENT
90
+ content = open(uri, req_headers).read
91
+ end
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,31 @@
1
+ require 'net/http'
2
+ require 'thread'
3
+
4
+ class RedirectFollower
5
+ def self.resolve(url)
6
+ @response = ""
7
+ begin
8
+ timeout(5) do
9
+ t = Thread.new {@response = Net::HTTP.get_response(URI.parse(url)) }
10
+ t.join
11
+
12
+ if @response.kind_of?(Net::HTTPRedirection)
13
+ return redirect_url(@response)
14
+ end
15
+ end
16
+ rescue Timeout::Error, URI::InvalidURIError
17
+ return url
18
+ end
19
+
20
+ url
21
+ end
22
+
23
+ protected
24
+ def self.redirect_url(response)
25
+ if response['location'].nil?
26
+ response.body.match(/<a href=\"([^>]+)\">/i)[1]
27
+ else
28
+ response['location']
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,18 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ module Version
5
+ MAJOR = 1
6
+ MINOR = 5
7
+ TINY = 2
8
+
9
+ def self.current_version
10
+ "#{MAJOR}.#{MINOR}.#{TINY}"
11
+ end
12
+
13
+ def self.date
14
+ "2011-04-25"
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,18 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ module EnclosureInterface
5
+ def url
6
+ enclosure_url
7
+ end
8
+
9
+ def mime_type
10
+ enclosure_type
11
+ end
12
+
13
+ def length
14
+ enclosure_length
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,87 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class FeedEntry
5
+ include Spix::Parser::DateTimeUtilities
6
+ include Memoizable
7
+
8
+ def initialize(entry, feed)
9
+ @feed = feed
10
+ @entry = entry
11
+ end
12
+
13
+ def title
14
+ text = @entry.entry_title || "(title unknow)"
15
+ text = text.busk_normalize
16
+ Sanitizer.sanitize(text)
17
+ end
18
+ memoize(:title)
19
+
20
+ def summary
21
+ text = @entry.entry_summary || ""
22
+ text = text.busk_normalize
23
+ Sanitizer.strip_comments(text)
24
+ Sanitizer.strip_disallowed_tags(text)
25
+ Sanitizer.entities_to_chars(text)
26
+ end
27
+ memoize(:summary)
28
+
29
+ def url
30
+ entry_url = @entry.entry_url || @feed.site_url
31
+ RedirectFollower.resolve(entry_url).busk_normalize if entry_url.present?
32
+ end
33
+ memoize(:url)
34
+
35
+ def author
36
+ text = @entry.entry_author || ""
37
+ Sanitizer.sanitize(text.busk_normalize)
38
+ end
39
+
40
+ def published_at
41
+ build_datetime_object(@entry.published) if @entry.published
42
+ end
43
+
44
+ def updated_at
45
+ build_datetime_object(@entry.updated) if @entry.updated
46
+ end
47
+
48
+ def uid
49
+ uid = self.url || ""
50
+ uid += self.title.downcase.busk_normalize
51
+ uid += self.striped_content.downcase.busk_normalize[0..25]
52
+ uid.to_sha1
53
+ end
54
+ memoize(:uid)
55
+
56
+ def content
57
+ text = encoded_raw_content
58
+ Sanitizer.strip_comments(text)
59
+ Sanitizer.strip_disallowed_tags(text)
60
+ Sanitizer.entities_to_chars(text)
61
+ end
62
+ memoize(:content)
63
+
64
+ def striped_content
65
+ text = encoded_raw_content
66
+ Sanitizer.strip_tags(text)
67
+ end
68
+ memoize(:striped_content)
69
+
70
+ def categories
71
+ @entry.entry_categories.map do |category|
72
+ Sanitizer.sanitize(category.busk_normalize)
73
+ end
74
+ end
75
+
76
+ def enclosures
77
+ @entry.entry_enclosures
78
+ end
79
+
80
+ private
81
+ def encoded_raw_content
82
+ text = @entry.entry_content || @entry.entry_summary || ""
83
+ text.busk_normalize
84
+ end
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,81 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class Feed
5
+ include Spix::Parser::DateTimeUtilities
6
+
7
+ def initialize(parsed_feed)
8
+ @feed = parsed_feed
9
+ verify_entries_timestamps
10
+ end
11
+
12
+ def title
13
+ text = @feed.feed_title || "(title unknow)"
14
+ text = text.busk_normalize
15
+ Sanitizer.sanitize(text)
16
+ end
17
+
18
+ def subtitle
19
+ text = @feed.feed_subtitle || ""
20
+ text = text.busk_normalize
21
+ Sanitizer.sanitize(text)
22
+ end
23
+
24
+ def language
25
+ text = @feed.feed_language || "en"
26
+ text = text.busk_normalize
27
+ Sanitizer.sanitize(text)
28
+ end
29
+
30
+ def site_url
31
+ @feed.url || extract_site_from_feed_url
32
+ end
33
+
34
+ def feed_url
35
+ @feed.feed_url
36
+ end
37
+
38
+ def uid
39
+ @feed.feed_url.to_sha1
40
+ end
41
+
42
+ def updated_at
43
+ timestamp = @feed.last_modified || @feed.feed_entries.first.published_at || Time.now.utc
44
+ build_datetime_object(timestamp)
45
+ end
46
+
47
+ def feed_items
48
+ # If the feed is not valid, the feed_entries accessor does not exist
49
+ if @feed.respond_to?(:feed_entries) && @feed.feed_entries.present?
50
+ @feed.feed_entries.map{|entry| Spix::Parser::FeedEntry.new(entry, self)}
51
+ else
52
+ []
53
+ end
54
+ end
55
+
56
+ private
57
+ def verify_entries_timestamps
58
+ # Some feeds return the timestamps of all entries as the timestamp of the request
59
+ # This means that the timestamp will change every time we parse the feed, thus duplicating entries
60
+ # One way to detect that is to verify if all the entries have the same timestamp
61
+ items_with_same_timestamp = feed_items.map{|i| i.published_at}.uniq.size == 1
62
+ more_than_one_item = feed_items.count > 1
63
+
64
+ if items_with_same_timestamp && more_than_one_item
65
+ @feed.feed_entries.each {|item| item.published = Spix::Parser::Config::BASE_TIMESTAMP.to_s}
66
+ end
67
+ end
68
+
69
+ def extract_site_from_feed_url
70
+ # Occasionally, we run into a feed that for some reason does not include
71
+ # the publisher website. In those cases, we try to guess the website
72
+ # root path looking at the feed_url. It may fail also, so be mindful.
73
+ return unless @feed.feed_url.present?
74
+ feed_host = URI.parse(@feed.feed_url).host
75
+
76
+ "http://#{feed_host}"
77
+ end
78
+
79
+ end
80
+ end
81
+ end
@@ -0,0 +1,7 @@
1
+ # encoding: utf-8
2
+ module Spix
3
+ module Parser
4
+ class ParsingError < StandardError
5
+ end
6
+ end
7
+ end
@@ -0,0 +1,46 @@
1
+ # encoding: utf-8
2
+ require "rubygems"
3
+ require "feedzirra"
4
+ require "digest/sha1"
5
+ require "zlib"
6
+ require "logger"
7
+ require "cgi"
8
+ require "memoizable"
9
+
10
+ $:.unshift(File.dirname(__FILE__) + '/../../lib')
11
+
12
+ require "spix_parser/version"
13
+ require "spix_parser/core_ext"
14
+ require "spix_parser/config"
15
+ require "spix_parser/parser"
16
+ require "spix_parser/datetime"
17
+
18
+ require "spix_parser/tools/redirect_follower"
19
+
20
+ require "spix_parser/wrappers/entry"
21
+ require "spix_parser/wrappers/enclosure_interface"
22
+ require "spix_parser/wrappers/feed"
23
+ require "spix_parser/wrappers/parsing_error"
24
+
25
+ require "spix_parser/custom_parsers/enclosure"
26
+ require "spix_parser/custom_parsers/atom_entry"
27
+ require "spix_parser/custom_parsers/atom"
28
+ require "spix_parser/custom_parsers/rss_entry"
29
+ require "spix_parser/custom_parsers/rss"
30
+
31
+ require "spix_parser/tools/feed_discovery"
32
+
33
+ if RUBY_VERSION < '1.9'
34
+ $KCODE='u'
35
+ else
36
+ Encoding.default_internal = Encoding::UTF_8
37
+ Encoding.default_external = Encoding::UTF_8
38
+ end
39
+
40
+ Feedzirra::Feed.add_feed_class(Spix::Parser::RSS)
41
+ Feedzirra::Feed.add_feed_class(Spix::Parser::Atom)
42
+
43
+ # Start the log over whenever the log exceeds 100 megabytes in size.
44
+ Log = Logger.new('/var/log/spix/spix_parser.log', 0, 100 * 1024 * 1024)
45
+ Log.level = Logger::ERROR
46
+ Log.datetime_format = "%d-%m-%Y %H:%M:%S"
@@ -0,0 +1,6 @@
1
+ require 'spec_helper'
2
+
3
+ describe Spix::Parser, 'parsing wellformed feeds' do
4
+ run_tests :wellformed
5
+ end
6
+
@@ -0,0 +1,42 @@
1
+ # encoding: utf-8
2
+ require 'spec_helper'
3
+
4
+ describe Spix::Parser do
5
+ describe "atom parsing" do
6
+ it 'should parse from a file path' do
7
+ feed = Spix::Parser.parse(fixture('feed.atom'), :mode => :file)
8
+ feed.should_not be_nil
9
+ feed.feed_items.should have(1).item
10
+ end
11
+
12
+ it 'should parse from a file' do
13
+ feed = Spix::Parser.parse(load_fixture('feed.atom'), :mode => :local)
14
+ feed.should_not be_nil
15
+ feed.feed_items.should have(1).item
16
+ end
17
+ end
18
+
19
+ describe "rss parsing" do
20
+ it 'should parse from a file path' do
21
+ feed = Spix::Parser.parse(fixture('feed.rss'), :mode => :file)
22
+ feed.should_not be_nil
23
+ feed.feed_items.should have(9).item
24
+ end
25
+
26
+ it 'should parse from a file' do
27
+ feed = Spix::Parser.parse(load_fixture('feed.rss'), :mode => :local)
28
+ feed.should_not be_nil
29
+ feed.feed_items.should have(9).item
30
+ end
31
+
32
+ it 'should parse a feed from meioemensagem.com' do
33
+ url = 'http://www.meioemensagem.com.br/home/rss/geral.xml'
34
+ feed = Spix::Parser.parse(load_fixture('meioemensagem.xml'), :mode => :local)
35
+
36
+ feed.should_not be_nil
37
+ feed.title.should == "RSS: Not&Atilde;&shy;cias Gerais"
38
+ feed.feed_items[0].title.should == "Cielo volta &Atilde;&nbsp; m&Atilde;&shy;dia com o cantor Fiuk"
39
+ end
40
+ end
41
+
42
+ end
@@ -0,0 +1,72 @@
1
+ require 'spec_helper'
2
+
3
+ describe Spix::FeedDiscovery, "#list" do
4
+
5
+ before(:all) do
6
+ @domain_url = "http://sitewithfeed.com"
7
+ end
8
+
9
+ describe "when the feed have an absolute URI" do
10
+ it "should return the feed url" do
11
+ FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("absolute_uri.html"))
12
+ Spix::FeedDiscovery.list(@domain_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
13
+ end
14
+ end
15
+
16
+ describe "when the feed have a relative URI" do
17
+ describe "which is relative to a path" do
18
+ it "should return the feed url when the URI is at the top domain" do
19
+ FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("relative_uri.html"))
20
+ Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/" + "html4-002.xml"
21
+ end
22
+
23
+ it "should return the feed url when the URI is inside a path" do
24
+ @path_url = "/foo/bar"
25
+ @feed_url = @domain_url + @path_url
26
+
27
+ FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("relative_uri.html"))
28
+ Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url + "/" + "html4-002.xml"
29
+ end
30
+ end
31
+
32
+ describe "which is relative to the top domain" do
33
+ it "should return the feed url when the URI is at the top domain" do
34
+ FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("relative_uri_top_domain.html"))
35
+ Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
36
+ end
37
+
38
+ it "should return the feed url when the URI is inside a path" do
39
+ @path_url = "/foo/bar"
40
+ @feed_url = @domain_url + @path_url
41
+
42
+ FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("relative_uri_top_domain.html"))
43
+ Spix::FeedDiscovery.list(@feed_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
44
+ end
45
+ end
46
+ end
47
+
48
+ describe "when the URI is a feed" do
49
+ before(:all) do
50
+ @path_url = "/feed.xml"
51
+ @feed_url = @domain_url + @path_url
52
+ end
53
+
54
+ it "should return the extracted url when there's a link at the feed" do
55
+ FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("feed_with_self_link.xml"))
56
+ Spix::FeedDiscovery.list(@feed_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
57
+ end
58
+
59
+ it "should return the same url when there's no link at the feed" do
60
+ fixture = load_fixture("feed_without_self_link.xml")
61
+
62
+ FakeWeb.register_uri(:get, @feed_url, :body => fixture)
63
+
64
+ # feedzirra doesn't work with fakeweb
65
+ feed_xml = fixture
66
+ feed = Feedzirra::Feed.parse(feed_xml)
67
+ Feedzirra::Feed.stub!(:fetch_and_parse).and_return(feed)
68
+
69
+ Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,182 @@
1
+ # encoding: utf-8
2
+ require 'spec_helper'
3
+
4
+ describe Spix::Utils do
5
+ describe ".format_links" do
6
+ context "html containing links" do
7
+ it "parsers links in the given html string adding rel and target" do
8
+ input_html = %q[<div><a href="foo/bar.html" title="FooBar!">FooBar!</a></div>]
9
+
10
+ Spix::Utils.format_links(:text => input_html).should ==
11
+ %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
12
+ end
13
+
14
+ it "parses links removing other html attributes" do
15
+ input_html = %q[<div><a href="foo/bar.html" title="FooBar!" style="color: red" invalid="test">FooBar!</a></div>]
16
+
17
+ Spix::Utils.format_links(:text => input_html).should ==
18
+ %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
19
+ end
20
+
21
+ it "parses links with simple quotes" do
22
+ input_html = %q[<div><a href='foo/bar.html' title='FooBar!'>FooBar!</a></div>]
23
+
24
+ Spix::Utils.format_links(:text => input_html).should ==
25
+ %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
26
+ end
27
+
28
+ # TODO: should we strip these extra &quot; ?
29
+ it "parses links with html escaped quote (&quot;)" do
30
+ input_html = %q[<div><a href=&quot;foo/bar.html&quot; title=&quot;FooBar!&quot;>FooBar!</a></div>]
31
+
32
+ Spix::Utils.format_links(:text => input_html).should ==
33
+ %q[<div><a href="&quot;foo/bar.html&quot;" title="&quot;FooBar!&quot;" target="_blank" rel="external nofollow">FooBar!</a></div>]
34
+ end
35
+
36
+ it "parses links with html attributes without quotes, based on spaces" do
37
+ input_html = %q[<div><a href=foo/bar.html title=FooBar!>FooBar!</a></div>]
38
+
39
+ Spix::Utils.format_links(:text => input_html).should ==
40
+ %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
41
+ end
42
+
43
+ it "parses links with html attributes having spaces before or after the equal sign" do
44
+ input_html = %q[<div><a href = foo/bar.html title = FooBar!>FooBar!</a></div>]
45
+
46
+ Spix::Utils.format_links(:text => input_html).should ==
47
+ %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
48
+ end
49
+
50
+ it "parses links downcasing attribute names" do
51
+ input_html = %q[<div><a HREF="foo/bar.html" TITLE="FooBar!">FooBar!</a></div>]
52
+
53
+ Spix::Utils.format_links(:text => input_html).should ==
54
+ %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
55
+ end
56
+
57
+ it "parses links ignoring blank attributes" do
58
+ input_html = %q[<div><a href="foo/bar.html" title="">FooBar!</a></div>]
59
+
60
+ Spix::Utils.format_links(:text => input_html).should ==
61
+ %q[<div><a href="foo/bar.html" target="_blank" rel="external nofollow">FooBar!</a></div>]
62
+ end
63
+ end
64
+
65
+ context "html containing images" do
66
+ it "parsers images in the given html string matching default attributes (src, style, alt, title, width and height)" do
67
+ input_html = %q[<div><img src="images/bar.jpg" title="FooBar!" alt="FooBar!" width="100" height="200" /></div>]
68
+
69
+ Spix::Utils.format_links(:text => input_html).should ==
70
+ %q[<div><img src="images/bar.jpg" alt="FooBar!" title="FooBar!" width="100" height="200" /></div>]
71
+ end
72
+
73
+ it "parses image tags removing other invalid html attributes" do
74
+ input_html = %q[<div><img src="images/bar.jpg" alt="FooBar!" style="color:red" invalid="test" target="_blank" /></div>]
75
+
76
+ Spix::Utils.format_links(:text => input_html).should ==
77
+ %q[<div><img src="images/bar.jpg" alt="FooBar!" style="color:red" /></div>]
78
+ end
79
+
80
+ it "parses image tags appending the given site url to relative images" do
81
+ input_html = %q[<div><img src="images/bar.jpg" /></div>]
82
+
83
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
84
+ %q[<div><img src="http://example.com/images/bar.jpg" /></div>]
85
+ end
86
+
87
+ it "parses image tags having relative sources with invalid URI, appending the site url" do
88
+ input_html = %q[<div><img src="images/radiação.jpg" /></div>]
89
+
90
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
91
+ %q[<div><img src="http://example.com/images/radiação.jpg" /></div>]
92
+ end
93
+
94
+ it "parses image tags having relative sources starting with / and with invalid URI, appending the site url" do
95
+ input_html = %q[<div><img src="/images/radiação.jpg" /></div>]
96
+
97
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
98
+ %q[<div><img src="http://example.com/images/radiação.jpg" /></div>]
99
+ end
100
+
101
+ it "parses image tags having relative sources starting with / and with invalid URI, appending the site url also ending with /" do
102
+ input_html = %q[<div><img src="/images/radiação.jpg" /></div>]
103
+
104
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com/").should ==
105
+ %q[<div><img src="http://example.com/images/radiação.jpg" /></div>]
106
+ end
107
+
108
+ %w(http https ftp).each do |scheme|
109
+ it "parses image tags having absolute sources with #{scheme} and invalid URI" do
110
+ input_html = %Q[<div><img src="#{scheme}://example.com/images/radiação.jpg" /></div>]
111
+
112
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
113
+ %Q[<div><img src="#{scheme}://example.com/images/radiação.jpg" /></div>]
114
+ end
115
+ end
116
+
117
+ it "parses image tags having sources with spaces but using quotes" do
118
+ input_html = %q[<div><img src="images/foo bar.jpg" /></div>]
119
+
120
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
121
+ %q[<div><img src="http://example.com/images/foo bar.jpg" /></div>]
122
+ end
123
+
124
+ it "parses image tags having style attributes with spaces" do
125
+ input_html = %q[<div><img src="images/foobar.jpg" style="color: blue;" /></div>]
126
+
127
+ Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
128
+ %q[<div><img src="http://example.com/images/foobar.jpg" style="color: blue;" /></div>]
129
+ end
130
+
131
+ it "parses image tags ignoring images with empty sources" do
132
+ input_html = %q[<div><img src="" title="FooBar!" /></div>]
133
+
134
+ Spix::Utils.format_links(:text => input_html).should ==
135
+ %q[<div></div>]
136
+ end
137
+
138
+ it "parses image tags with simple quotes" do
139
+ input_html = %q[<div><img src='images/bar.jpg' title='FooBar!' /></div>]
140
+
141
+ Spix::Utils.format_links(:text => input_html).should ==
142
+ %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
143
+ end
144
+
145
+ # TODO: should we strip these extra &quot; ?
146
+ it "parses image tags with html escaped quote (&quot;)" do
147
+ input_html = %q[<div><img src=&quot;images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]
148
+
149
+ Spix::Utils.format_links(:text => input_html).should ==
150
+ %q[<div><img src="&quot;images/bar.jpg&quot;" title="&quot;FooBar!&quot;" /></div>]
151
+ end
152
+
153
+ it "parses image tags with html attributes without quotes, based on spaces" do
154
+ input_html = %q[<div><img src=images/bar.jpg title=FooBar! /></div>]
155
+
156
+ Spix::Utils.format_links(:text => input_html).should ==
157
+ %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
158
+ end
159
+
160
+ it "parses image tags with html attributes having spaces before or after the equal sign" do
161
+ input_html = %q[<div><img src = images/bar.jpg title = FooBar! /></div>]
162
+
163
+ Spix::Utils.format_links(:text => input_html).should ==
164
+ %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
165
+ end
166
+
167
+ it "parses image tags downcasing attribute names" do
168
+ input_html = %q[<div><img SRC="images/bar.jpg" TITLE="FooBar!" /></div>]
169
+
170
+ Spix::Utils.format_links(:text => input_html).should ==
171
+ %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
172
+ end
173
+
174
+ it "parses image tags ignoring empty attributes" do
175
+ input_html = %q[<div><img src="images/bar.jpg" title="" /></div>]
176
+
177
+ Spix::Utils.format_links(:text => input_html).should ==
178
+ %q[<div><img src="images/bar.jpg" /></div>]
179
+ end
180
+ end
181
+ end
182
+ end
metadata ADDED
@@ -0,0 +1,184 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spix_parser
3
+ version: !ruby/object:Gem::Version
4
+ hash: 7
5
+ prerelease:
6
+ segments:
7
+ - 1
8
+ - 5
9
+ - 2
10
+ version: 1.5.2
11
+ platform: ruby
12
+ authors:
13
+ - Marcelo Eden
14
+ - Fabio Mont'Alegre
15
+ - "Lucas H\xC3\xBAngaro"
16
+ - Luiz Rocha
17
+ autorequire:
18
+ bindir: bin
19
+ cert_chain: []
20
+
21
+ date: 2011-05-12 00:00:00 -03:00
22
+ default_executable:
23
+ dependencies:
24
+ - !ruby/object:Gem::Dependency
25
+ name: feedzirra
26
+ prerelease: false
27
+ requirement: &id001 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ hash: 47
33
+ segments:
34
+ - 0
35
+ - 0
36
+ - 24
37
+ version: 0.0.24
38
+ type: :runtime
39
+ version_requirements: *id001
40
+ - !ruby/object:Gem::Dependency
41
+ name: memoizable
42
+ prerelease: false
43
+ requirement: &id002 !ruby/object:Gem::Requirement
44
+ none: false
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ hash: 27
49
+ segments:
50
+ - 0
51
+ - 1
52
+ - 0
53
+ version: 0.1.0
54
+ type: :runtime
55
+ version_requirements: *id002
56
+ - !ruby/object:Gem::Dependency
57
+ name: sanitizer
58
+ prerelease: false
59
+ requirement: &id003 !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ hash: 21
65
+ segments:
66
+ - 0
67
+ - 1
68
+ - 7
69
+ version: 0.1.7
70
+ type: :runtime
71
+ version_requirements: *id003
72
+ - !ruby/object:Gem::Dependency
73
+ name: i18n
74
+ prerelease: false
75
+ requirement: &id004 !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ hash: 25
81
+ segments:
82
+ - 0
83
+ - 1
84
+ - 1
85
+ version: 0.1.1
86
+ type: :runtime
87
+ version_requirements: *id004
88
+ - !ruby/object:Gem::Dependency
89
+ name: rspec
90
+ prerelease: false
91
+ requirement: &id005 !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ hash: 3
97
+ segments:
98
+ - 0
99
+ version: "0"
100
+ type: :development
101
+ version_requirements: *id005
102
+ - !ruby/object:Gem::Dependency
103
+ name: fakeweb
104
+ prerelease: false
105
+ requirement: &id006 !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ hash: 3
111
+ segments:
112
+ - 0
113
+ version: "0"
114
+ type: :development
115
+ version_requirements: *id006
116
+ description: A feed parser wrapper for Spix
117
+ email: busk@busk.com
118
+ executables: []
119
+
120
+ extensions: []
121
+
122
+ extra_rdoc_files: []
123
+
124
+ files:
125
+ - lib/spix_parser/config.rb
126
+ - lib/spix_parser/core_ext.rb
127
+ - lib/spix_parser/custom_parsers/atom.rb
128
+ - lib/spix_parser/custom_parsers/atom_entry.rb
129
+ - lib/spix_parser/custom_parsers/enclosure.rb
130
+ - lib/spix_parser/custom_parsers/rss.rb
131
+ - lib/spix_parser/custom_parsers/rss_entry.rb
132
+ - lib/spix_parser/datetime.rb
133
+ - lib/spix_parser/parser.rb
134
+ - lib/spix_parser/tools/feed_discovery.rb
135
+ - lib/spix_parser/tools/redirect_follower.rb
136
+ - lib/spix_parser/version.rb
137
+ - lib/spix_parser/wrappers/enclosure_interface.rb
138
+ - lib/spix_parser/wrappers/entry.rb
139
+ - lib/spix_parser/wrappers/feed.rb
140
+ - lib/spix_parser/wrappers/parsing_error.rb
141
+ - lib/spix_parser.rb
142
+ - spec/parser_spec.rb
143
+ - spec/spix_parser/parser_spec.rb
144
+ - spec/spix_parser/tools/feed_discovery_spec.rb
145
+ - spec/spix_parser/utils_spec.rb
146
+ has_rdoc: true
147
+ homepage: http://github.com/busk/spix_parser
148
+ licenses: []
149
+
150
+ post_install_message:
151
+ rdoc_options: []
152
+
153
+ require_paths:
154
+ - lib
155
+ required_ruby_version: !ruby/object:Gem::Requirement
156
+ none: false
157
+ requirements:
158
+ - - ">="
159
+ - !ruby/object:Gem::Version
160
+ hash: 3
161
+ segments:
162
+ - 0
163
+ version: "0"
164
+ required_rubygems_version: !ruby/object:Gem::Requirement
165
+ none: false
166
+ requirements:
167
+ - - ">="
168
+ - !ruby/object:Gem::Version
169
+ hash: 3
170
+ segments:
171
+ - 0
172
+ version: "0"
173
+ requirements: []
174
+
175
+ rubyforge_project:
176
+ rubygems_version: 1.6.2
177
+ signing_key:
178
+ specification_version: 3
179
+ summary: FeedParser for Spix
180
+ test_files:
181
+ - spec/parser_spec.rb
182
+ - spec/spix_parser/parser_spec.rb
183
+ - spec/spix_parser/tools/feed_discovery_spec.rb
184
+ - spec/spix_parser/utils_spec.rb