spix_parser 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/spix_parser/config.rb +10 -0
- data/lib/spix_parser/core_ext.rb +117 -0
- data/lib/spix_parser/custom_parsers/atom.rb +36 -0
- data/lib/spix_parser/custom_parsers/atom_entry.rb +28 -0
- data/lib/spix_parser/custom_parsers/enclosure.rb +13 -0
- data/lib/spix_parser/custom_parsers/rss.rb +28 -0
- data/lib/spix_parser/custom_parsers/rss_entry.rb +36 -0
- data/lib/spix_parser/datetime.rb +34 -0
- data/lib/spix_parser/parser.rb +124 -0
- data/lib/spix_parser/tools/feed_discovery.rb +94 -0
- data/lib/spix_parser/tools/redirect_follower.rb +31 -0
- data/lib/spix_parser/version.rb +18 -0
- data/lib/spix_parser/wrappers/enclosure_interface.rb +18 -0
- data/lib/spix_parser/wrappers/entry.rb +87 -0
- data/lib/spix_parser/wrappers/feed.rb +81 -0
- data/lib/spix_parser/wrappers/parsing_error.rb +7 -0
- data/lib/spix_parser.rb +46 -0
- data/spec/parser_spec.rb +6 -0
- data/spec/spix_parser/parser_spec.rb +42 -0
- data/spec/spix_parser/tools/feed_discovery_spec.rb +72 -0
- data/spec/spix_parser/utils_spec.rb +182 -0
- metadata +184 -0

data/lib/spix_parser/core_ext.rb
ADDED
@@ -0,0 +1,117 @@
+# encoding: utf-8
+class Time
+  def to_db_format(format=:default)
+    format == :default ? to_s_default : strftime("%Y-%m-%d %H:%M:%S").strip
+  end
+  alias_method :to_s_default, :to_s
+  alias_method :to_s, :to_db_format unless method_defined?(:to_formatted_s)
+
+  def self.can_parse?(date)
+    begin
+      Time.parse(date)
+    rescue StandardError => e
+      return false
+    end
+    true
+  end
+
+  # We have a lot of timestamps in portuguese and Ruby can't parse'em
+  # So, this function will translate portuguese date related terms to english for correct parsing,
+  # because we can't afford so many feed entries with wrong timestamps
+  def self.translate_for_parsing(date_as_string)
+    # First, add leading zero to days of month below 10
+    formatted_date = date_as_string.sub(/\A[a-zA-Z]+\,\s{1}(\d)[^\d]/, '0\1 ')
+
+    day_names = {"Domingo" => "Sunday", "Segunda" => "Monday", "Terça" => "Tuesday", "Quarta" => "Wednesday",
+                 "Quinta" => "Thursday", "Sexta" => "Friday", "Sábado" => "Saturday", "Sabado" => "Saturday"}
+    abbr_day_names = {"Dom" => "Sun", "Seg" => "Mon", "Ter" => "Tue", "Qua" => "Wed",
+                      "Qui" => "Thu", "Sex" => "Fri", "Sáb" => "Sat", "Sab" => "Sat"}
+    month_names = {"Janeiro" => "January", "Fevereiro" => "February", "Março" => "March", "Marco" => "March",
+                   "Abril" => "April", "Maio" => "May", "Junho" => "June", "Julho" => "July",
+                   "Agosto" => "August", "Setembro" => "September", "Outubro" => "October",
+                   "Novembro" => "November", "Dezembro" => "December"}
+    abbr_month_names = {"Jan" => "Jan", "Fev" => "Feb", "Abr" => "Apr", "Mai" => "May",
+                        "Ago" => "Aug", "Set" => "Sep", "Out" => "Oct", "Dez" => "Dec"}
+
+    day_names.each do |key, value|
+      formatted_date.sub!(key, value)
+    end
+
+    abbr_day_names.each do |key, value|
+      formatted_date.sub!(key, value)
+    end
+
+    month_names.each do |key, value|
+      formatted_date.sub!(key, value)
+    end
+
+    abbr_month_names.each do |key, value|
+      formatted_date.sub!(key, value)
+    end
+
+    formatted_date
+  end
+end
+
+class Object
+  def blank?
+    respond_to?(:empty?) ? empty? : !self
+  end
+
+  def silence_warnings
+    old_verbose, $VERBOSE = $VERBOSE, nil
+    yield
+  ensure
+    $VERBOSE = old_verbose
+  end
+end
+
+class NilClass
+  def blank?
+    true
+  end
+end
+
+class FalseClass
+  def blank?
+    true
+  end
+end
+
+class TrueClass
+  def blank?
+    false
+  end
+end
+
+class Array
+  alias_method :blank?, :empty?
+end
+
+class Hash
+  alias_method :blank?, :empty?
+end
+
+class Numeric
+  def blank?
+    false
+  end
+end
+
+class String
+  def to_sha1
+    Digest::SHA1.hexdigest "--17f7e62310d5a2bbb9bfc535b95134ece1cb474d--#{self}"
+  end
+
+  def blank?
+    self !~ /\S/
+  end
+
+  def busk_normalize
+    if RUBY_VERSION >= '1.9'
+      self.force_encoding(Spix::Parser::Config::ENCODING)
+    else
+      self
+    end
+  end
+end
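
The core extensions above carry most of the timestamp and string handling used by the rest of the gem. A minimal usage sketch (illustrative only, not part of the package; assumes spix_parser is loaded, and the outputs in the comments are approximate):

require "time"

# Portuguese day and month names are rewritten so Time.parse can understand them.
Time.translate_for_parsing("Segunda, 14 Março 2011 10:15:00 -0300")
# => "Monday, 14 March 2011 10:15:00 -0300"

Time.can_parse?("not a date")  # => false
"  \n".blank?                  # => true  (whitespace-only strings count as blank)
0.blank?                       # => false (Numeric is never blank)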

data/lib/spix_parser/custom_parsers/atom.rb
ADDED
@@ -0,0 +1,36 @@
+# encoding: utf-8
+module Spix
+  module Parser
+    class Atom
+      include SAXMachine
+      include Feedzirra::FeedUtilities
+
+      element :title, :as => :feed_title
+      element :subtitle, :as => :feed_subtitle
+      element :language, :as => :feed_language
+      element :updated, :as => :last_modified
+      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
+      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
+      elements :link, :as => :links, :value => :href
+      elements :entry, :as => :feed_entries, :class => Spix::Parser::AtomEntry
+
+      alias_method :entries, :feed_entries
+
+      def self.able_to_parse?(xml) #:nodoc:
+        (xml =~ /application\/atom\+xml|(#{Regexp.escape("http://www.w3.org/2005/Atom")})|(#{Regexp.escape("http://purl.org/atom")})/) && (xml =~ /\<feed\s/)
+      end
+
+      def url
+        @url || links.last
+      end
+
+      def feed_url
+        @feed_url || links.first
+      end
+
+      def last_modified
+        @last_modified.present? ? @last_modified : super
+      end
+    end
+  end
+end

data/lib/spix_parser/custom_parsers/atom_entry.rb
ADDED
@@ -0,0 +1,28 @@
+# encoding: utf-8
+module Spix
+  module Parser
+    class AtomEntry
+      include SAXMachine
+      include Feedzirra::FeedEntryUtilities
+
+      element :title, :as => :entry_title
+      element :link, :as => :entry_url, :value => :href, :with => {:rel => "alternate"}
+      element :name, :as => :entry_author
+      element :content, :as => :entry_content
+      element :summary, :as => :entry_summary
+      element :published
+      element :id
+      element :created, :as => :published
+      element :issued, :as => :published
+      element :updated
+      element :modified, :as => :updated
+      elements :category, :as => :entry_categories, :value => :term
+
+      elements :enclosure, :as => :entry_enclosures, :class => Spix::Parser::Enclosure
+
+      element :"media:content", :as => :media_content, :value => :url
+      element :"media:description", :as => :media_description
+      element :"media:thumbnail", :as => :media_thumbnail, :value => :url
+    end
+  end
+end

data/lib/spix_parser/custom_parsers/enclosure.rb
ADDED
@@ -0,0 +1,13 @@
+# encoding: utf-8
+module Spix
+  module Parser
+    class Enclosure
+      include SAXMachine
+      include Spix::Parser::EnclosureInterface
+
+      element :enclosure, :value => :length, :as => :enclosure_length
+      element :enclosure, :value => :type, :as => :enclosure_type
+      element :enclosure, :value => :url, :as => :enclosure_url
+    end
+  end
+end

data/lib/spix_parser/custom_parsers/rss.rb
ADDED
@@ -0,0 +1,28 @@
+# encoding: utf-8
+module Spix
+  module Parser
+    class RSS
+      include SAXMachine
+      include Feedzirra::FeedUtilities
+
+      element :title, :as => :feed_title
+      element :description, :as => :feed_subtitle
+      element :language, :as => :feed_language
+      element :link, :as => :url
+      element :pubDate, :as => :last_modified
+      elements :item, :as => :feed_entries, :class => Spix::Parser::RSSEntry
+
+      alias_method :entries, :feed_entries
+
+      attr_accessor :feed_url
+
+      def self.able_to_parse?(xml) #:nodoc:
+        (xml =~ /\<rss|rdf/) && (xml =~ /\<channel/)
+      end
+
+      def last_modified
+        @last_modified.present? ? @last_modified : super
+      end
+    end
+  end
+end
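
Feedzirra selects a parser class for a payload by calling able_to_parse? on each registered class (the two classes above are registered in lib/spix_parser.rb further down). An illustrative check, assuming the gem is loaded:

atom_xml = %q[<feed xmlns="http://www.w3.org/2005/Atom"><title>t</title></feed>]

Spix::Parser::Atom.able_to_parse?(atom_xml)  # => truthy (the Atom namespace is present)
Spix::Parser::RSS.able_to_parse?(atom_xml)   # => falsy  (no rss/rdf or channel markers)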

data/lib/spix_parser/custom_parsers/rss_entry.rb
ADDED
@@ -0,0 +1,36 @@
+# encoding: utf-8
+module Spix
+  module Parser
+    class RSSEntry
+      include SAXMachine
+      include Feedzirra::FeedEntryUtilities
+
+      element :title, :as => :entry_title
+      element :link, :as => :entry_url
+
+      element :author, :as => :entry_author
+      element :"dc:creator", :as => :entry_author
+
+      element :"content:encoded", :as => :entry_content
+      element :description, :as => :entry_summary
+      element :summary, :as => :entry_summary
+
+      element :pubDate, :as => :published
+      element :"dc:date", :as => :published
+      element :"dc:Date", :as => :published
+      element :"dcterms:created", :as => :published
+
+      element :"dcterms:modified", :as => :updated
+      element :issued, :as => :published
+      elements :category, :as => :entry_categories
+
+      element :guid, :as => :id
+
+      elements :enclosure, :as => :entry_enclosures, :class => Spix::Parser::Enclosure
+
+      element :"media:content", :as => :media_content, :value => :url
+      element :"media:description", :as => :media_description
+      element :"media:thumbnail", :as => :media_thumbnail, :value => :url
+    end
+  end
+end

data/lib/spix_parser/datetime.rb
ADDED
@@ -0,0 +1,34 @@
+# encoding: utf-8
+module Spix
+  module Parser
+    module DateTimeUtilities
+      def build_datetime_object(timestamp)
+        timestamp = normalize_timestamp(timestamp)
+
+        if Time.can_parse?(timestamp)
+          #if the timestamp is a non-date string, it will be Time.mktime("1970").utc
+          timestamp = Time.parse(timestamp, Spix::Parser::Config::BASE_TIMESTAMP).utc
+
+          # non-english dates sometimes are parsed to "future" dates by Ruby
+          # we also cover the case where the timestamp is Time.mktime("1970").utc as explained above
+          if (timestamp > Time.now.utc) || (timestamp == Spix::Parser::Config::BASE_TIMESTAMP)
+            timestamp = nil
+          end
+        else
+          timestamp = nil
+        end
+
+        timestamp
+      end
+
+      private
+      def normalize_timestamp(timestamp)
+        # In Ruby 1.9 the date is returned as String
+        # In Ruby 1.8 it is returned as Time
+        timestamp_string = timestamp.to_s
+        Time.translate_for_parsing(timestamp_string.busk_normalize)
+      end
+    end
+  end
+end
+
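
A hypothetical sketch of how DateTimeUtilities behaves once mixed into a class (the wrapper classes below include it). It assumes the gem is loaded and that Spix::Parser::Config::BASE_TIMESTAMP is the 1970 base time described in the comments; config.rb itself is not shown in this diff, and the outputs are approximate:

class TimestampProbe
  include Spix::Parser::DateTimeUtilities
end

probe = TimestampProbe.new
probe.build_datetime_object("Seg, 07 Março 2011 10:15:00 -0300")
# => 2011-03-07 13:15:00 UTC (translated, parsed, converted to UTC)
probe.build_datetime_object("not a date")        # => nil (unparseable strings are discarded)
probe.build_datetime_object("25 Dezembro 2099")  # => nil (future dates are discarded)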

data/lib/spix_parser/parser.rb
ADDED
@@ -0,0 +1,124 @@
+# encoding: utf-8
+module Spix
+  module Parser
+    def self.parse(url, options = {})
+      feed = case options.delete(:mode)
+      when :local
+        Feedzirra::Feed.parse(url)
+      when :file
+        Feedzirra::Feed.parse(File.read(url))
+      else
+        Feedzirra::Feed.fetch_and_parse(url, options)
+      end
+
+      # Feedzirra has some issues with failure cases:
+      # If the failure occurs on the parsing phase, then the on_failure callback is triggered
+      # If the failure occurs on the fetching phase (i. e. a network error), the a number is returned
+      # That number may represent an http status code or be 0 in case of other errors.
+      # Also, we can't raise an exception on the on_failure callback, 'cause it will be raised even on success - that's really odd
+      # So we need this 'safety net' here until we patch it to use an uniform error architecture
+      if feed.nil? || (feed.is_a?(Fixnum) && feed == 0)
+        Log.error("The parser couldn't fetch the feed at #{url}")
+        return nil
+      elsif feed.is_a?(Fixnum)
+        feed
+      else
+        Spix::Parser::Feed.new(feed)
+      end
+    end
+  end
+
+  module Utils
+    extend self
+
+    def format_links(options)
+      text = options[:text]
+      site_url = options[:site_url]
+
+      parse_links(text)
+      parse_images(text, site_url)
+
+      text
+    end
+
+    private
+
+    def join_attributes(attrs)
+      attrs.map do |attr, value|
+        %Q[#{attr}="#{value.to_s.gsub(/"/, "&quot;")}"] unless value.blank?
+      end.compact.join(" ")
+    end
+
+    def parse_attrs(str)
+      attrs = {}
+      return attrs unless str || str.respond_to?(:scan)
+
+      match_by_spaces = str !~ /'|"/
+      if match_by_spaces
+        # Make sure to match the last html attribute.
+        str += " "
+        value_regexp = /\s*(.*?)\s/
+      else
+        value_regexp = /\s*["'](.*?)["']/
+      end
+      attribute_regexp = /\b([a-zA-Z0-9:]+)\s*/
+
+      str.scan(/#{attribute_regexp}=#{value_regexp}/im) do
+        attrs[$1.to_s.downcase] = $2
+      end
+
+      attrs
+    end
+
+    def parse_links(text)
+      text.gsub!(/(<a\s+([^>]+)>)/uim) do |match|
+        attrs = parse_attrs($2.to_s)
+
+        # just parse these attributes
+        attrs = {
+          :href => attrs["href"],
+          :title => attrs["title"],
+          :target => "_blank",
+          :rel => "external nofollow"
+        }
+
+        "<a #{join_attributes(attrs)}>"
+      end
+    end
+
+    def parse_images(text, site_url)
+      text.gsub!(/(<img(.*?)\/?>)/uim) do |match|
+        attrs = parse_attrs($2.to_s)
+
+        # just parse these attributes
+        attrs = {
+          :src => parse_relative_image_source(attrs["src"], site_url),
+          :alt => attrs["alt"],
+          :title => attrs["title"],
+          :style => attrs["style"],
+          :width => attrs["width"],
+          :height => attrs["height"]
+        }
+
+        "<img #{join_attributes(attrs)} />" if attrs[:src].present?
+      end
+    end
+
+    def parse_relative_image_source(src, site_url)
+      if src.present? && site_url
+        begin
+          src = URI.parse(src)
+          src = URI.parse(site_url).merge(src) if src.relative?
+        rescue URI::InvalidURIError
+          # Manually concatenating if it is "relative uri", stripping slashes.
+          if src !~ /\A(https?|ftp):\/\//
+            site_url = site_url[0..-2] if site_url[-1] == ?/
+            src = src[1..-1] if src[0] == ?/
+            src = "#{site_url}/#{src}"
+          end
+        end
+      end
+      src
+    end
+  end
+end
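
Spix::Parser.parse is the public entry point (remote fetch by default, :mode => :local for an XML string already in memory, :mode => :file for a path), and Spix::Utils.format_links is the HTML normalization helper exercised by the specs near the end of this diff. An illustrative sketch with placeholder file names; the output comment is approximate:

# Parse a feed held in memory -- no network access.
feed = Spix::Parser.parse(File.read("feed.xml"), :mode => :local)
feed.feed_items.each { |item| puts item.title } if feed

# Anchors gain target/rel, unknown attributes are dropped, and relative image
# sources are resolved against :site_url.
Spix::Utils.format_links(
  :text     => %q[<a href="/post">read</a> <img src="images/pic.jpg" />],
  :site_url => "http://example.com"
)
# => '<a href="/post" target="_blank" rel="external nofollow">read</a> <img src="http://example.com/images/pic.jpg" />'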

data/lib/spix_parser/tools/feed_discovery.rb
ADDED
@@ -0,0 +1,94 @@
+gem "feedzirra", ">=0.0.24"
+require "feedzirra"
+require "nokogiri"
+require "uri"
+require "open-uri"
+
+module Spix
+  class FeedDiscovery
+
+    # HTTP "User-Agent" header to send to servers when downloading feeds.
+    USER_AGENT = "SpixParser"
+
+    def self.feed?(uri)
+      Spix::Parser.parse(uri, :mode => :fetch) ? true : false
+    end
+
+    def self.list(uri)
+
+      content = self.read(uri)
+
+      doc = Nokogiri::HTML(content)
+
+      # get page title
+      title = doc.search('title')[0].content
+
+      items = doc.search("//link[@type='application/atom+xml']", "//link[@type='application/rss+xml']").collect do |link|
+        url_object = URI::parse(uri).normalize
+
+        href = link.get_attribute(:href).to_s
+
+        feed_url_object = URI::parse(href)
+
+        if feed_url_object.relative?
+
+          # there's 2 types of relative URIs
+          # the ones based on a path (base: http://sitewithfeed.com/foo/, relative: feed.xml, feed: http://sitewithfeed.com/foo/feed.xml)
+          # and the ones based on the top domain (base: http://sitewithfeed.com/foo/, relative: /feed.xml, feed: http://sitewithfeed.com/feed.xml)
+          if feed_url_object.path.match(/^\//)
+            # when the feed_url_object is relative and starts with a "/" we should ignore the domain path
+            path = nil
+          else
+            # when the feed_url_object is relative and do not starts with a "/" we should use the domain path
+
+            if url_object.path.match(/\/$/)
+              # when the url_object ends with a "/" we should use it
+              path = url_object.path
+            else
+              # when the url_object do not ends with a "/" we should add it
+              path = url_object.path + "/"
+            end
+          end
+
+          href = "#{url_object.scheme}://" +
+                 "#{url_object.host}" +
+                 "#{path}" +
+                 "#{url_object.query}" +
+                 href
+        end
+
+        item = {
+          :title => link.get_attribute(:title) || title,
+          :url => href
+        }
+
+      end
+
+      if items.size == 0
+        # if there's no item found at the given URI, maybe it's a feed URI
+        if self.feed?(uri)
+          items = [
+            {
+              :title => title,
+              :url => uri
+            }
+          ]
+        end
+      end
+
+      items
+    rescue
+      nil
+    end
+
+    def self.read(uri)
+      if uri.respond_to?(:read)
+        content = uri.read
+      else
+        req_headers = {}
+        req_headers["User-Agent"] = USER_AGENT
+        content = open(uri, req_headers).read
+      end
+    end
+  end
+end
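
FeedDiscovery.list returns an array of {:title, :url} hashes for every feed <link> advertised by an HTML page, resolving relative hrefs as described in the comments above, and falls back to treating the page itself as a feed. A hedged sketch; the URL and result are placeholders, and the call performs a real HTTP request:

Spix::FeedDiscovery.list("http://sitewithfeed.com/blog/")
# => [{:title => "Site with feed", :url => "http://sitewithfeed.com/blog/feed.xml"}]
# On any error the method rescues and returns nil, so callers must handle that case.

Spix::FeedDiscovery.feed?("http://sitewithfeed.com/blog/feed.xml")  # => true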

data/lib/spix_parser/tools/redirect_follower.rb
ADDED
@@ -0,0 +1,31 @@
+require 'net/http'
+require 'thread'
+
+class RedirectFollower
+  def self.resolve(url)
+    @response = ""
+    begin
+      timeout(5) do
+        t = Thread.new {@response = Net::HTTP.get_response(URI.parse(url)) }
+        t.join
+
+        if @response.kind_of?(Net::HTTPRedirection)
+          return redirect_url(@response)
+        end
+      end
+    rescue Timeout::Error, URI::InvalidURIError
+      return url
+    end
+
+    url
+  end
+
+  protected
+  def self.redirect_url(response)
+    if response['location'].nil?
+      response.body.match(/<a href=\"([^>]+)\">/i)[1]
+    else
+      response['location']
+    end
+  end
+end
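
RedirectFollower.resolve follows a single redirect hop inside a five-second timeout and falls back to the original URL on timeouts or invalid URIs. A hedged sketch with placeholder URLs:

RedirectFollower.resolve("http://feeds.example.com/r/abc123")
# => "http://example.com/posts/abc123" when the server answers with a redirect plus Location header
# => "http://feeds.example.com/r/abc123" when there is no redirect, the request times out, or the URI is invalid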

data/lib/spix_parser/wrappers/entry.rb
ADDED
@@ -0,0 +1,87 @@
+# encoding: utf-8
+module Spix
+  module Parser
+    class FeedEntry
+      include Spix::Parser::DateTimeUtilities
+      include Memoizable
+
+      def initialize(entry, feed)
+        @feed = feed
+        @entry = entry
+      end
+
+      def title
+        text = @entry.entry_title || "(title unknow)"
+        text = text.busk_normalize
+        Sanitizer.sanitize(text)
+      end
+      memoize(:title)
+
+      def summary
+        text = @entry.entry_summary || ""
+        text = text.busk_normalize
+        Sanitizer.strip_comments(text)
+        Sanitizer.strip_disallowed_tags(text)
+        Sanitizer.entities_to_chars(text)
+      end
+      memoize(:summary)
+
+      def url
+        entry_url = @entry.entry_url || @feed.site_url
+        RedirectFollower.resolve(entry_url).busk_normalize if entry_url.present?
+      end
+      memoize(:url)
+
+      def author
+        text = @entry.entry_author || ""
+        Sanitizer.sanitize(text.busk_normalize)
+      end
+
+      def published_at
+        build_datetime_object(@entry.published) if @entry.published
+      end
+
+      def updated_at
+        build_datetime_object(@entry.updated) if @entry.updated
+      end
+
+      def uid
+        uid = self.url || ""
+        uid += self.title.downcase.busk_normalize
+        uid += self.striped_content.downcase.busk_normalize[0..25]
+        uid.to_sha1
+      end
+      memoize(:uid)
+
+      def content
+        text = encoded_raw_content
+        Sanitizer.strip_comments(text)
+        Sanitizer.strip_disallowed_tags(text)
+        Sanitizer.entities_to_chars(text)
+      end
+      memoize(:content)
+
+      def striped_content
+        text = encoded_raw_content
+        Sanitizer.strip_tags(text)
+      end
+      memoize(:striped_content)
+
+      def categories
+        @entry.entry_categories.map do |category|
+          Sanitizer.sanitize(category.busk_normalize)
+        end
+      end
+
+      def enclosures
+        @entry.entry_enclosures
+      end
+
+      private
+      def encoded_raw_content
+        text = @entry.entry_content || @entry.entry_summary || ""
+        text.busk_normalize
+      end
+    end
+  end
+end

data/lib/spix_parser/wrappers/feed.rb
ADDED
@@ -0,0 +1,81 @@
+# encoding: utf-8
+module Spix
+  module Parser
+    class Feed
+      include Spix::Parser::DateTimeUtilities
+
+      def initialize(parsed_feed)
+        @feed = parsed_feed
+        verify_entries_timestamps
+      end
+
+      def title
+        text = @feed.feed_title || "(title unknow)"
+        text = text.busk_normalize
+        Sanitizer.sanitize(text)
+      end
+
+      def subtitle
+        text = @feed.feed_subtitle || ""
+        text = text.busk_normalize
+        Sanitizer.sanitize(text)
+      end
+
+      def language
+        text = @feed.feed_language || "en"
+        text = text.busk_normalize
+        Sanitizer.sanitize(text)
+      end
+
+      def site_url
+        @feed.url || extract_site_from_feed_url
+      end
+
+      def feed_url
+        @feed.feed_url
+      end
+
+      def uid
+        @feed.feed_url.to_sha1
+      end
+
+      def updated_at
+        timestamp = @feed.last_modified || @feed.feed_entries.first.published_at || Time.now.utc
+        build_datetime_object(timestamp)
+      end
+
+      def feed_items
+        # If the feed is not valid, the feed_entries accessor does not exist
+        if @feed.respond_to?(:feed_entries) && @feed.feed_entries.present?
+          @feed.feed_entries.map{|entry| Spix::Parser::FeedEntry.new(entry, self)}
+        else
+          []
+        end
+      end
+
+      private
+      def verify_entries_timestamps
+        # Some feeds return the timestamps of all entries as the timestamp of the request
+        # This means that the timestamp will change everytime we parse the feed, thus duplicating entries
+        # One way to detect that is to verify if all the entries have the same timestamp
+        items_with_same_timestamp = feed_items.map{|i| i.published_at}.uniq.size == 1
+        more_than_one_item = feed_items.count > 1
+
+        if items_with_same_timestamp && more_than_one_item
+          @feed.feed_entries.each {|item| item.published = Spix::Parser::Config::BASE_TIMESTAMP.to_s}
+        end
+      end
+
+      def extract_site_from_feed_url
+        # Eventually, we run into a feed that for some reason does not include
+        # the publisher website. In those cases, we try to guess the website
+        # root path looking at the feed_url. It may fail also, so be mindful.
+        return unless @feed.feed_url.present?
+        feed_host = URI.parse(@feed.feed_url).host
+
+        "http://#{feed_host}"
+      end
+
+    end
+  end
+end
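
Putting the wrappers together: Spix::Parser.parse returns a Spix::Parser::Feed whose feed_items are Spix::Parser::FeedEntry objects, so callers only touch the sanitized, memoized accessors. Sketch only, with a placeholder fixture path:

feed = Spix::Parser.parse(File.read("fixtures/feed.rss"), :mode => :local)

feed.title        # sanitized feed title ("(title unknow)" when missing)
feed.site_url     # publisher URL, or a guess derived from the feed URL
feed.updated_at   # Time object built via DateTimeUtilities, or nil

feed.feed_items.each do |entry|
  entry.uid           # SHA1 built from url + title + the first characters of the stripped content
  entry.published_at  # nil when the timestamp could not be parsed
  entry.enclosures    # Spix::Parser::Enclosure objects
end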

data/lib/spix_parser.rb
ADDED
@@ -0,0 +1,46 @@
+# encoding: utf-8
+require "rubygems"
+require "feedzirra"
+require "digest/sha1"
+require "zlib"
+require "logger"
+require "cgi"
+require "memoizable"
+
+$:.unshift(File.dirname(__FILE__) + '/../../lib')
+
+require "spix_parser/version"
+require "spix_parser/core_ext"
+require "spix_parser/config"
+require "spix_parser/parser"
+require "spix_parser/datetime"
+
+require "spix_parser/tools/redirect_follower"
+
+require "spix_parser/wrappers/entry"
+require "spix_parser/wrappers/enclosure_interface"
+require "spix_parser/wrappers/feed"
+require "spix_parser/wrappers/parsing_error"
+
+require "spix_parser/custom_parsers/enclosure"
+require "spix_parser/custom_parsers/atom_entry"
+require "spix_parser/custom_parsers/atom"
+require "spix_parser/custom_parsers/rss_entry"
+require "spix_parser/custom_parsers/rss"
+
+require "spix_parser/tools/feed_discovery"
+
+if RUBY_VERSION < '1.9'
+  $KCODE='u'
+else
+  Encoding.default_internal = Encoding::UTF_8
+  Encoding.default_external = Encoding::UTF_8
+end
+
+Feedzirra::Feed.add_feed_class(Spix::Parser::RSS)
+Feedzirra::Feed.add_feed_class(Spix::Parser::Atom)
+
+# Start the log over whenever the log exceeds 100 megabytes in size.
+Log = Logger.new('/var/log/spix/spix_parser.log', 0, 100 * 1024 * 1024)
+Log.level = Logger::ERROR
+Log.datetime_format = "%d-%m-%Y %H:%M:%S"

data/spec/parser_spec.rb
ADDED
@@ -0,0 +1,42 @@
+# encoding: utf-8
+require 'spec_helper'
+
+describe Spix::Parser do
+  describe "atom parsing" do
+    it 'should parse from a file path' do
+      feed = Spix::Parser.parse(fixture('feed.atom'), :mode => :file)
+      feed.should_not be_nil
+      feed.feed_items.should have(1).item
+    end
+
+    it 'should parse from a file' do
+      feed = Spix::Parser.parse(load_fixture('feed.atom'), :mode => :local)
+      feed.should_not be_nil
+      feed.feed_items.should have(1).item
+    end
+  end
+
+  describe "rss parsing" do
+    it 'should parse from a file path' do
+      feed = Spix::Parser.parse(fixture('feed.rss'), :mode => :file)
+      feed.should_not be_nil
+      feed.feed_items.should have(9).item
+    end
+
+    it 'should parse from a file' do
+      feed = Spix::Parser.parse(load_fixture('feed.rss'), :mode => :local)
+      feed.should_not be_nil
+      feed.feed_items.should have(9).item
+    end
+
+    it 'should parse a feed from meioemensagem.com' do
+      url = 'http://www.meioemensagem.com.br/home/rss/geral.xml'
+      feed = Spix::Parser.parse(load_fixture('meioemensagem.xml'), :mode => :local)
+
+      feed.should_not be_nil
+      feed.title.should == "RSS: Notícias Gerais"
+      feed.feed_items[0].title.should == "Cielo volta à mídia com o cantor Fiuk"
+    end
+  end
+
+end

data/spec/spix_parser/tools/feed_discovery_spec.rb
ADDED
@@ -0,0 +1,72 @@
+require 'spec_helper'
+
+describe Spix::FeedDiscovery, "#list" do
+
+  before(:all) do
+    @domain_url = "http://sitewithfeed.com"
+  end
+
+  describe "when the feed have an absolute URI" do
+    it "should return the feed url" do
+      FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("absolute_uri.html"))
+      Spix::FeedDiscovery.list(@domain_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
+    end
+  end
+
+  describe "when the feed have a relative URI" do
+    describe "which is relative to a path" do
+      it "should return the feed url when the URI is at the top domain" do
+        FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("relative_uri.html"))
+        Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/" + "html4-002.xml"
+      end
+
+      it "should return the feed url when the URI is inside a path" do
+        @path_url = "/foo/bar"
+        @feed_url = @domain_url + @path_url
+
+        FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("relative_uri.html"))
+        Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url + "/" + "html4-002.xml"
+      end
+    end
+
+    describe "which is relative to the top domain" do
+      it "should return the feed url when the URI is at the top domain" do
+        FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("relative_uri_top_domain.html"))
+        Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
+      end
+
+      it "should return the feed url when the URI is inside a path" do
+        @path_url = "/foo/bar"
+        @feed_url = @domain_url + @path_url
+
+        FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("relative_uri_top_domain.html"))
+        Spix::FeedDiscovery.list(@feed_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
+      end
+    end
+  end
+
+  describe "when the URI is a feed" do
+    before(:all) do
+      @path_url = "/feed.xml"
+      @feed_url = @domain_url + @path_url
+    end
+
+    it "should return the extracted url when there's a link at the feed" do
+      FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("feed_with_self_link.xml"))
+      Spix::FeedDiscovery.list(@feed_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
+    end
+
+    it "should return the same url when there's no link at the feed" do
+      fixture = load_fixture("feed_without_self_link.xml")
+
+      FakeWeb.register_uri(:get, @feed_url, :body => fixture)
+
+      # feedzirra doesn't work with fakeweb
+      feed_xml = fixture
+      feed = Feedzirra::Feed.parse(feed_xml)
+      Feedzirra::Feed.stub!(:fetch_and_parse).and_return(feed)
+
+      Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url
+    end
+  end
+end

data/spec/spix_parser/utils_spec.rb
ADDED
@@ -0,0 +1,182 @@
+# encoding: utf-8
+require 'spec_helper'
+
+describe Spix::Utils do
+  describe ".format_links" do
+    context "html containing links" do
+      it "parsers links in the given html string adding rel and target" do
+        input_html = %q[<div><a href="foo/bar.html" title="FooBar!">FooBar!</a></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
+      end
+
+      it "parses links removing other html attributes" do
+        input_html = %q[<div><a href="foo/bar.html" title="FooBar!" style="color: red" invalid="test">FooBar!</a></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
+      end
+
+      it "parses links with simple quotes" do
+        input_html = %q[<div><a href='foo/bar.html' title='FooBar!'>FooBar!</a></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
+      end
+
+      # TODO: should we strip these extra &quot; ?
+      it "parses links with html escaped quote (&quot;)" do
+        input_html = %q[<div><a href=&quot;foo/bar.html&quot; title=&quot;FooBar!&quot;>FooBar!</a></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><a href="&quot;foo/bar.html&quot;" title="&quot;FooBar!&quot;" target="_blank" rel="external nofollow">FooBar!</a></div>]
+      end
+
+      it "parses links with html attributes without quotes, based on spaces" do
+        input_html = %q[<div><a href=foo/bar.html title=FooBar!>FooBar!</a></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
+      end
+
+      it "parses links with html attributes having spaces before or after the equal sign" do
+        input_html = %q[<div><a href = foo/bar.html title = FooBar!>FooBar!</a></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
+      end
+
+      it "parses links downcasing attribute names" do
+        input_html = %q[<div><a HREF="foo/bar.html" TITLE="FooBar!">FooBar!</a></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><a href="foo/bar.html" title="FooBar!" target="_blank" rel="external nofollow">FooBar!</a></div>]
+      end
+
+      it "parses links ignoring blank attributes" do
+        input_html = %q[<div><a href="foo/bar.html" title="">FooBar!</a></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><a href="foo/bar.html" target="_blank" rel="external nofollow">FooBar!</a></div>]
+      end
+    end
+
+    context "html containing images" do
+      it "parsers images in the given html string matching default attributes (src, style, alt, title, width and height)" do
+        input_html = %q[<div><img src="images/bar.jpg" title="FooBar!" alt="FooBar!" width="100" height="200" /></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><img src="images/bar.jpg" alt="FooBar!" title="FooBar!" width="100" height="200" /></div>]
+      end
+
+      it "parses image tags removing other invalid html attributes" do
+        input_html = %q[<div><img src="images/bar.jpg" alt="FooBar!" style="color:red" invalid="test" target="_blank" /></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><img src="images/bar.jpg" alt="FooBar!" style="color:red" /></div>]
+      end
+
+      it "parses image tags appending the given site url to relative images" do
+        input_html = %q[<div><img src="images/bar.jpg" /></div>]
+
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
+          %q[<div><img src="http://example.com/images/bar.jpg" /></div>]
+      end
+
+      it "parses image tags having relative sources with invalid URI, appending the site url" do
+        input_html = %q[<div><img src="images/radiação.jpg" /></div>]
+
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
+          %q[<div><img src="http://example.com/images/radiação.jpg" /></div>]
+      end
+
+      it "parses image tags having relative sources starting with / and with invalid URI, appending the site url" do
+        input_html = %q[<div><img src="/images/radiação.jpg" /></div>]
+
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
+          %q[<div><img src="http://example.com/images/radiação.jpg" /></div>]
+      end
+
+      it "parses image tags having relative sources starting with / and with invalid URI, appending the site url also ending with /" do
+        input_html = %q[<div><img src="/images/radiação.jpg" /></div>]
+
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com/").should ==
+          %q[<div><img src="http://example.com/images/radiação.jpg" /></div>]
+      end
+
+      %w(http https ftp).each do |scheme|
+        it "parses image tags having absolute sources with #{scheme} and invalid URI" do
+          input_html = %Q[<div><img src="#{scheme}://example.com/images/radiação.jpg" /></div>]
+
+          Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
+            %Q[<div><img src="#{scheme}://example.com/images/radiação.jpg" /></div>]
+        end
+      end
+
+      it "parses image tags having sources with spaces but using quotes" do
+        input_html = %q[<div><img src="images/foo bar.jpg" /></div>]
+
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
+          %q[<div><img src="http://example.com/images/foo bar.jpg" /></div>]
+      end
+
+      it "parses image tags having style attributes with spaces" do
+        input_html = %q[<div><img src="images/foobar.jpg" style="color: blue;" /></div>]
+
+        Spix::Utils.format_links(:text => input_html, :site_url => "http://example.com").should ==
+          %q[<div><img src="http://example.com/images/foobar.jpg" style="color: blue;" /></div>]
+      end
+
+      it "parses image tags ignoring images with empty sources" do
+        input_html = %q[<div><img src="" title="FooBar!" /></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div></div>]
+      end
+
+      it "parses image tags with simple quotes" do
+        input_html = %q[<div><img src='images/bar.jpg' title='FooBar!' /></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
+      end
+
+      # TODO: should we strip these extra &quot; ?
+      it "parses image tags with html escaped quote (&quot;)" do
+        input_html = %q[<div><img src=&quot;images/bar.jpg&quot; title=&quot;FooBar!&quot; /></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><img src="&quot;images/bar.jpg&quot;" title="&quot;FooBar!&quot;" /></div>]
+      end
+
+      it "parses image tags with html attributes without quotes, based on spaces" do
+        input_html = %q[<div><img src=images/bar.jpg title=FooBar! /></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
+      end
+
+      it "parses image tags with html attributes having spaces before or after the equal sign" do
+        input_html = %q[<div><img src = images/bar.jpg title = FooBar! /></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
+      end
+
+      it "parses image tags downcasing attribute names" do
+        input_html = %q[<div><img SRC="images/bar.jpg" TITLE="FooBar!" /></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><img src="images/bar.jpg" title="FooBar!" /></div>]
+      end
+
+      it "parses image tags ignoring empty attributes" do
+        input_html = %q[<div><img src="images/bar.jpg" title="" /></div>]
+
+        Spix::Utils.format_links(:text => input_html).should ==
+          %q[<div><img src="images/bar.jpg" /></div>]
+      end
+    end
+  end
+end

metadata
ADDED
@@ -0,0 +1,184 @@
+--- !ruby/object:Gem::Specification
+name: spix_parser
+version: !ruby/object:Gem::Version
+  hash: 7
+  prerelease:
+  segments:
+  - 1
+  - 5
+  - 2
+  version: 1.5.2
+platform: ruby
+authors:
+- Marcelo Eden
+- Fabio Mont'Alegre
+- "Lucas H\xC3\xBAngaro"
+- Luiz Rocha
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2011-05-12 00:00:00 -03:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: feedzirra
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 47
+        segments:
+        - 0
+        - 0
+        - 24
+        version: 0.0.24
+  type: :runtime
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: memoizable
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 27
+        segments:
+        - 0
+        - 1
+        - 0
+        version: 0.1.0
+  type: :runtime
+  version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: sanitizer
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 21
+        segments:
+        - 0
+        - 1
+        - 7
+        version: 0.1.7
+  type: :runtime
+  version_requirements: *id003
+- !ruby/object:Gem::Dependency
+  name: i18n
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 25
+        segments:
+        - 0
+        - 1
+        - 1
+        version: 0.1.1
+  type: :runtime
+  version_requirements: *id004
+- !ruby/object:Gem::Dependency
+  name: rspec
+  prerelease: false
+  requirement: &id005 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  type: :development
+  version_requirements: *id005
+- !ruby/object:Gem::Dependency
+  name: fakeweb
+  prerelease: false
+  requirement: &id006 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  type: :development
+  version_requirements: *id006
+description: A feed parser wrapper for Spix
+email: busk@busk.com
+executables: []
+
+extensions: []
+
+extra_rdoc_files: []
+
+files:
+- lib/spix_parser/config.rb
+- lib/spix_parser/core_ext.rb
+- lib/spix_parser/custom_parsers/atom.rb
+- lib/spix_parser/custom_parsers/atom_entry.rb
+- lib/spix_parser/custom_parsers/enclosure.rb
+- lib/spix_parser/custom_parsers/rss.rb
+- lib/spix_parser/custom_parsers/rss_entry.rb
+- lib/spix_parser/datetime.rb
+- lib/spix_parser/parser.rb
+- lib/spix_parser/tools/feed_discovery.rb
+- lib/spix_parser/tools/redirect_follower.rb
+- lib/spix_parser/version.rb
+- lib/spix_parser/wrappers/enclosure_interface.rb
+- lib/spix_parser/wrappers/entry.rb
+- lib/spix_parser/wrappers/feed.rb
+- lib/spix_parser/wrappers/parsing_error.rb
+- lib/spix_parser.rb
+- spec/parser_spec.rb
+- spec/spix_parser/parser_spec.rb
+- spec/spix_parser/tools/feed_discovery_spec.rb
+- spec/spix_parser/utils_spec.rb
+has_rdoc: true
+homepage: http://github.com/busk/spix_parser
+licenses: []
+
+post_install_message:
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []

+rubyforge_project:
+rubygems_version: 1.6.2
+signing_key:
+specification_version: 3
+summary: FeedParser for Spix
+test_files:
+- spec/parser_spec.rb
+- spec/spix_parser/parser_spec.rb
+- spec/spix_parser/tools/feed_discovery_spec.rb
+- spec/spix_parser/utils_spec.rb