RubyGems - spix_parser - Versions diffs - 1.5.4 → 1.6.1 - Mend

spix_parser 1.5.4 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

data/lib/spix_parser/tools/discovery.rb +9 -0
data/lib/spix_parser/tools/feed_discovery/base.rb +74 -0
data/lib/spix_parser/tools/feed_discovery/document.rb +104 -0
data/lib/spix_parser/tools/feed_discovery/feed.rb +30 -0
data/lib/spix_parser/tools/feed_discovery.rb +7 -81
data/lib/spix_parser/version.rb +2 -2
data/lib/spix_parser.rb +1 -1
data/spec/spix_parser/tools/feed_discovery/document_spec.rb +68 -0
data/spec/spix_parser/tools/feed_discovery/feed_spec.rb +29 -0
data/spec/spix_parser/tools/feed_discovery_spec.rb +56 -13
data/spec/spix_parser/tools/feed_list_spec.rb +17 -0
metadata +12 -2

data/lib/spix_parser/tools/discovery.rb ADDED Viewed

@@ -0,0 +1,9 @@
+$:.unshift File.expand_path(File.dirname(__FILE__))
+module Spix
+  autoload :FeedDiscovery, 'feed_discovery'
+  module FeedDiscovery
+    autoload :Base, 'feed_discovery/base'
+    autoload :Feed, 'feed_discovery/feed'
+    autoload :Document, 'feed_discovery/document'
+  end
+end

data/lib/spix_parser/tools/feed_discovery/base.rb ADDED Viewed

@@ -0,0 +1,74 @@
+module Spix
+  module FeedDiscovery
+    class Base
+      # HTTP "User-Agent" header to send to servers when downloading feeds.
+      USER_AGENT = "SpixParser"
+      def initialize uri
+        self.uri = uri
+        self.document = FeedDiscovery::Document.new uri
+      end
+      def uri= value
+        @uri = URI.parse value
+      end
+      def uri
+        @uri
+      end
+      def document= value
+        @document = value
+      end
+      private :document=
+      def document
+        @document
+      end
+      private :document
+      def list
+        extract_feeds_from_anchors if html?
+        extract_feeds_from_links
+        include_it_self if feed?
+        items
+      end
+      def html?
+        document.html?
+      end
+      private :html?
+      def extract_feeds_from_anchors
+        document.feed_uris_from_anchors +
+        document.generic_uris_from_anchors.map { |uri|
+          FeedDiscovery::Document.new(uri).feed_uris_from_anchors
+        }.flatten.each { |uri|
+          items << Feed.new(uri, document.favicon)
+        }
+      end
+      private :extract_feeds_from_anchors
+      def extract_feeds_from_links
+        document.feed_uris_from_links.each { |uri|
+          items << Feed.new(uri, document.favicon)
+        }
+      end
+      private :extract_feeds_from_links
+      def include_it_self
+        items << Feed.new(uri.to_s, document.favicon)
+      end
+      def feed?
+        document.feed?
+      end
+      private :feed?
+      def items
+        @items ||= []
+      end
+    end
+  end
+end

data/lib/spix_parser/tools/feed_discovery/document.rb ADDED Viewed

@@ -0,0 +1,104 @@
+module Spix
+  module FeedDiscovery
+    class Document
+      def initialize uri
+        @uri = URI.parse uri
+        @document = Nokogiri::XML(content)
+      end
+      def content
+        if @uri.respond_to?(:read)
+          @uri.read
+        else
+          req_headers = {}
+          req_headers["User-Agent"] = USER_AGENT
+          open(@uri.to_s, req_headers).read
+        end
+      end
+      private :content
+      def feed_uris_from_anchors
+        @document.search("a").select { |anchor|
+          rss_or_atom_content_type_in? anchor
+        }.map { |node|
+          uri = @uri.merge node.get_attribute 'href'
+          uri.to_s
+        }
+      end
+      def generic_uris_from_anchors
+        @document.search("a").select { |anchor|
+          not rss_or_atom_content_type_in? anchor
+        }.map { |node|
+          uri = @uri.merge node.get_attribute 'href'
+          uri.to_s
+        }
+      end
+      def feed_uris_from_links
+        @document.search(
+          "link[@type='application/atom+xml']",
+          "link[@type='application/rss+xml']"
+        ).map { |node|
+          uri = @uri.merge node.get_attribute 'href'
+          uri.to_s
+        }
+      end
+      def rss_or_atom_content_type_in? anchor
+        req, path = request_and_path_using address_from anchor
+        resp = req.request_head path
+        resp['content-type'] =~ /rss|atom/
+      end
+      private :rss_or_atom_content_type_in?
+      def request_and_path_using address
+        uri = @uri.merge URI.parse address
+        req = Net::HTTP.new uri.host, uri.port
+        path = uri - uri.select(:scheme, :host).join("://")
+        return req, path.to_s
+      end
+      private :request_and_path_using
+      def address_from node
+        node.get_attribute("href")
+      end
+      private :address_from
+      def html?
+        @document.root.name == "html"
+      end
+      def feed?
+        %w[rss feed].include? @document.root.name
+      end
+      def favicon
+        shortcuts_in_document or shortcuts_from(base_path) or base_path.merge('favicon.ico').to_s
+      end
+      def shortcuts_in_document
+        shortcuts = @document.search('link[@rel*=shortcut]')
+        shortcuts.any? ? shortcuts : nil
+      end
+      private :shortcuts_in_document
+      def shortcuts_from base_path
+        doc = Nokogiri::HTML Net::HTTP.get base_path
+        doc.search('link[@rel*=shortcut]')
+      rescue Net::HTTPError, Net::HTTPFatalError
+        logger.warn "error opening favicon: #{$!}"
+        nil
+      end
+      private :shortcuts_from
+      def base_path
+        URI.parse @uri.select(:scheme, :host).join("://")
+      end
+      private :base_path
+    end
+  end
+end

data/lib/spix_parser/tools/feed_discovery/feed.rb ADDED Viewed

@@ -0,0 +1,30 @@
+module Spix
+  module FeedDiscovery
+    class Feed < Struct.new(:url, :favicon, :title)
+      def initialize url, favicon
+        self.url = url
+        self.favicon = favicon
+        self.title = get_title
+      end
+      def get_title
+        content.search('title').first.content
+      end
+      private :get_title
+      def content
+        req = Net::HTTP.new uri.host, uri.port
+        path = uri - uri.select(:scheme, :host).join("://")
+        resp = req.request_get path.to_s
+        Nokogiri::XML(resp.body)
+      end
+      private :content
+      def uri
+        URI.parse url
+      end
+      private :uri
+    end
+  end
+end

data/lib/spix_parser/tools/feed_discovery.rb CHANGED Viewed

@@ -1,94 +1,20 @@
-gem "feedzirra", ">=0.0.24"
 require "feedzirra"
 require "nokogiri"
 require "uri"
 require "open-uri"
 module Spix
-  class FeedDiscovery
+  module FeedDiscovery
+    extend self
-    # HTTP "User-Agent" header to send to servers when downloading feeds.
-    USER_AGENT = "SpixParser"
-    def self.feed?(uri)
+    def feed? uri
       Spix::Parser.parse(uri, :mode => :fetch) ? true : false
     end
-    def self.list(uri)
-      content = self.read(uri)
-      doc = Nokogiri::HTML(content)
-      # get page title
-      title = doc.search('title')[0].content
-      items = doc.search("//link[@type='application/atom+xml']", "//link[@type='application/rss+xml']").collect do |link|
-        url_object = URI::parse(uri).normalize
-        href = link.get_attribute(:href).to_s
-        feed_url_object = URI::parse(href)
-        if feed_url_object.relative?
-          # there's 2 types of relative URIs
-          # the ones based on a path (base: http://sitewithfeed.com/foo/, relative: feed.xml, feed: http://sitewithfeed.com/foo/feed.xml)
-          # and the ones based on the top domain (base: http://sitewithfeed.com/foo/, relative: /feed.xml, feed: http://sitewithfeed.com/feed.xml)
-          if feed_url_object.path.match(/^\//)
-            # when the feed_url_object is relative and starts with a "/" we should ignore the domain path
-            path = nil
-          else
-            # when the feed_url_object is relative and do not starts with a "/" we should use the domain path
-            if url_object.path.match(/\/$/)
-              # when the url_object ends with a "/" we should use it
-              path = url_object.path
-            else
-              # when the url_object do not ends with a "/" we should add it
-              path = url_object.path + "/"
-            end
-          end
-          href = "#{url_object.scheme}://" +
-                 "#{url_object.host}" +
-                 "#{path}" +
-                 "#{url_object.query}" +
-                 href
-        end
-        item = {
-          :title => link.get_attribute(:title) || title,
-          :url => href
-        }
-      end
-      if items.size == 0
-        # if there's no item found at the given URI, maybe it's a feed URI
-        if self.feed?(uri)
-          items = [
-                    {
-                      :title => title,
-                      :url => uri
-                    }
-                  ]
-        end
-      end
-      items
-    rescue
-      nil
+    def list uri
+      page = Base.new uri
+      page.list
     end
-    def self.read(uri)
-      if uri.respond_to?(:read)
-        content = uri.read
-      else
-        req_headers = {}
-        req_headers["User-Agent"] = USER_AGENT
-        content = open(uri, req_headers).read
-      end
-    end
   end
-end
+end

data/lib/spix_parser/version.rb CHANGED Viewed

@@ -3,8 +3,8 @@ module Spix
   module Parser
     module Version
       MAJOR = 1
-      MINOR = 5
-      TINY  = 4
+      MINOR = 6
+      TINY  = 1
       def self.current_version
         "#{MAJOR}.#{MINOR}.#{TINY}"

data/lib/spix_parser.rb CHANGED Viewed

@@ -29,7 +29,7 @@ require "spix_parser/custom_parsers/atom"
 require "spix_parser/custom_parsers/rss_entry"
 require "spix_parser/custom_parsers/rss"
-require "spix_parser/tools/feed_discovery"
+require "spix_parser/tools/discovery"
 if RUBY_VERSION < '1.9'
   $KCODE='u'

data/spec/spix_parser/tools/feed_discovery/document_spec.rb ADDED Viewed

@@ -0,0 +1,68 @@
+require 'spec_helper'
+describe Spix::FeedDiscovery::Document do
+  let(:document) { Spix::FeedDiscovery::Document.new @rss_uri }
+  describe '#feed_uris_from_anchors' do
+    it 'should return only uris from anchors' do
+      document.feed_uris_from_anchors.should eql expected_uris_inside('a.has_feed')
+    end
+  end
+  describe '#feed_uris_from_links' do
+    it 'should return only uris from links' do
+      document.feed_uris_from_links.should eql expected_uris_inside('link.has_feed')
+    end
+  end
+  describe '#generic_uris' do
+    it 'should return only ordinary uris from anchors' do
+      document.generic_uris_from_anchors.should eql expected_uris_inside('a.generic')
+    end
+  end
+  describe '#html?' do
+    it 'should return true if is a html document' do
+      content = load_fixture('rss_list.html')
+      Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
+      document.html?.should eql true
+    end
+    it 'should return false if is a rss/feed document' do
+      content = load_fixture('feed.rss')
+      Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
+      document.html?.should eql false
+    end
+  end
+  describe '#feed?' do
+    it 'should return true if a feed document' do
+      content = load_fixture('feed.rss')
+      Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
+      document.feed?.should eql true
+    end
+    it 'should return false if hot a html document' do
+      content = load_fixture('rss_list.html')
+      Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
+      document.feed?.should eql false
+    end
+  end
+  before :all do
+    @rss_uri = 'http://myfeed.com/rss_list.html'
+    @content = load_fixture("rss_list.html")
+    @document = Nokogiri::XML(@content)
+  end
+  before :each do
+    Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(@content)
+    FakeWeb.register_uri(:head, 'http://myfeed.com/has_feed.html', :content_type => 'application/atom+xml')
+    FakeWeb.register_uri(:head, 'http://myfeed.com/generic.html', :content_type => 'text/html' )
+  end
+end
+def expected_uris_inside to_search
+  @document.search(to_search).map { |node| URI.parse(@rss_uri).merge(node.get_attribute('href')).to_s }
+end

data/spec/spix_parser/tools/feed_discovery/feed_spec.rb ADDED Viewed

@@ -0,0 +1,29 @@
+require 'spec_helper'
+describe Spix::FeedDiscovery::Feed do
+  context 'given an expecific uri' do
+    let(:feed) { Spix::FeedDiscovery::Feed.new @feed_uri, @favicon_uri }
+    it 'should set the favicon' do
+      feed.favicon.should == @favicon_uri
+    end
+    it 'should set the url' do
+      feed.url.should == @feed_uri
+    end
+    it 'should set title' do
+      feed.title.should == @document.search('title').first.content
+    end
+    before :all do
+      @feed_uri = "http://myfeed.com/feed.rss"
+      @favicon_uri = "http://myfeed.com/images/favicon.ico"
+      @document = Nokogiri::XML load_fixture 'feed.rss'
+      FakeWeb.register_uri(:get, @feed_uri, :body => @document.to_s)
+    end
+  end
+end

data/spec/spix_parser/tools/feed_discovery_spec.rb CHANGED Viewed

@@ -2,13 +2,11 @@ require 'spec_helper'
 describe Spix::FeedDiscovery, "#list" do
-  before(:all) do
-    @domain_url = "http://sitewithfeed.com"
-  end
   describe "when the feed have an absolute URI" do
     it "should return the feed url" do
-      FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("absolute_uri.html"))
+      fake_requests_for :path_inside_content => '/html4-002.html',
+                        :resource_path => @domain_url,
+                        :content => load_fixture("absolute_uri.html")
       Spix::FeedDiscovery.list(@domain_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
     end
   end
@@ -16,30 +14,37 @@ describe Spix::FeedDiscovery, "#list" do
   describe "when the feed have a relative URI" do
     describe "which is relative to a path" do
       it "should return the feed url when the URI is at the top domain" do
-        FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("relative_uri.html"))
+        fake_requests_for :path_inside_content => '/html4-003.html',
+                          :resource_path => @domain_url,
+                          :content => load_fixture("relative_uri.html")
         Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/" + "html4-002.xml"
       end
       it "should return the feed url when the URI is inside a path" do
-        @path_url = "/foo/bar"
+        @path_url = "/foo/bar/"
         @feed_url = @domain_url + @path_url
-        FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("relative_uri.html"))
-        Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url + "/" + "html4-002.xml"
+        fake_requests_for :path_inside_content => 'html4-003.html',
+                          :resource_path => @feed_url,
+                          :content => load_fixture('relative_uri.html')
+        Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url +  "html4-002.xml"
       end
     end
     describe "which is relative to the top domain" do
       it "should return the feed url when the URI is at the top domain" do
-        FakeWeb.register_uri(:get, @domain_url, :body => load_fixture("relative_uri_top_domain.html"))
+        fake_requests_for :path_inside_content => '/html4-004.html',
+                          :resource_path => @domain_url,
+                          :content => load_fixture("relative_uri_top_domain.html")
         Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
       end
       it "should return the feed url when the URI is inside a path" do
-        @path_url = "/foo/bar"
+        @path_url = "/foo/bar/"
         @feed_url = @domain_url + @path_url
-        FakeWeb.register_uri(:get, @feed_url, :body => load_fixture("relative_uri_top_domain.html"))
+        fake_requests_for :path_inside_content => 'html4-004.html',
+                          :resource_path => @feed_url,
+                          :content => load_fixture("relative_uri_top_domain.html")
         Spix::FeedDiscovery.list(@feed_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
       end
     end
@@ -68,5 +73,43 @@ describe Spix::FeedDiscovery, "#list" do
       Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url
     end
+    it "should return the title of feed" do
+      feed_xml = load_fixture("feed_without_self_link.xml")
+      FakeWeb.register_uri(:get, @feed_url, :body => feed_xml)
+      # feedzirra doesn't work with fakeweb
+      feed = Feedzirra::Feed.parse(feed_xml)
+      Feedzirra::Feed.stub!(:fetch_and_parse).and_return(feed)
+      Spix::FeedDiscovery.list(@feed_url).first[:title].should == feed.feed_title
+    end
+    it "should return the title of feed when enclosed in CDATA" do
+      feed_xml = load_fixture("feed_with_content_in_cdata.xml")
+      FakeWeb.register_uri(:get, @feed_url, :body => feed_xml)
+      # feedzirra doesn't work with fakeweb
+      feed = Feedzirra::Feed.parse(feed_xml)
+      Feedzirra::Feed.stub!(:fetch_and_parse).and_return(feed)
+      Spix::FeedDiscovery.list(@feed_url).first[:title].should_not be_empty
+    end
   end
+  before(:all) do
+    @domain_url = "http://sitewithfeed.com"
+  end
+end
+def fake_requests_for options = {}
+  content = options.delete(:content)
+  path_inside_content = options.delete(:path_inside_content)
+  resource_path = options.delete(:resource_path)
+  FakeWeb.register_uri(:get, resource_path, :body => content)
+  FakeWeb.register_uri(:head, resource_path + path_inside_content, :content => 'text/html')
+  FakeWeb.register_uri(:get, resource_path + path_inside_content, :content => content)
 end

data/spec/spix_parser/tools/feed_list_spec.rb ADDED Viewed

@@ -0,0 +1,17 @@
+require 'spec_helper'
+describe Spix::FeedDiscoveryList do
+  let(:feed_discovery_list) { Spix::FeedDiscoveryList.new }
+  it "should inherit from array" do
+    feed_discovery_list.class.superclass.should == Array
+  end
+  describe "#invalids" do
+    it "should return an empty array from invalids accessor method" do
+      feed_discovery_list.invalids.should == []
+    end
+  end
+end

metadata CHANGED Viewed

@@ -2,7 +2,7 @@
 name: spix_parser
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 1.5.4
+  version: 1.6.1
 platform: ruby
 authors:
 - Marcelo Eden
@@ -13,7 +13,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-05-12 00:00:00 -03:00
+date: 2011-05-31 00:00:00 -03:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -100,6 +100,10 @@ files:
 - lib/spix_parser/custom_parsers/rss_entry.rb
 - lib/spix_parser/datetime.rb
 - lib/spix_parser/parser.rb
+- lib/spix_parser/tools/discovery.rb
+- lib/spix_parser/tools/feed_discovery/base.rb
+- lib/spix_parser/tools/feed_discovery/document.rb
+- lib/spix_parser/tools/feed_discovery/feed.rb
 - lib/spix_parser/tools/feed_discovery.rb
 - lib/spix_parser/tools/redirect_follower.rb
 - lib/spix_parser/version.rb
@@ -110,7 +114,10 @@ files:
 - lib/spix_parser.rb
 - spec/parser_spec.rb
 - spec/spix_parser/parser_spec.rb
+- spec/spix_parser/tools/feed_discovery/document_spec.rb
+- spec/spix_parser/tools/feed_discovery/feed_spec.rb
 - spec/spix_parser/tools/feed_discovery_spec.rb
+- spec/spix_parser/tools/feed_list_spec.rb
 - spec/spix_parser/utils_spec.rb
 has_rdoc: true
 homepage: http://github.com/busk/spix_parser
@@ -143,5 +150,8 @@ summary: FeedParser for Spix
 test_files:
 - spec/parser_spec.rb
 - spec/spix_parser/parser_spec.rb
+- spec/spix_parser/tools/feed_discovery/document_spec.rb
+- spec/spix_parser/tools/feed_discovery/feed_spec.rb
 - spec/spix_parser/tools/feed_discovery_spec.rb
+- spec/spix_parser/tools/feed_list_spec.rb
 - spec/spix_parser/utils_spec.rb