RubyGems - spix_parser - Versions diffs - 1.6.1 → 1.6.4 - Mend

spix_parser 1.6.1 → 1.6.4

Files changed (9) hide show

data/lib/spix_parser/tools/feed_discovery/base.rb +3 -23
data/lib/spix_parser/tools/feed_discovery/document.rb +106 -38
data/lib/spix_parser/tools/feed_discovery/feed.rb +18 -4
data/lib/spix_parser/tools/feed_discovery.rb +4 -2
data/lib/spix_parser/version.rb +1 -1
data/spec/spix_parser/tools/feed_discovery/document_spec.rb +42 -21
data/spec/spix_parser/tools/feed_discovery_spec.rb +25 -9
metadata +3 -4
data/spec/spix_parser/tools/feed_list_spec.rb +0 -17

data/lib/spix_parser/tools/feed_discovery/base.rb CHANGED Viewed

@@ -27,11 +27,8 @@ module Spix
       end
       private :document
-      def list
-        extract_feeds_from_anchors if html?
-        extract_feeds_from_links
-        include_it_self if feed?
-        items
+      def list &block
+        @document.feed_uris &block
       end
       def html?
@@ -39,24 +36,7 @@ module Spix
       end
       private :html?
-      def extract_feeds_from_anchors
-        document.feed_uris_from_anchors +
-        document.generic_uris_from_anchors.map { |uri|
-          FeedDiscovery::Document.new(uri).feed_uris_from_anchors
-        }.flatten.each { |uri|
-          items << Feed.new(uri, document.favicon)
-        }
-      end
-      private :extract_feeds_from_anchors
-      def extract_feeds_from_links
-        document.feed_uris_from_links.each { |uri|
-          items << Feed.new(uri, document.favicon)
-        }
-      end
-      private :extract_feeds_from_links
-      def include_it_self
+      def include_it_self &block
         items << Feed.new(uri.to_s, document.favicon)
       end

data/lib/spix_parser/tools/feed_discovery/document.rb CHANGED Viewed

@@ -16,53 +16,81 @@ module Spix
           open(@uri.to_s, req_headers).read
         end
       end
-      private :content
+      protected :content
-      def feed_uris_from_anchors
-        @document.search("a").select { |anchor|
-          rss_or_atom_content_type_in? anchor
-        }.map { |node|
-          uri = @uri.merge node.get_attribute 'href'
-          uri.to_s
-        }
-      end
-      def generic_uris_from_anchors
-        @document.search("a").select { |anchor|
-          not rss_or_atom_content_type_in? anchor
-        }.map { |node|
-          uri = @uri.merge node.get_attribute 'href'
-          uri.to_s
-        }
+      def feed_uris &block
+        items = feed_uris_from_links(&block) + feed_uris_from_anchors(&block)
+        items << feed_unsing_address(@uri, &block) if feed?
+        items
       end
-      def feed_uris_from_links
+      def feed_uris_from_links &block
         @document.search(
           "link[@type='application/atom+xml']",
           "link[@type='application/rss+xml']"
-        ).map { |node|
-          uri = @uri.merge node.get_attribute 'href'
-          uri.to_s
+        ).map { |node| feed_from node, &block }
+      end
+      private :feed_uris_from_links
+      def feed_uris_from_anchors &block
+        @document.search('a').select { |node|
+          valid_url_in? node
+        }.select { |node|
+          rss_or_atom_content_type_in? node
+        }.map { |node|
+          feed_from node, &block
         }
       end
+      private :feed_uris_from_anchors
+      def feed_from node, &block
+        uri = @uri.merge node.get_attribute 'href'
+        feed_unsing_address uri, &block
+      end
+      private :feed_from
+      def feed_unsing_address uri, &block
+        begin
+          Feed.new(uri, favicon).tap do |feed|
+            block.call feed if block_given?
+          end
+        rescue => error
+          error.tap do |e|
+            block.call e if block_given?
+          end
+        end
+      end
+      private :feed_unsing_address
+      def valid_url_in? anchor
+        uri = address_from anchor
+        uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host)
+      end
+      private :valid_url_in?
       def rss_or_atom_content_type_in? anchor
         req, path = request_and_path_using address_from anchor
         resp = req.request_head path
         resp['content-type'] =~ /rss|atom/
+      rescue
+        true
       end
       private :rss_or_atom_content_type_in?
-      def request_and_path_using address
-        uri = @uri.merge URI.parse address
+      def request_and_path_using uri
         req = Net::HTTP.new uri.host, uri.port
-        path = uri - uri.select(:scheme, :host).join("://")
-        return req, path.to_s
+        return req, path_from(uri) || uri.to_s
       end
       private :request_and_path_using
+      def path_from uri
+        path = uri - uri.select(:scheme, :host).join("://")
+        path.to_s unless path.to_s.blank?
+      end
+      private :path_from
       def address_from node
-        node.get_attribute("href")
+        @uri.merge URI.parse node.get_attribute("href")
       end
       private :address_from
@@ -75,30 +103,70 @@ module Spix
       end
       def favicon
-        shortcuts_in_document or shortcuts_from(base_path) or base_path.merge('favicon.ico').to_s
+        shortcut_in_document or shortcut_from_original_page or shortcut_from(base_uri) or default_favico_if_exist
       end
-      def shortcuts_in_document
-        shortcuts = @document.search('link[@rel*=shortcut]')
-        shortcuts.any? ? shortcuts : nil
+      def shortcut_in_document
+        shortcuts = find_shortcut_in @document
+        shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
       end
-      private :shortcuts_in_document
+      private :shortcut_in_document
+      def shortcut_from_original_page
+        if feed?
+          if node = @document.search('link').first
+            path = URI.parse node.content.strip
+            shortcut_from URI.parse path.select(:scheme, :host).join("://") rescue nil
+          end
+        end
+      end
+      private :shortcut_from_original_page
-      def shortcuts_from base_path
-        doc = Nokogiri::HTML Net::HTTP.get base_path
-        doc.search('link[@rel*=shortcut]')
+      def shortcut_from base_uri
+        doc = get base_uri
+        shortcuts = find_shortcut_in doc
+        shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
       rescue Net::HTTPError, Net::HTTPFatalError
         logger.warn "error opening favicon: #{$!}"
         nil
       end
-      private :shortcuts_from
+      private :shortcut_from
+      def find_shortcut_in doc
+        doc.xpath(
+          '//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "shortcut")]',
+          '//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "ico")]'
+        ).map { |node| node.get_attribute "href" }
+      end
-      def base_path
-        URI.parse @uri.select(:scheme, :host).join("://")
+      def default_favico_if_exist
+        http = Net::HTTP.new base_uri.host, base_uri.port
+        resp = http.request_head 'favicon.ico'
+        base_uri.merge('favicon.ico').to_s unless resp.kind_of? Net::HTTPError
+      rescue
+        nil
+      end
+      private :default_favico_if_exist
+      def get uri
+        resp = Net::HTTP.get_response uri
+        doc = Nokogiri::HTML(resp.body)
+        if resp.kind_of?(Net::HTTPRedirection) or (refresh_metatags = Nokogiri::HTML(resp.body).search('meta[@http-equiv=REFRESH]')).any?
+          path = resp['location'] || refresh_metatags.first.get_attribute('content')[/http:\/\/.*/]
+          get URI.parse path
+        else
+          doc
+        end
+      rescue
+        Nokogiri::HTML('')
       end
-      private :base_path
+      def base_uri
+        @base_uri ||= URI.parse @uri.select(:scheme, :host).join("://")
+      end
+      private :base_uri
     end
   end
 end

data/lib/spix_parser/tools/feed_discovery/feed.rb CHANGED Viewed

@@ -1,15 +1,26 @@
 module Spix
   module FeedDiscovery
-    class Feed < Struct.new(:url, :favicon, :title)
+    class Feed < Hash
       def initialize url, favicon
-        self.url = url
+        self.url = url.to_s
         self.favicon = favicon
         self.title = get_title
       end
+      %w[url favicon title].each do |attr|
+        define_method attr do
+          self[attr.to_sym]
+        end
+        define_method "#{attr}=" do |value|
+          self[attr.to_sym] = value
+        end
+      end
       def get_title
-        content.search('title').first.content
+        node = content.search('title').first
+        node.content if node
       end
       private :get_title
@@ -22,9 +33,12 @@ module Spix
       private :content
       def uri
-        URI.parse url
+        @uri ||= URI.parse url
       end
       private :uri
+      def to_hash
+      end
     end
   end
 end

data/lib/spix_parser/tools/feed_discovery.rb CHANGED Viewed

@@ -11,9 +11,11 @@ module Spix
       Spix::Parser.parse(uri, :mode => :fetch) ? true : false
     end
-    def list uri
+    def list uri, &block
       page = Base.new uri
-      page.list
+      page.list &block
+    rescue => error
+      [error]
     end
   end

data/lib/spix_parser/version.rb CHANGED Viewed

@@ -4,7 +4,7 @@ module Spix
     module Version
       MAJOR = 1
       MINOR = 6
-      TINY  = 1
+      TINY  = 4
       def self.current_version
         "#{MAJOR}.#{MINOR}.#{TINY}"

data/spec/spix_parser/tools/feed_discovery/document_spec.rb CHANGED Viewed

@@ -4,22 +4,27 @@ describe Spix::FeedDiscovery::Document do
   let(:document) { Spix::FeedDiscovery::Document.new @rss_uri }
-  describe '#feed_uris_from_anchors' do
-    it 'should return only uris from anchors' do
-      document.feed_uris_from_anchors.should eql expected_uris_inside('a.has_feed')
-    end
-  end
+  describe '#feed_uris' do
-  describe '#feed_uris_from_links' do
-    it 'should return only uris from links' do
-      document.feed_uris_from_links.should eql expected_uris_inside('link.has_feed')
-    end
-  end
+    context 'when the uri exists' do
+      it 'should return only feed uris' do
+        document.should_receive(:feed_uris).and_return expected_feeds
+        document.feed_uris
+      end
+      it 'should yield feed with correct uri content' do
+        document.feed_uris do |feed|
+          expected_feeds.should include(feed)
+        end
+      end
+      before :each do
+        stub_requests
+      end
-  describe '#generic_uris' do
-    it 'should return only ordinary uris from anchors' do
-      document.generic_uris_from_anchors.should eql expected_uris_inside('a.generic')
     end
   end
   describe '#html?' do
@@ -34,6 +39,11 @@ describe Spix::FeedDiscovery::Document do
       Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
       document.html?.should eql false
     end
+    before :each do
+      stub_requests
+    end
   end
   describe '#feed?' do
@@ -48,21 +58,32 @@ describe Spix::FeedDiscovery::Document do
       Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
       document.feed?.should eql false
     end
+    before :each do
+      stub_requests
+    end
   end
   before :all do
-    @rss_uri = 'http://myfeed.com/rss_list.html'
+    @domain = 'http://diveintomark.org'
+    @rss_uri = @domain + '/rss_list.html'
     @content = load_fixture("rss_list.html")
     @document = Nokogiri::XML(@content)
   end
-  before :each do
-    Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(@content)
-    FakeWeb.register_uri(:head, 'http://myfeed.com/has_feed.html', :content_type => 'application/atom+xml')
-    FakeWeb.register_uri(:head, 'http://myfeed.com/generic.html', :content_type => 'text/html' )
-  end
 end
-def expected_uris_inside to_search
-  @document.search(to_search).map { |node| URI.parse(@rss_uri).merge(node.get_attribute('href')).to_s }
+def stub_requests
+  Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(@content)
+  %w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].each { |path|
+    FakeWeb.register_uri(:head, @domain + '/' + path, :content_type => 'application/atom+xml')
+    FakeWeb.register_uri(:get, @domain + '/' + path, :body => load_fixture(path))
+  }
+  FakeWeb.register_uri(:head, @domain + '/generic.html', :content_type => 'text/html' )
+end
+def expected_feeds
+  @feeds ||= %w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].map { |path|
+    Spix::FeedDiscovery::Feed.new(@domain + '/' + path, @domain + '/images/favicon.ico')
+  }
 end

data/spec/spix_parser/tools/feed_discovery_spec.rb CHANGED Viewed

@@ -4,7 +4,8 @@ describe Spix::FeedDiscovery, "#list" do
   describe "when the feed have an absolute URI" do
     it "should return the feed url" do
-      fake_requests_for :path_inside_content => '/html4-002.html',
+      fake_requests_for :ignore => ['/html4-002.html'],
+                        :accept => ['/tests/client/autodiscovery/html4-001.xml'],
                         :resource_path => @domain_url,
                         :content => load_fixture("absolute_uri.html")
       Spix::FeedDiscovery.list(@domain_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
@@ -14,7 +15,8 @@ describe Spix::FeedDiscovery, "#list" do
   describe "when the feed have a relative URI" do
     describe "which is relative to a path" do
       it "should return the feed url when the URI is at the top domain" do
-        fake_requests_for :path_inside_content => '/html4-003.html',
+        fake_requests_for :ignore => ['/html4-003.html'],
+                          :accept => ['/html4-002.xml'],
                           :resource_path => @domain_url,
                           :content => load_fixture("relative_uri.html")
         Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/" + "html4-002.xml"
@@ -23,7 +25,8 @@ describe Spix::FeedDiscovery, "#list" do
       it "should return the feed url when the URI is inside a path" do
         @path_url = "/foo/bar/"
         @feed_url = @domain_url + @path_url
-        fake_requests_for :path_inside_content => 'html4-003.html',
+        fake_requests_for :ignore => ['html4-003.html'],
+                          :accept => ['html4-002.xml'],
                           :resource_path => @feed_url,
                           :content => load_fixture('relative_uri.html')
         Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url +  "html4-002.xml"
@@ -32,7 +35,8 @@ describe Spix::FeedDiscovery, "#list" do
     describe "which is relative to the top domain" do
       it "should return the feed url when the URI is at the top domain" do
-        fake_requests_for :path_inside_content => '/html4-004.html',
+        fake_requests_for :ignore => ['/html4-004.html'],
+                          :accept => ['/html4-003.xml'],
                           :resource_path => @domain_url,
                           :content => load_fixture("relative_uri_top_domain.html")
         Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
@@ -42,7 +46,8 @@ describe Spix::FeedDiscovery, "#list" do
         @path_url = "/foo/bar/"
         @feed_url = @domain_url + @path_url
-        fake_requests_for :path_inside_content => 'html4-004.html',
+        fake_requests_for :ignore => ['/html4-004.html'],
+                          :accept => ['/html4-003.xml'],
                           :resource_path => @feed_url,
                           :content => load_fixture("relative_uri_top_domain.html")
         Spix::FeedDiscovery.list(@feed_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
@@ -100,16 +105,27 @@ describe Spix::FeedDiscovery, "#list" do
   end
   before(:all) do
-    @domain_url = "http://sitewithfeed.com"
+    @domain_url = "http://diveintomark.org"
   end
 end
 def fake_requests_for options = {}
   content = options.delete(:content)
-  path_inside_content = options.delete(:path_inside_content)
+  ignore = options.delete(:ignore)
+  accept = options.delete(:accept)
   resource_path = options.delete(:resource_path)
   FakeWeb.register_uri(:get, resource_path, :body => content)
-  FakeWeb.register_uri(:head, resource_path + path_inside_content, :content => 'text/html')
-  FakeWeb.register_uri(:get, resource_path + path_inside_content, :content => content)
+  ignore.each do |path|
+    FakeWeb.register_uri(:head, resource_path + path, :content => 'text/html')
+  end
+  accept.each do |path|
+    FakeWeb.register_uri(:head, resource_path + path, :content => 'application/atom+xml')
+    FakeWeb.register_uri(:get, resource_path + path, :body => load_fixture('feed.rss'))
+  end
 end

metadata CHANGED Viewed

@@ -2,9 +2,10 @@
 name: spix_parser
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 1.6.1
+  version: 1.6.4
 platform: ruby
 authors:
+- Marcio Lopes de Faria
 - Marcelo Eden
 - Fabio Mont'Alegre
 - "Lucas H\xC3\xBAngaro"
@@ -13,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-05-31 00:00:00 -03:00
+date: 2011-06-03 00:00:00 -03:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -117,7 +118,6 @@ files:
 - spec/spix_parser/tools/feed_discovery/document_spec.rb
 - spec/spix_parser/tools/feed_discovery/feed_spec.rb
 - spec/spix_parser/tools/feed_discovery_spec.rb
-- spec/spix_parser/tools/feed_list_spec.rb
 - spec/spix_parser/utils_spec.rb
 has_rdoc: true
 homepage: http://github.com/busk/spix_parser
@@ -153,5 +153,4 @@ test_files:
 - spec/spix_parser/tools/feed_discovery/document_spec.rb
 - spec/spix_parser/tools/feed_discovery/feed_spec.rb
 - spec/spix_parser/tools/feed_discovery_spec.rb
-- spec/spix_parser/tools/feed_list_spec.rb
 - spec/spix_parser/utils_spec.rb

data/spec/spix_parser/tools/feed_list_spec.rb DELETED Viewed

@@ -1,17 +0,0 @@
-require 'spec_helper'
-describe Spix::FeedDiscoveryList do
-  let(:feed_discovery_list) { Spix::FeedDiscoveryList.new }
-  it "should inherit from array" do
-    feed_discovery_list.class.superclass.should == Array
-  end
-  describe "#invalids" do
-    it "should return an empty array from invalids accessor method" do
-      feed_discovery_list.invalids.should == []
-    end
-  end
-end