RubyGems - spix_parser - Versions diffs - 1.6.1 → 1.6.4 - Mend

spix_parser 1.6.1 → 1.6.4

This diff shows the differences in content between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

data/lib/spix_parser/tools/feed_discovery/base.rb +3 -23
data/lib/spix_parser/tools/feed_discovery/document.rb +106 -38
data/lib/spix_parser/tools/feed_discovery/feed.rb +18 -4
data/lib/spix_parser/tools/feed_discovery.rb +4 -2
data/lib/spix_parser/version.rb +1 -1
data/spec/spix_parser/tools/feed_discovery/document_spec.rb +42 -21
data/spec/spix_parser/tools/feed_discovery_spec.rb +25 -9
metadata +3 -4
data/spec/spix_parser/tools/feed_list_spec.rb +0 -17

data/lib/spix_parser/tools/feed_discovery/base.rb CHANGED Viewed

@@ -27,11 +27,8 @@ module Spix
       end
       private :document
-      def list
-        extract_feeds_from_anchors if html?
-        extract_feeds_from_links
-        include_it_self if feed?
-        items
+      def list &block
+        @document.feed_uris &block
       end
       def html?
@@ -39,24 +36,7 @@ module Spix
       end
       private :html?
-      def extract_feeds_from_anchors
-        document.feed_uris_from_anchors +
-        document.generic_uris_from_anchors.map { |uri|
-          FeedDiscovery::Document.new(uri).feed_uris_from_anchors
-        }.flatten.each { |uri|
-          items << Feed.new(uri, document.favicon)
-        }
-      end
-      private :extract_feeds_from_anchors
-      def extract_feeds_from_links
-        document.feed_uris_from_links.each { |uri|
-          items << Feed.new(uri, document.favicon)
-        }
-      end
-      private :extract_feeds_from_links
-      def include_it_self
+      def include_it_self &block
         items << Feed.new(uri.to_s, document.favicon)
       end

data/lib/spix_parser/tools/feed_discovery/document.rb CHANGED Viewed

@@ -16,53 +16,81 @@ module Spix
           open(@uri.to_s, req_headers).read
         end
       end
-      private :content
+      protected :content
-      def feed_uris_from_anchors
-        @document.search("a").select { |anchor|
-          rss_or_atom_content_type_in? anchor
-        }.map { |node|
-          uri = @uri.merge node.get_attribute 'href'
-          uri.to_s
-        }
-      end
-      def generic_uris_from_anchors
-        @document.search("a").select { |anchor|
-          not rss_or_atom_content_type_in? anchor
-        }.map { |node|
-          uri = @uri.merge node.get_attribute 'href'
-          uri.to_s
-        }
+      def feed_uris &block
+        items = feed_uris_from_links(&block) + feed_uris_from_anchors(&block)
+        items << feed_unsing_address(@uri, &block) if feed?
+        items
       end
-      def feed_uris_from_links
+      def feed_uris_from_links &block
         @document.search(
           "link[@type='application/atom+xml']",
           "link[@type='application/rss+xml']"
-        ).map { |node|
-          uri = @uri.merge node.get_attribute 'href'
-          uri.to_s
+        ).map { |node| feed_from node, &block }
+      end
+      private :feed_uris_from_links
+      def feed_uris_from_anchors &block
+        @document.search('a').select { |node|
+          valid_url_in? node
+        }.select { |node|
+          rss_or_atom_content_type_in? node
+        }.map { |node|
+          feed_from node, &block
         }
       end
+      private :feed_uris_from_anchors
+      def feed_from node, &block
+        uri = @uri.merge node.get_attribute 'href'
+        feed_unsing_address uri, &block
+      end
+      private :feed_from
+      def feed_unsing_address uri, &block
+        begin
+          Feed.new(uri, favicon).tap do |feed|
+            block.call feed if block_given?
+          end
+        rescue => error
+          error.tap do |e|
+            block.call e if block_given?
+          end
+        end
+      end
+      private :feed_unsing_address
+      def valid_url_in? anchor
+        uri = address_from anchor
+        uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host)
+      end
+      private :valid_url_in?
       def rss_or_atom_content_type_in? anchor
         req, path = request_and_path_using address_from anchor
         resp = req.request_head path
         resp['content-type'] =~ /rss|atom/
+      rescue
+        true
       end
       private :rss_or_atom_content_type_in?
-      def request_and_path_using address
-        uri = @uri.merge URI.parse address
+      def request_and_path_using uri
         req = Net::HTTP.new uri.host, uri.port
-        path = uri - uri.select(:scheme, :host).join("://")
-        return req, path.to_s
+        return req, path_from(uri) || uri.to_s
       end
       private :request_and_path_using
+      def path_from uri
+        path = uri - uri.select(:scheme, :host).join("://")
+        path.to_s unless path.to_s.blank?
+      end
+      private :path_from
       def address_from node
-        node.get_attribute("href")
+        @uri.merge URI.parse node.get_attribute("href")
       end
       private :address_from
@@ -75,30 +103,70 @@ module Spix
       end
       def favicon
-        shortcuts_in_document or shortcuts_from(base_path) or base_path.merge('favicon.ico').to_s
+        shortcut_in_document or shortcut_from_original_page or shortcut_from(base_uri) or default_favico_if_exist
       end
-      def shortcuts_in_document
-        shortcuts = @document.search('link[@rel*=shortcut]')
-        shortcuts.any? ? shortcuts : nil
+      def shortcut_in_document
+        shortcuts = find_shortcut_in @document
+        shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
       end
-      private :shortcuts_in_document
+      private :shortcut_in_document
+      def shortcut_from_original_page
+        if feed?
+          if node = @document.search('link').first
+            path = URI.parse node.content.strip
+            shortcut_from URI.parse path.select(:scheme, :host).join("://") rescue nil
+          end
+        end
+      end
+      private :shortcut_from_original_page
-      def shortcuts_from base_path
-        doc = Nokogiri::HTML Net::HTTP.get base_path
-        doc.search('link[@rel*=shortcut]')
+      def shortcut_from base_uri
+        doc = get base_uri
+        shortcuts = find_shortcut_in doc
+        shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
       rescue Net::HTTPError, Net::HTTPFatalError
         logger.warn "error opening favicon: #{$!}"
         nil
       end
-      private :shortcuts_from
+      private :shortcut_from
+      def find_shortcut_in doc
+        doc.xpath(
+          '//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "shortcut")]',
+          '//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "ico")]'
+        ).map { |node| node.get_attribute "href" }
+      end
-      def base_path
-        URI.parse @uri.select(:scheme, :host).join("://")
+      def default_favico_if_exist
+        http = Net::HTTP.new base_uri.host, base_uri.port
+        resp = http.request_head 'favicon.ico'
+        base_uri.merge('favicon.ico').to_s unless resp.kind_of? Net::HTTPError
+      rescue
+        nil
+      end
+      private :default_favico_if_exist
+      def get uri
+        resp = Net::HTTP.get_response uri
+        doc = Nokogiri::HTML(resp.body)
+        if resp.kind_of?(Net::HTTPRedirection) or (refresh_metatags = Nokogiri::HTML(resp.body).search('meta[@http-equiv=REFRESH]')).any?
+          path = resp['location'] || refresh_metatags.first.get_attribute('content')[/http:\/\/.*/]
+          get URI.parse path
+        else
+          doc
+        end
+      rescue
+        Nokogiri::HTML('')
       end
-      private :base_path
+      def base_uri
+        @base_uri ||= URI.parse @uri.select(:scheme, :host).join("://")
+      end
+      private :base_uri
     end
   end
 end

data/lib/spix_parser/tools/feed_discovery/feed.rb CHANGED Viewed

@@ -1,15 +1,26 @@
 module Spix
   module FeedDiscovery
-    class Feed < Struct.new(:url, :favicon, :title)
+    class Feed < Hash
       def initialize url, favicon
-        self.url = url
+        self.url = url.to_s
         self.favicon = favicon
         self.title = get_title
       end
+      %w[url favicon title].each do |attr|
+        define_method attr do
+          self[attr.to_sym]
+        end
+        define_method "#{attr}=" do |value|
+          self[attr.to_sym] = value
+        end
+      end
       def get_title
-        content.search('title').first.content
+        node = content.search('title').first
+        node.content if node
       end
       private :get_title
@@ -22,9 +33,12 @@ module Spix
       private :content
       def uri
-        URI.parse url
+        @uri ||= URI.parse url
       end
       private :uri
+      def to_hash
+      end
     end
   end
 end

data/lib/spix_parser/tools/feed_discovery.rb CHANGED Viewed

@@ -11,9 +11,11 @@ module Spix
       Spix::Parser.parse(uri, :mode => :fetch) ? true : false
     end
-    def list uri
+    def list uri, &block
       page = Base.new uri
-      page.list
+      page.list &block
+    rescue => error
+      [error]
     end
   end

data/lib/spix_parser/version.rb CHANGED Viewed

@@ -4,7 +4,7 @@ module Spix
     module Version
       MAJOR = 1
       MINOR = 6
-      TINY  = 1
+      TINY  = 4
       def self.current_version
         "#{MAJOR}.#{MINOR}.#{TINY}"

data/spec/spix_parser/tools/feed_discovery/document_spec.rb CHANGED Viewed

@@ -4,22 +4,27 @@ describe Spix::FeedDiscovery::Document do
   let(:document) { Spix::FeedDiscovery::Document.new @rss_uri }
-  describe '#feed_uris_from_anchors' do
-    it 'should return only uris from anchors' do
-      document.feed_uris_from_anchors.should eql expected_uris_inside('a.has_feed')
-    end
-  end
+  describe '#feed_uris' do
-  describe '#feed_uris_from_links' do
-    it 'should return only uris from links' do
-      document.feed_uris_from_links.should eql expected_uris_inside('link.has_feed')
-    end
-  end
+    context 'when the uri exists' do
+      it 'should return only feed uris' do
+        document.should_receive(:feed_uris).and_return expected_feeds
+        document.feed_uris
+      end
+      it 'should yield feed with correct uri content' do
+        document.feed_uris do |feed|
+          expected_feeds.should include(feed)
+        end
+      end
+      before :each do
+        stub_requests
+      end
-  describe '#generic_uris' do
-    it 'should return only ordinary uris from anchors' do
-      document.generic_uris_from_anchors.should eql expected_uris_inside('a.generic')
     end
   end
   describe '#html?' do
@@ -34,6 +39,11 @@ describe Spix::FeedDiscovery::Document do
       Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
       document.html?.should eql false
     end
+    before :each do
+      stub_requests
+    end
   end
   describe '#feed?' do
@@ -48,21 +58,32 @@ describe Spix::FeedDiscovery::Document do
       Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
       document.feed?.should eql false
     end
+    before :each do
+      stub_requests
+    end
   end
   before :all do
-    @rss_uri = 'http://myfeed.com/rss_list.html'
+    @domain = 'http://diveintomark.org'
+    @rss_uri = @domain + '/rss_list.html'
     @content = load_fixture("rss_list.html")
     @document = Nokogiri::XML(@content)
   end
-  before :each do
-    Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(@content)
-    FakeWeb.register_uri(:head, 'http://myfeed.com/has_feed.html', :content_type => 'application/atom+xml')
-    FakeWeb.register_uri(:head, 'http://myfeed.com/generic.html', :content_type => 'text/html' )
-  end
 end
-def expected_uris_inside to_search
-  @document.search(to_search).map { |node| URI.parse(@rss_uri).merge(node.get_attribute('href')).to_s }
+def stub_requests
+  Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(@content)
+  %w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].each { |path|
+    FakeWeb.register_uri(:head, @domain + '/' + path, :content_type => 'application/atom+xml')
+    FakeWeb.register_uri(:get, @domain + '/' + path, :body => load_fixture(path))
+  }
+  FakeWeb.register_uri(:head, @domain + '/generic.html', :content_type => 'text/html' )
+end
+def expected_feeds
+  @feeds ||= %w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].map { |path|
+    Spix::FeedDiscovery::Feed.new(@domain + '/' + path, @domain + '/images/favicon.ico')
+  }
 end

data/spec/spix_parser/tools/feed_discovery_spec.rb CHANGED Viewed

@@ -4,7 +4,8 @@ describe Spix::FeedDiscovery, "#list" do
   describe "when the feed have an absolute URI" do
     it "should return the feed url" do
-      fake_requests_for :path_inside_content => '/html4-002.html',
+      fake_requests_for :ignore => ['/html4-002.html'],
+                        :accept => ['/tests/client/autodiscovery/html4-001.xml'],
                         :resource_path => @domain_url,
                         :content => load_fixture("absolute_uri.html")
       Spix::FeedDiscovery.list(@domain_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
@@ -14,7 +15,8 @@ describe Spix::FeedDiscovery, "#list" do
   describe "when the feed have a relative URI" do
     describe "which is relative to a path" do
       it "should return the feed url when the URI is at the top domain" do
-        fake_requests_for :path_inside_content => '/html4-003.html',
+        fake_requests_for :ignore => ['/html4-003.html'],
+                          :accept => ['/html4-002.xml'],
                           :resource_path => @domain_url,
                           :content => load_fixture("relative_uri.html")
         Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/" + "html4-002.xml"
@@ -23,7 +25,8 @@ describe Spix::FeedDiscovery, "#list" do
       it "should return the feed url when the URI is inside a path" do
         @path_url = "/foo/bar/"
         @feed_url = @domain_url + @path_url
-        fake_requests_for :path_inside_content => 'html4-003.html',
+        fake_requests_for :ignore => ['html4-003.html'],
+                          :accept => ['html4-002.xml'],
                           :resource_path => @feed_url,
                           :content => load_fixture('relative_uri.html')
         Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url +  "html4-002.xml"
@@ -32,7 +35,8 @@ describe Spix::FeedDiscovery, "#list" do
     describe "which is relative to the top domain" do
       it "should return the feed url when the URI is at the top domain" do
-        fake_requests_for :path_inside_content => '/html4-004.html',
+        fake_requests_for :ignore => ['/html4-004.html'],
+                          :accept => ['/html4-003.xml'],
                           :resource_path => @domain_url,
                           :content => load_fixture("relative_uri_top_domain.html")
         Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
@@ -42,7 +46,8 @@ describe Spix::FeedDiscovery, "#list" do
         @path_url = "/foo/bar/"
         @feed_url = @domain_url + @path_url
-        fake_requests_for :path_inside_content => 'html4-004.html',
+        fake_requests_for :ignore => ['/html4-004.html'],
+                          :accept => ['/html4-003.xml'],
                           :resource_path => @feed_url,
                           :content => load_fixture("relative_uri_top_domain.html")
         Spix::FeedDiscovery.list(@feed_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
@@ -100,16 +105,27 @@ describe Spix::FeedDiscovery, "#list" do
   end
   before(:all) do
-    @domain_url = "http://sitewithfeed.com"
+    @domain_url = "http://diveintomark.org"
   end
 end
 def fake_requests_for options = {}
   content = options.delete(:content)
-  path_inside_content = options.delete(:path_inside_content)
+  ignore = options.delete(:ignore)
+  accept = options.delete(:accept)
   resource_path = options.delete(:resource_path)
   FakeWeb.register_uri(:get, resource_path, :body => content)
-  FakeWeb.register_uri(:head, resource_path + path_inside_content, :content => 'text/html')
-  FakeWeb.register_uri(:get, resource_path + path_inside_content, :content => content)
+  ignore.each do |path|
+    FakeWeb.register_uri(:head, resource_path + path, :content => 'text/html')
+  end
+  accept.each do |path|
+    FakeWeb.register_uri(:head, resource_path + path, :content => 'application/atom+xml')
+    FakeWeb.register_uri(:get, resource_path + path, :body => load_fixture('feed.rss'))
+  end
 end

metadata CHANGED Viewed

@@ -2,9 +2,10 @@
 name: spix_parser
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 1.6.1
+  version: 1.6.4
 platform: ruby
 authors:
+- Marcio Lopes de Faria
 - Marcelo Eden
 - Fabio Mont'Alegre
 - "Lucas H\xC3\xBAngaro"
@@ -13,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-05-31 00:00:00 -03:00
+date: 2011-06-03 00:00:00 -03:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -117,7 +118,6 @@ files:
 - spec/spix_parser/tools/feed_discovery/document_spec.rb
 - spec/spix_parser/tools/feed_discovery/feed_spec.rb
 - spec/spix_parser/tools/feed_discovery_spec.rb
-- spec/spix_parser/tools/feed_list_spec.rb
 - spec/spix_parser/utils_spec.rb
 has_rdoc: true
 homepage: http://github.com/busk/spix_parser
@@ -153,5 +153,4 @@ test_files:
 - spec/spix_parser/tools/feed_discovery/document_spec.rb
 - spec/spix_parser/tools/feed_discovery/feed_spec.rb
 - spec/spix_parser/tools/feed_discovery_spec.rb
-- spec/spix_parser/tools/feed_list_spec.rb
 - spec/spix_parser/utils_spec.rb

data/spec/spix_parser/tools/feed_list_spec.rb DELETED Viewed

@@ -1,17 +0,0 @@
-require 'spec_helper'
-describe Spix::FeedDiscoveryList do
-  let(:feed_discovery_list) { Spix::FeedDiscoveryList.new }
-  it "should inherit from array" do
-    feed_discovery_list.class.superclass.should == Array
-  end
-  describe "#invalids" do
-    it "should return an empty array from invalids accessor method" do
-      feed_discovery_list.invalids.should == []
-    end
-  end
-end