RubyGems - spix_parser - Versions diffs - 1.6.6 → 1.6.7 - Mend

spix_parser 1.6.6 → 1.6.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

data/lib/spix_parser/tools/feed_discovery.rb +5 -2
data/lib/spix_parser/tools/feed_discovery/base.rb +4 -0
data/lib/spix_parser/tools/feed_discovery/document.rb +100 -36
data/lib/spix_parser/tools/feed_discovery/feed.rb +33 -22
data/lib/spix_parser/version.rb +1 -1
data/spec/spix_parser/tools/feed_discovery/document_spec.rb +17 -13
data/spec/spix_parser/tools/feed_discovery/feed_spec.rb +4 -7
metadata +2 -2

data/lib/spix_parser/tools/feed_discovery.rb CHANGED

@@ -14,8 +14,11 @@ module Spix
     def list uri, &block
       page = Base.new uri
       page.list &block
-    rescue => error
-      [error]
+    end
+    def list_uris_finded_in uri
+      page = Base.new uri
+      page.list_uris
     end
   end

data/lib/spix_parser/tools/feed_discovery/base.rb CHANGED

@@ -28,6 +28,10 @@ module Spix
       private :document
       def list &block
+        @document.feeds &block
+      end
+      def list_uris &block
         @document.feed_uris &block
       end

data/lib/spix_parser/tools/feed_discovery/document.rb CHANGED

@@ -2,86 +2,146 @@ module Spix
   module FeedDiscovery
     class Document
-      def initialize uri
-        @uri = URI.parse uri
-        @document = Nokogiri::XML(content)
+      def initialize uri_name
+        @uri = parse_uri uri_name
       end
-      def content
-        if @uri.respond_to?(:read)
-          @uri.read
+      def load_content
+        @document ||= content_from @uri.to_s
+      end
+      def content_from uri_name, limit=10
+        raise ArgumentError, 'HTTP redirect too deep' if limit == 0
+        uri = parse_uri uri_name
+        connection, path = connection_and_path_using uri
+        response = connection.request_get path
+        content = Nokogiri::XML response.body
+        if response.kind_of? Net::HTTPRedirection
+          content_from response['location'], limit - 1
+        elsif meta_refresh = content.search('//meta[contains(translate(@http-equiv, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "refresh")]').first
+          content_from meta_refresh.get_attribute('content')[/http:\/\/.*/], limit - 1;
         else
-          req_headers = {}
-          req_headers["User-Agent"] = USER_AGENT
-          open(@uri.to_s, req_headers).read
+          content
         end
       end
-      protected :content
+      protected :content_from
       def feed_uris &block
-        items = feed_uris_from_links(&block) + feed_uris_from_anchors(&block)
+        items = []
+        load_content
+        items = uris_from_links(&block) + uris_from_anchors(&block)
+        items << Feed.new(@uri).tap { |uri|
+          block.call uri if block_given?
+        } if feed?
+      rescue => error
+        items << feed_exception_from(error)
+      ensure
+        return items
+      end
+      def feeds &block
+        items = []
+        load_content
+        items = feeds_from_links(&block) + feeds_from_anchors(&block)
         items << feed_unsing_address(@uri, &block) if feed?
-        items
+      rescue => error
+        items << feed_exception_from(error)
+      ensure
+        return items
+      end
+      def feed_exception_from error, &block
+        Feed.new(@uri).tap { |item|
+          item.exceptions = [error.to_s]
+          block.call item if block_given?
+        }
       end
-      def feed_uris_from_links &block
+      def feeds_from_links &block
+        from_links.map { |node| feed_from node, &block }
+      end
+      private :feeds_from_links
+      def uris_from_links &block
+        from_links.map { |node|
+          Feed.new(node.get_attribute('href')).tap do |item|
+            block.call item if block_given?
+          end
+        }
+      end
+      private :uris_from_links
+      def from_links
         @document.search(
           "link[@type='application/atom+xml']",
           "link[@type='application/rss+xml']"
-        ).map { |node| feed_from node, &block }
+        )
       end
-      private :feed_uris_from_links
+      private :from_links
-      def feed_uris_from_anchors &block
+      def feeds_from_anchors &block
+        from_anchors.map { |node|
+          feed_from node, &block
+        }
+      end
+      private :feeds_from_anchors
+      def uris_from_anchors &block
+        from_anchors.map { |node|
+          Feed.new(node.get_attribute('href')).tap do |item|
+            block.call item if block_given?
+          end
+        }
+      end
+      private :uris_from_anchors
+      def from_anchors
         @document.search('a').select { |node|
           valid_url_in? node
         }.select { |node|
           rss_or_atom_content_type_in? node
-        }.map { |node|
-          feed_from node, &block
-        }
+        }
       end
-      private :feed_uris_from_anchors
+      private :from_anchors
       def feed_from node, &block
-        uri = @uri.merge node.get_attribute 'href'
+        uri = @uri.merge node.get_attribute("href").to_s
         feed_unsing_address uri, &block
       end
       private :feed_from
       def feed_unsing_address uri, &block
         begin
-          Feed.new(uri).tap do |feed|
+          Feed.new(uri) { |feed|
+            feed.set_title
+            feed.set_favicon
+          }.tap { |feed|
             block.call feed if block_given?
-          end
-        rescue => error
-          error.tap do |e|
-            block.call e if block_given?
-          end
+          }
         end
       end
       private :feed_unsing_address
       def valid_url_in? anchor
         uri = address_from anchor
-        uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host)
+        uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host) and uri.to_s.present?
       end
       private :valid_url_in?
       def rss_or_atom_content_type_in? anchor
-        req, path = request_and_path_using address_from anchor
-        resp = req.request_head path
-        resp['content-type'] =~ /rss|atom/
+        connection, path = connection_and_path_using address_from anchor
+        response = connection.request_head path
+        response['content-type'] =~ /rss|atom/
       rescue
         true
       end
       private :rss_or_atom_content_type_in?
-      def request_and_path_using uri
-        req = Net::HTTP.new uri.host, uri.port
-        return req, path_from(uri) || uri.to_s
+      def connection_and_path_using uri
+        connection = Net::HTTP.new uri.host, uri.port
+        return connection, path_from(uri) || uri.to_s
       end
-      private :request_and_path_using
+      private :connection_and_path_using
       def path_from uri
         path = uri - uri.select(:scheme, :host).join("://")
@@ -90,7 +150,7 @@ module Spix
       private :path_from
       def address_from node
-        @uri.merge URI.parse node.get_attribute("href")
+        @uri.merge parse_uri node.get_attribute("href").to_s
       end
       private :address_from
@@ -101,6 +161,10 @@ module Spix
       def feed?
         %w[rss feed].include? @document.root.name
       end
+      def parse_uri path
+        URI.parse URI.encode path.to_s
+      end
     end
   end

data/lib/spix_parser/tools/feed_discovery/feed.rb CHANGED

@@ -4,11 +4,14 @@ module Spix
       def initialize url
         self.url = url.to_s
-        self.favicon = get_favicon
-        self.title = get_title
+        self.similars = []
+        self.exceptions = []
+        yield self if block_given?
+      rescue => error
+        self.errors = [error]
       end
-      %w[url favicon title].each do |attr|
+      %w[url favicon title exceptions similars].each do |attr|
         define_method attr do
           self[attr.to_sym]
         end
@@ -18,21 +21,20 @@ module Spix
         end
       end
-      def get_title
+      def set_title
         node = content.search('title').first
-        node.content if node
+        self.title = node.content if node
       end
-      private :get_title
-      def get_favicon
+      def set_favicon
         if node = content.search('link').first
-          path = URI.parse node.content.strip
-          shortcut_from URI.parse path.select(:scheme, :host).join("://") rescue nil
+          path = parse_uri node.content.strip
+          self.favicon = shortcut_from parse_uri path.select(:scheme, :host).join("://") rescue nil
         end
       end
       def shortcut_from base_uri
-        doc = get base_uri
+        doc = fetch_html base_uri
         shortcuts = find_shortcut_in doc
         shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
@@ -49,21 +51,31 @@ module Spix
         ).map { |node| node.get_attribute "href" }
       end
-      def get uri
+      def fetch uri, limit = 10
+        raise ArgumentError, 'HTTP redirect too deep' if limit == 0
         resp = Net::HTTP.get_response uri
-        doc = Nokogiri::HTML(resp.body)
         if resp.kind_of?(Net::HTTPRedirection) or (refresh_metatags = Nokogiri::HTML(resp.body).search('meta[@http-equiv=REFRESH]')).any?
           path = resp['location'] || refresh_metatags.first.get_attribute('content')[/http:\/\/.*/]
-          get URI.parse path
+          from_redirect = parse_uri path
+          self.url = from_redirect.to_s
+          fetch from_redirect, limit - 1
         else
-          doc
+          resp.body
         end
       rescue
-        Nokogiri::HTML('')
+        String.new
+      end
+      def fetch_xml uri
+        Nokogiri::XML fetch uri
+      end
+      def fetch_html uri
+        Nokogiri::HTML fetch uri
       end
       def base_uri
-        @base_uri ||= URI.parse uri.select(:scheme, :host).join("://")
+        @base_uri ||= parse_uri uri.select(:scheme, :host).join("://")
       end
       private :base_uri
@@ -73,20 +85,19 @@ module Spix
       private :content
       def load_content
-        req = Net::HTTP.new uri.host, uri.port
-        path = uri - uri.select(:scheme, :host).join("://")
-        resp = req.request_get path.to_s
-        Nokogiri::XML(resp.body)
+        fetch_xml uri
       end
       private :load_content
       def uri
-        @uri ||= URI.parse url
+        @uri ||= parse_uri url
       end
       private :uri
-      def to_hash
+      def parse_uri path
+        URI.parse URI.encode path
       end
     end
   end
 end

data/lib/spix_parser/version.rb CHANGED

@@ -4,7 +4,7 @@ module Spix
     module Version
       MAJOR = 1
       MINOR = 6
-      TINY  = 6
+      TINY  = 7
       def self.current_version
         "#{MAJOR}.#{MINOR}.#{TINY}"

data/spec/spix_parser/tools/feed_discovery/document_spec.rb CHANGED

@@ -4,17 +4,17 @@ describe Spix::FeedDiscovery::Document do
   let(:document) { Spix::FeedDiscovery::Document.new @rss_uri }
-  describe '#feed_uris' do
+  describe '#feed' do
     context 'when the uri exists' do
       it 'should return only feed uris' do
-        document.should_receive(:feed_uris).and_return expected_feeds
-        document.feed_uris
+        document.should_receive(:feeds).and_return expected_feeds
+        document.feeds
       end
       it 'should yield feed with correct uri content' do
-        document.feed_uris do |feed|
+        document.feeds do |feed|
           expected_feeds.should include(feed)
         end
       end
@@ -29,14 +29,16 @@ describe Spix::FeedDiscovery::Document do
   describe '#html?' do
     it 'should return true if is a html document' do
-      content = load_fixture('rss_list.html')
-      Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
+      content = Nokogiri::XML load_fixture('rss_list.html')
+      Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
+      document.load_content
       document.html?.should eql true
     end
     it 'should return false if is a rss/feed document' do
-      content = load_fixture('feed.rss')
-      Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
+      content = Nokogiri::XML load_fixture('feed.rss')
+      Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
+      document.load_content
       document.html?.should eql false
     end
@@ -48,14 +50,16 @@ describe Spix::FeedDiscovery::Document do
   describe '#feed?' do
     it 'should return true if a feed document' do
-      content = load_fixture('feed.rss')
-      Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
+      content = Nokogiri::XML load_fixture('feed.rss')
+      Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
+      document.load_content
       document.feed?.should eql true
     end
     it 'should return false if hot a html document' do
-      content = load_fixture('rss_list.html')
-      Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
+      content = Nokogiri::XML load_fixture('rss_list.html')
+      document.load_content
+      Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
       document.feed?.should eql false
     end
@@ -84,6 +88,6 @@ end
 def expected_feeds
   @feeds ||= %w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].map { |path|
-    Spix::FeedDiscovery::Feed.new(@domain + '/' + path, @domain + '/images/favicon.ico')
+    Spix::FeedDiscovery::Feed.new(@domain + '/' + path)
   }
 end

data/spec/spix_parser/tools/feed_discovery/feed_spec.rb CHANGED

@@ -4,23 +4,20 @@ describe Spix::FeedDiscovery::Feed do
   context 'given an expecific uri' do
-    let(:feed) { Spix::FeedDiscovery::Feed.new @feed_uri, @favicon_uri }
-    it 'should set the favicon' do
-      feed.favicon.should == @favicon_uri
-    end
     it 'should set the url' do
+      feed = described_class.new @feed_uri
       feed.url.should == @feed_uri
     end
     it 'should set title' do
+      feed = described_class.new @feed_uri do |f|
+        f.set_title
+      end
       feed.title.should == @document.search('title').first.content
     end
     before :all do
       @feed_uri = "http://myfeed.com/feed.rss"
-      @favicon_uri = "http://myfeed.com/images/favicon.ico"
       @document = Nokogiri::XML load_fixture 'feed.rss'
       FakeWeb.register_uri(:get, @feed_uri, :body => @document.to_s)
     end

metadata CHANGED

@@ -2,7 +2,7 @@
 name: spix_parser
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 1.6.6
+  version: 1.6.7
 platform: ruby
 authors:
 - Marcio Lopes de Faria
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-06-03 00:00:00 -03:00
+date: 2011-06-06 00:00:00 -03:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency