rodneyc-anemone 0.7.1.1

data/spec/core_spec.rb ADDED
@@ -0,0 +1,344 @@
+ $:.unshift(File.dirname(__FILE__))
+ require 'spec_helper'
+ %w[pstore tokyo_cabinet sqlite3].each { |file| require "anemone/storage/#{file}.rb" }
+
+ module Anemone
+   describe Core do
+
+     before(:each) do
+       FakeWeb.clean_registry
+     end
+
+     shared_examples_for "crawl" do
+       it "should crawl all the html pages in a domain by following <a> href's" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1', :links => ['3'])
+         pages << FakePage.new('2')
+         pages << FakePage.new('3')
+
+         Anemone.crawl(pages[0].url, @opts).should have(4).pages
+       end
+
+       it "should not follow links that leave the original domain" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+         pages << FakePage.new('1')
+
+         core = Anemone.crawl(pages[0].url, @opts)
+
+         core.should have(2).pages
+         core.pages.keys.should_not include('http://www.other.com/')
+       end
+
+       it "should not follow redirects that leave the original domain" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'], :redirect => 'http://www.other.com/')
+         pages << FakePage.new('1')
+
+         core = Anemone.crawl(pages[0].url, @opts)
+
+         core.should have(2).pages
+         core.pages.keys.should_not include('http://www.other.com/')
+       end
+
+       it "should follow http redirects" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'])
+         pages << FakePage.new('1', :redirect => '2')
+         pages << FakePage.new('2')
+
+         Anemone.crawl(pages[0].url, @opts).should have(3).pages
+       end
+
+       it "should follow with HTTP basic authentication" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'], :auth => true)
+         pages << FakePage.new('1', :links => ['3'], :auth => true)
+
+         Anemone.crawl(pages.first.auth_url, @opts).should have(3).pages
+       end
+
+       it "should accept multiple starting URLs" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2', :links => ['3'])
+         pages << FakePage.new('3')
+
+         Anemone.crawl([pages[0].url, pages[2].url], @opts).should have(4).pages
+       end
+
+       it "should include the query string when following links" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1?foo=1'])
+         pages << FakePage.new('1?foo=1')
+         pages << FakePage.new('1')
+
+         core = Anemone.crawl(pages[0].url, @opts)
+
+         core.should have(2).pages
+         core.pages.keys.should_not include(pages[2].url)
+       end
+
+       it "should be able to skip links with query strings" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1?foo=1', '2'])
+         pages << FakePage.new('1?foo=1')
+         pages << FakePage.new('2')
+
+         core = Anemone.crawl(pages[0].url, @opts) do |a|
+           a.skip_query_strings = true
+         end
+
+         core.should have(2).pages
+       end
+
+       it "should be able to skip links based on a RegEx" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2')
+         pages << FakePage.new('3')
+
+         core = Anemone.crawl(pages[0].url, @opts) do |a|
+           a.skip_links_like(/1/, /3/)
+         end
+
+         core.should have(2).pages
+         core.pages.keys.should_not include(pages[1].url)
+         core.pages.keys.should_not include(pages[3].url)
+       end
+
+       it "should be able to call a block on every page" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2')
+
+         count = 0
+         Anemone.crawl(pages[0].url, @opts) do |a|
+           a.on_every_page { count += 1 }
+         end
+
+         count.should == 3
+       end
+
+       it "should not discard page bodies by default" do
+         Anemone.crawl(FakePage.new('0').url, @opts).pages.values.first.doc.should_not be_nil
+       end
+
+       it "should optionally discard page bodies to conserve memory" do
+         core = Anemone.crawl(FakePage.new('0').url, @opts.merge({:discard_page_bodies => true}))
+         core.pages.values.first.doc.should be_nil
+       end
+
+       it "should provide a focus_crawl method to select the links on each page to follow" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2')
+
+         core = Anemone.crawl(pages[0].url, @opts) do |a|
+           a.focus_crawl { |p| p.links.reject { |l| l.to_s =~ /1/ } }
+         end
+
+         core.should have(2).pages
+         core.pages.keys.should_not include(pages[1].url)
+       end
+
+       it "should optionally delay between page requests" do
+         delay = 0.25
+
+         pages = []
+         pages << FakePage.new('0', :links => '1')
+         pages << FakePage.new('1')
+
+         start = Time.now
+         Anemone.crawl(pages[0].url, @opts.merge({:delay => delay}))
+         finish = Time.now
+
+         (finish - start).should satisfy { |t| t > delay * 2 }
+       end
+
+       it "should optionally obey the robots exclusion protocol" do
+         pages = []
+         pages << FakePage.new('0', :links => '1')
+         pages << FakePage.new('1')
+         pages << FakePage.new('robots.txt',
+                               :body => "User-agent: *\nDisallow: /1",
+                               :content_type => 'text/plain')
+
+         core = Anemone.crawl(pages[0].url, @opts.merge({:obey_robots_txt => true}))
+         urls = core.pages.keys
+
+         urls.should include(pages[0].url)
+         urls.should_not include(pages[1].url)
+       end
+
+       it "should be able to set cookies to send with HTTP requests" do
+         cookies = {:a => '1', :b => '2'}
+         core = Anemone.crawl(FakePage.new('0').url) do |anemone|
+           anemone.cookies = cookies
+         end
+         core.opts[:cookies].should == cookies
+       end
+
+       it "should freeze the options once the crawl begins" do
+         core = Anemone.crawl(FakePage.new('0').url) do |anemone|
+           anemone.threads = 4
+           anemone.on_every_page do
+             lambda { anemone.threads = 2 }.should raise_error
+           end
+         end
+         core.opts[:threads].should == 4
+       end
+
+       describe "many pages" do
+         before(:each) do
+           @pages, size = [], 5
+
+           size.times do |n|
+             # register this page with a link to the next page
+             link = (n + 1).to_s if n + 1 < size
+             @pages << FakePage.new(n.to_s, :links => Array(link))
+           end
+         end
+
+         it "should track the page depth and referer" do
+           core = Anemone.crawl(@pages[0].url, @opts)
+           previous_page = nil
+
+           @pages.each_with_index do |page, i|
+             page = core.pages[page.url]
+             page.should be
+             page.depth.should == i
+
+             if previous_page
+               page.referer.should == previous_page.url
+             else
+               page.referer.should be_nil
+             end
+             previous_page = page
+           end
+         end
+
+         it "should optionally limit the depth of the crawl" do
+           core = Anemone.crawl(@pages[0].url, @opts.merge({:depth_limit => 3}))
+           core.should have(4).pages
+         end
+       end
+
+     end
+
+     describe Hash do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @opts = {}
+       end
+     end
+
+     describe Storage::PStore do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @test_file = 'test.pstore'
+       end
+
+       before(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+         @opts = {:storage => Storage.PStore(@test_file)}
+       end
+
+       after(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+       end
+     end
+
+     describe Storage::TokyoCabinet do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @test_file = 'test.tch'
+       end
+
+       before(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+         @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
+       end
+
+       after(:each) do
+         @store.close
+       end
+
+       after(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+       end
+     end
+
+     describe Storage::SQLite3 do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @test_file = 'test.db'
+       end
+
+       before(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+         @opts = {:storage => @store = Storage.SQLite3(@test_file)}
+       end
+
+       after(:each) do
+         @store.close
+       end
+
+       after(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+       end
+     end
+
+     describe "options" do
+       it "should accept options for the crawl" do
+         core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
+                                           :threads => 2,
+                                           :discard_page_bodies => true,
+                                           :user_agent => 'test',
+                                           :obey_robots_txt => true,
+                                           :depth_limit => 3)
+
+         core.opts[:verbose].should == false
+         core.opts[:threads].should == 2
+         core.opts[:discard_page_bodies].should == true
+         core.opts[:delay].should == 0
+         core.opts[:user_agent].should == 'test'
+         core.opts[:obey_robots_txt].should == true
+         core.opts[:depth_limit].should == 3
+       end
+
+       it "should accept options via setter methods in the crawl block" do
+         core = Anemone.crawl(SPEC_DOMAIN) do |a|
+           a.verbose = false
+           a.threads = 2
+           a.discard_page_bodies = true
+           a.user_agent = 'test'
+           a.obey_robots_txt = true
+           a.depth_limit = 3
+         end
+
+         core.opts[:verbose].should == false
+         core.opts[:threads].should == 2
+         core.opts[:discard_page_bodies].should == true
+         core.opts[:delay].should == 0
+         core.opts[:user_agent].should == 'test'
+         core.opts[:obey_robots_txt].should == true
+         core.opts[:depth_limit].should == 3
+       end
+
+       it "should use 1 thread if a delay is requested" do
+         Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
+       end
+     end
+
+   end
+ end
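
Reviewer note: for orientation, the crawl options exercised above map onto Anemone's block-style API roughly as follows. This is a minimal usage sketch, not part of the diff; the start URL and the /logout/ skip pattern are placeholders.

require 'anemone'

# Minimal sketch of the public crawl API these specs exercise.
# The start URL and the skip pattern are placeholders.
Anemone.crawl('http://www.example.com/', :depth_limit => 3) do |anemone|
  anemone.skip_links_like(/logout/)               # skip links matching any given regex
  anemone.on_every_page { |page| puts page.url }  # callback run for every fetched page
end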
@@ -0,0 +1,77 @@
+ FakeWeb.allow_net_connect = false
+
+ module Anemone
+   SPEC_DOMAIN = "http://www.example.com/"
+   AUTH_SPEC_DOMAIN = "http://user:pass@#{URI.parse(SPEC_DOMAIN).host}/"
+
+   class FakePage
+     attr_accessor :links
+     attr_accessor :hrefs
+     attr_accessor :body
+
+     def initialize(name = '', options = {})
+       @name = name
+       @links = [options[:links]].flatten if options.has_key?(:links)
+       @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
+       @redirect = options[:redirect] if options.has_key?(:redirect)
+       @auth = options[:auth] if options.has_key?(:auth)
+       @base = options[:base] if options.has_key?(:base)
+       @content_type = options[:content_type] || "text/html"
+       @body = options[:body]
+
+       create_body unless @body
+       add_to_fakeweb
+     end
+
+     def url
+       SPEC_DOMAIN + @name
+     end
+
+     def auth_url
+       AUTH_SPEC_DOMAIN + @name
+     end
+
+     private
+
+     def create_body
+       if @base
+         @body = "<html><head><base href=\"#{@base}\"></head><body>"
+       else
+         @body = "<html><body>"
+       end
+       @links.each { |l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>" } if @links
+       @hrefs.each { |h| @body += "<a href=\"#{h}\"></a>" } if @hrefs
+       @body += "</body></html>"
+     end
+
+     def add_to_fakeweb
+       options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
+
+       if @redirect
+         options[:status] = [301, "Moved Permanently"]
+
+         # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
+         redirect_url = (@redirect =~ %r{^http}) ? @redirect : SPEC_DOMAIN + @redirect
+         options[:location] = redirect_url
+
+         # register the page this one redirects to
+         FakeWeb.register_uri(:get, redirect_url, {:body => '',
+                                                   :content_type => @content_type,
+                                                   :status => [200, "OK"]})
+       end
+
+       if @auth
+         unauthorized_options = {
+           :body => "Unauthorized", :status => [401, "Unauthorized"]
+         }
+         FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, unauthorized_options)
+         FakeWeb.register_uri(:get, AUTH_SPEC_DOMAIN + @name, options)
+       else
+         FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
+       end
+     end
+   end
+ end
+
+ # default root
+ Anemone::FakePage.new
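
For context: FakePage's constructor both builds an HTML body and registers the URL with FakeWeb, so a spec can fabricate a small site in two lines and crawl it without touching the network. A usage sketch (page names are arbitrary):

root = Anemone::FakePage.new('root', :links => ['child'])  # registers http://www.example.com/root
Anemone::FakePage.new('child')                             # registers the linked page
Anemone.crawl(root.url).should have(2).pages               # the crawl sees both fake pages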
data/spec/http_spec.rb ADDED
@@ -0,0 +1,19 @@
+ require 'spec_helper'
+
+ module Anemone
+   describe HTTP do
+
+     describe "fetch_page" do
+       before(:each) do
+         FakeWeb.clean_registry
+       end
+
+       it "should still return a Page if an exception occurs during the HTTP connection" do
+         HTTP.stub!(:refresh_connection).and_raise(StandardError)
+         http = Anemone::HTTP.new
+         http.fetch_page(SPEC_DOMAIN).should be_an_instance_of(Page)
+       end
+
+     end
+   end
+ end
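
Aside: because fetch_page rescues the connection error and returns a Page that records it, callers can branch on fetched? instead of wrapping the call in their own rescue. A sketch of that pattern (assumes a failure like the stub above):

page = Anemone::HTTP.new.fetch_page(SPEC_DOMAIN)
if page.fetched?
  puts page.doc.title                  # parsed Nokogiri document available on success
else
  warn "fetch failed: #{page.error}"   # the captured exception
end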
data/spec/page_spec.rb ADDED
@@ -0,0 +1,177 @@
+ $:.unshift(File.dirname(__FILE__))
+ require 'spec_helper'
+
+ module Anemone
+   describe Page do
+
+     before(:each) do
+       FakeWeb.clean_registry
+       @http = Anemone::HTTP.new
+       @page = @http.fetch_page(FakePage.new('home', :links => '1').url)
+     end
+
+     it "should indicate whether it successfully fetched via HTTP" do
+       @page.should respond_to(:fetched?)
+       @page.fetched?.should == true
+
+       fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
+       fail_page.fetched?.should == false
+     end
+
+     it "should store and expose the response body of the HTTP request" do
+       body = 'test'
+       page = @http.fetch_page(FakePage.new('body_test', {:body => body}).url)
+       page.body.should == body
+     end
+
+     it "should record any error that occurs during fetch_page" do
+       @page.should respond_to(:error)
+       @page.error.should be_nil
+
+       fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
+       fail_page.error.should_not be_nil
+     end
+
+     it "should store the response headers when fetching a page" do
+       @page.headers.should_not be_nil
+       @page.headers.should have_key('content-type')
+     end
+
+     it "should have an OpenStruct attribute for the developer to store data in" do
+       @page.data.should_not be_nil
+       @page.data.should be_an_instance_of(OpenStruct)
+
+       @page.data.test = 'test'
+       @page.data.test.should == 'test'
+     end
+
+     it "should have a Nokogiri::HTML::Document attribute for the page body" do
+       @page.doc.should_not be_nil
+       @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
+     end
+
+     it "should indicate whether it was fetched after an HTTP redirect" do
+       @page.should respond_to(:redirect?)
+
+       @page.redirect?.should == false
+
+       @http.fetch_pages(FakePage.new('redir', :redirect => 'home').url).first.redirect?.should == true
+     end
+
+     it "should have a method to tell if a URI is in the same domain as the page" do
+       @page.should respond_to(:in_domain?)
+
+       @page.in_domain?(URI(FakePage.new('test').url)).should == true
+       @page.in_domain?(URI('http://www.other.com/')).should == false
+     end
+
+     it "should include the response time for the HTTP request" do
+       @page.should respond_to(:response_time)
+     end
+
+     it "should have the cookies received with the page" do
+       @page.should respond_to(:cookies)
+       @page.cookies.should == []
+     end
+
+     describe "#to_hash" do
+       it "converts the page to a hash" do
+         hash = @page.to_hash
+         hash['url'].should == @page.url.to_s
+         hash['referer'].should == @page.referer.to_s
+         hash['links'].should == @page.links.map(&:to_s)
+       end
+
+       context "when redirect_to is nil" do
+         it "sets 'redirect_to' to nil in the hash" do
+           @page.redirect_to.should be_nil
+           @page.to_hash['redirect_to'].should be_nil
+         end
+       end
+
+       context "when redirect_to is a non-nil URI" do
+         it "sets 'redirect_to' to the URI string" do
+           new_page = Page.new(URI(SPEC_DOMAIN), {:redirect_to => URI(SPEC_DOMAIN + '1')})
+           new_page.redirect_to.to_s.should == SPEC_DOMAIN + '1'
+           new_page.to_hash['redirect_to'].should == SPEC_DOMAIN + '1'
+         end
+       end
+     end
+
+     describe "#from_hash" do
+       it "converts from a hash to a Page" do
+         page = @page.dup
+         page.depth = 1
+         converted = Page.from_hash(page.to_hash)
+         converted.links.should == page.links
+         converted.depth.should == page.depth
+       end
+
+       it 'handles a from_hash with a nil redirect_to' do
+         page_hash = @page.to_hash
+         page_hash['redirect_to'] = nil
+         lambda { Page.from_hash(page_hash) }.should_not raise_error(URI::InvalidURIError)
+         Page.from_hash(page_hash).redirect_to.should be_nil
+       end
+     end
+
+     describe "#redirect_to" do
+       context "when the page was a redirect" do
+         it "returns a URI of the page it redirects to" do
+           new_page = Page.new(URI(SPEC_DOMAIN), {:redirect_to => URI(SPEC_DOMAIN + '1')})
+           redirect = new_page.redirect_to
+           redirect.should be_a(URI)
+           redirect.to_s.should == SPEC_DOMAIN + '1'
+         end
+       end
+     end
+
+     it "should detect, store and expose the base url for the page head" do
+       base = "#{SPEC_DOMAIN}path/to/base_url/"
+       page = @http.fetch_page(FakePage.new('body_test', {:base => base}).url)
+       page.base.should == URI(base)
+       @page.base.should be_nil
+     end
+
+     it "should have a method to convert a relative url to an absolute one" do
+       @page.should respond_to(:to_absolute)
+
+       # Identity
+       @page.to_absolute(@page.url).should == @page.url
+       @page.to_absolute("").should == @page.url
+
+       # Root-ness
+       @page.to_absolute("/").should == URI("#{SPEC_DOMAIN}")
+
+       # Relativeness
+       relative_path = "a/relative/path"
+       @page.to_absolute(relative_path).should == URI("#{SPEC_DOMAIN}#{relative_path}")
+
+       deep_page = @http.fetch_page(FakePage.new('home/deep', :links => '1').url)
+       upward_relative_path = "../a/relative/path"
+       deep_page.to_absolute(upward_relative_path).should == URI("#{SPEC_DOMAIN}#{relative_path}")
+
+       # The base URL case
+       base_path = "path/to/base_url/"
+       base = "#{SPEC_DOMAIN}#{base_path}"
+       page = @http.fetch_page(FakePage.new('home', {:base => base}).url)
+
+       # Identity
+       page.to_absolute(page.url).should == page.url
+       # It should revert to the base url
+       page.to_absolute("").should_not == page.url
+
+       # Root-ness
+       page.to_absolute("/").should == URI("#{SPEC_DOMAIN}")
+
+       # Relativeness
+       relative_path = "a/relative/path"
+       page.to_absolute(relative_path).should == URI("#{base}#{relative_path}")
+
+       upward_relative_path = "../a/relative/path"
+       upward_base = "#{SPEC_DOMAIN}path/to/"
+       page.to_absolute(upward_relative_path).should == URI("#{upward_base}#{relative_path}")
+     end
+
+   end
+ end
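
Note: to_hash/from_hash is the serialization seam that the storage back-ends in core_spec.rb rely on; per the assertions above, the hash uses string keys. A round-trip sketch based on that behavior:

page = Anemone::HTTP.new.fetch_page(Anemone::FakePage.new('home').url)
hash = page.to_hash                     # plain Hash with string keys ('url', 'links', ...)
copy = Anemone::Page.from_hash(hash)
copy.url.to_s.should == page.url.to_s   # identity survives the round trip
copy.depth.should == page.depth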