sutch-anemone 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/CHANGELOG.rdoc +136 -0
- data/LICENSE.txt +19 -0
- data/README.rdoc +38 -0
- data/Rakefile +23 -0
- data/VERSION +1 -0
- data/bin/anemone +4 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/lib/anemone/cli/cron.rb +90 -0
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/cookie_store.rb +35 -0
- data/lib/anemone/core.rb +339 -0
- data/lib/anemone/exceptions.rb +5 -0
- data/lib/anemone/http.rb +187 -0
- data/lib/anemone/page.rb +217 -0
- data/lib/anemone/page_store.rb +161 -0
- data/lib/anemone/resource.rb +42 -0
- data/lib/anemone/storage.rb +44 -0
- data/lib/anemone/storage/base.rb +75 -0
- data/lib/anemone/storage/exceptions.rb +15 -0
- data/lib/anemone/storage/kyoto_cabinet.rb +72 -0
- data/lib/anemone/storage/mongodb.rb +89 -0
- data/lib/anemone/storage/pstore.rb +50 -0
- data/lib/anemone/storage/redis.rb +90 -0
- data/lib/anemone/storage/sqlite3.rb +90 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +60 -0
- data/lib/anemone/tentacle.rb +39 -0
- data/spec/anemone_spec.rb +16 -0
- data/spec/cookie_store_spec.rb +28 -0
- data/spec/core_spec.rb +344 -0
- data/spec/fakeweb_helper.rb +77 -0
- data/spec/http_spec.rb +19 -0
- data/spec/page_spec.rb +186 -0
- data/spec/page_store_spec.rb +171 -0
- data/spec/resource_spec.rb +91 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/storage_spec.rb +252 -0
- metadata +281 -0
| @@ -0,0 +1,77 @@ | |
| 1 | 
            +
            FakeWeb.allow_net_connect = false
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Anemone
         | 
| 4 | 
            +
              SPEC_DOMAIN = "http://www.example.com/"
         | 
| 5 | 
            +
              AUTH_SPEC_DOMAIN = "http://user:pass@#{URI.parse(SPEC_DOMAIN).host}/"
         | 
| 6 | 
            +
             | 
| 7 | 
            +
              class FakePage
         | 
| 8 | 
            +
                attr_accessor :links
         | 
| 9 | 
            +
                attr_accessor :hrefs
         | 
| 10 | 
            +
                attr_accessor :body
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                def initialize(name = '', options = {})
         | 
| 13 | 
            +
                  @name = name
         | 
| 14 | 
            +
                  @links = [options[:links]].flatten if options.has_key?(:links)
         | 
| 15 | 
            +
                  @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
         | 
| 16 | 
            +
                  @redirect = options[:redirect] if options.has_key?(:redirect)
         | 
| 17 | 
            +
                  @auth = options[:auth] if options.has_key?(:auth)
         | 
| 18 | 
            +
                  @base = options[:base] if options.has_key?(:base)      
         | 
| 19 | 
            +
                  @content_type = options[:content_type] || "text/html"
         | 
| 20 | 
            +
                  @body = options[:body]
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                  create_body unless @body
         | 
| 23 | 
            +
                  add_to_fakeweb
         | 
| 24 | 
            +
                end
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                def url
         | 
| 27 | 
            +
                  SPEC_DOMAIN + @name
         | 
| 28 | 
            +
                end
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                def auth_url
         | 
| 31 | 
            +
                  AUTH_SPEC_DOMAIN + @name
         | 
| 32 | 
            +
                end
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                private
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                def create_body
         | 
| 37 | 
            +
                  if @base
         | 
| 38 | 
            +
                    @body = "<html><head><base href=\"#{@base}\"></head><body>"
         | 
| 39 | 
            +
                  else
         | 
| 40 | 
            +
                    @body = "<html><body>"
         | 
| 41 | 
            +
                  end
         | 
| 42 | 
            +
                  @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
         | 
| 43 | 
            +
                  @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
         | 
| 44 | 
            +
                  @body += "</body></html>"
         | 
| 45 | 
            +
                end
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                def add_to_fakeweb
         | 
| 48 | 
            +
                  options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
         | 
| 49 | 
            +
             | 
| 50 | 
            +
                  if @redirect
         | 
| 51 | 
            +
                    options[:status] = [301, "Permanently Moved"]
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                    # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
         | 
| 54 | 
            +
                    redirect_url = (@redirect =~ /http/) ? @redirect : SPEC_DOMAIN + @redirect
         | 
| 55 | 
            +
                    options[:location] = redirect_url
         | 
| 56 | 
            +
             | 
| 57 | 
            +
                    # register the page this one redirects to
         | 
| 58 | 
            +
                    FakeWeb.register_uri(:get, redirect_url, {:body => '',
         | 
| 59 | 
            +
                                                              :content_type => @content_type,
         | 
| 60 | 
            +
                                                              :status => [200, "OK"]})
         | 
| 61 | 
            +
                  end
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                  if @auth
         | 
| 64 | 
            +
                    unautorized_options = {
         | 
| 65 | 
            +
                      :body => "Unauthorized", :status => ["401", "Unauthorized"]
         | 
| 66 | 
            +
                    }
         | 
| 67 | 
            +
                    FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, unautorized_options)
         | 
| 68 | 
            +
                    FakeWeb.register_uri(:get, AUTH_SPEC_DOMAIN + @name, options)
         | 
| 69 | 
            +
                  else
         | 
| 70 | 
            +
                    FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
         | 
| 71 | 
            +
                  end
         | 
| 72 | 
            +
                end
         | 
| 73 | 
            +
              end
         | 
| 74 | 
            +
            end
         | 
| 75 | 
            +
             | 
| 76 | 
            +
            #default root
         | 
| 77 | 
            +
            Anemone::FakePage.new
         | 
    
        data/spec/http_spec.rb
    ADDED
    
    | @@ -0,0 +1,19 @@ | |
| 1 | 
            +
            require 'spec_helper'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Anemone
         | 
| 4 | 
            +
              describe HTTP do
         | 
| 5 | 
            +
             | 
| 6 | 
            +
                describe "fetch_page" do
         | 
| 7 | 
            +
                  before(:each) do
         | 
| 8 | 
            +
                    FakeWeb.clean_registry
         | 
| 9 | 
            +
                  end
         | 
| 10 | 
            +
             | 
| 11 | 
            +
                  it "should still return a Page if an exception occurs during the HTTP connection" do
         | 
| 12 | 
            +
                    HTTP.stub!(:refresh_connection).and_raise(StandardError)
         | 
| 13 | 
            +
                    http = Anemone::HTTP.new(:page_class => Anemone::Page)
         | 
| 14 | 
            +
                    http.fetch_page(SPEC_DOMAIN).should be_an_instance_of(Page)
         | 
| 15 | 
            +
                  end
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                end
         | 
| 18 | 
            +
              end
         | 
| 19 | 
            +
            end
         | 
    
        data/spec/page_spec.rb
    ADDED
    
    | @@ -0,0 +1,186 @@ | |
| 1 | 
            +
            $:.unshift(File.dirname(__FILE__))
         | 
| 2 | 
            +
            require 'spec_helper'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            module Anemone
         | 
| 5 | 
            +
              describe Page do
         | 
| 6 | 
            +
             | 
| 7 | 
            +
                before(:each) do
         | 
| 8 | 
            +
                  FakeWeb.clean_registry
         | 
| 9 | 
            +
                  @http = Anemone::HTTP.new(:page_class => Anemone::Page)
         | 
| 10 | 
            +
             | 
| 11 | 
            +
                  @page = @http.fetch_page(FakePage.new('home', :links => '1').url)
         | 
| 12 | 
            +
                end
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                it "should indicate whether it successfully fetched via HTTP" do
         | 
| 15 | 
            +
                  @page.should respond_to(:fetched?)
         | 
| 16 | 
            +
                  @page.fetched?.should == true
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                  fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
         | 
| 19 | 
            +
                  fail_page.fetched?.should == false
         | 
| 20 | 
            +
                end
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                it "should store and expose the response body of the HTTP request" do
         | 
| 23 | 
            +
                  body = 'test'
         | 
| 24 | 
            +
                  page = @http.fetch_page(FakePage.new('body_test', {:body => body}).url)
         | 
| 25 | 
            +
                  page.body.should == body
         | 
| 26 | 
            +
                end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                it "should record any error that occurs during fetch_page" do
         | 
| 29 | 
            +
                  @page.should respond_to(:error)
         | 
| 30 | 
            +
                  @page.error.should be_nil
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                  fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
         | 
| 33 | 
            +
                  fail_page.error.should_not be_nil
         | 
| 34 | 
            +
                end
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                it "should store the response headers when fetching a page" do
         | 
| 37 | 
            +
                  @page.headers.should_not be_nil
         | 
| 38 | 
            +
                  @page.headers.should have_key('content-type')
         | 
| 39 | 
            +
                end
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                it "should have an OpenStruct attribute for the developer to store data in" do
         | 
| 42 | 
            +
                  @page.data.should_not be_nil
         | 
| 43 | 
            +
                  @page.data.should be_an_instance_of(OpenStruct)
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                  @page.data.test = 'test'
         | 
| 46 | 
            +
                  @page.data.test.should == 'test'
         | 
| 47 | 
            +
                end
         | 
| 48 | 
            +
             | 
| 49 | 
            +
                it "should have a Nokogori::HTML::Document attribute for the page body" do
         | 
| 50 | 
            +
                  @page.doc.should_not be_nil
         | 
| 51 | 
            +
                  @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
         | 
| 52 | 
            +
                end
         | 
| 53 | 
            +
             | 
| 54 | 
            +
                it "should indicate whether it was fetched after an HTTP redirect" do
         | 
| 55 | 
            +
                  @page.should respond_to(:redirect?)
         | 
| 56 | 
            +
             | 
| 57 | 
            +
                  @page.redirect?.should == false
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                  @http.fetch_pages(FakePage.new('redir', :redirect => 'home').url).first.redirect?.should == true
         | 
| 60 | 
            +
                end
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                it "should have a method to tell if a URI is in the same domain as the page" do
         | 
| 63 | 
            +
                  @page.should respond_to(:in_domain?)
         | 
| 64 | 
            +
             | 
| 65 | 
            +
                  @page.in_domain?(URI(FakePage.new('test').url)).should == true
         | 
| 66 | 
            +
                  @page.in_domain?(URI('http://www.other.com/')).should == false
         | 
| 67 | 
            +
                end
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                it "should include the response time for the HTTP request" do
         | 
| 70 | 
            +
                  @page.should respond_to(:response_time)
         | 
| 71 | 
            +
                end
         | 
| 72 | 
            +
             | 
| 73 | 
            +
                it "should have the cookies received with the page" do
         | 
| 74 | 
            +
                  @page.should respond_to(:cookies)
         | 
| 75 | 
            +
                  @page.cookies.should == []
         | 
| 76 | 
            +
                end
         | 
| 77 | 
            +
             | 
| 78 | 
            +
                describe "#to_hash" do
         | 
| 79 | 
            +
                  it "converts the page to a hash" do
         | 
| 80 | 
            +
                    hash = @page.to_hash
         | 
| 81 | 
            +
                    hash['url'].should == @page.url.to_s
         | 
| 82 | 
            +
                    hash['referer'].should == @page.referer.to_s
         | 
| 83 | 
            +
                    hash['links'].should == @page.links.map(&:to_s)
         | 
| 84 | 
            +
                  end
         | 
| 85 | 
            +
             | 
| 86 | 
            +
                  context "when redirect_to is nil" do
         | 
| 87 | 
            +
                    it "sets 'redirect_to' to nil in the hash" do
         | 
| 88 | 
            +
                      @page.redirect_to.should be_nil
         | 
| 89 | 
            +
                      @page.to_hash[:redirect_to].should be_nil # NOTE(review): other assertions in this spec read string keys ('url', 'redirect_to'); a symbol key is presumably never set, so this may pass vacuously - confirm against Page#to_hash
         | 
| 90 | 
            +
                    end
         | 
| 91 | 
            +
                  end
         | 
| 92 | 
            +
             | 
| 93 | 
            +
                  context "when redirect_to is a non-nil URI" do
         | 
| 94 | 
            +
                    it "sets 'redirect_to' to the URI string" do
         | 
| 95 | 
            +
                      new_page = Page.new(URI(SPEC_DOMAIN), {:redirect_to => URI(SPEC_DOMAIN + '1')})
         | 
| 96 | 
            +
                      new_page.redirect_to.to_s.should == SPEC_DOMAIN + '1'
         | 
| 97 | 
            +
                      new_page.to_hash['redirect_to'].should == SPEC_DOMAIN + '1'
         | 
| 98 | 
            +
                    end
         | 
| 99 | 
            +
                  end
         | 
| 100 | 
            +
                end
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                describe "#from_hash" do
         | 
| 103 | 
            +
                  it "converts from a hash to a Page" do
         | 
| 104 | 
            +
                    page = @page.dup
         | 
| 105 | 
            +
                    page.depth = 1
         | 
| 106 | 
            +
                    converted = Page.from_hash(page.to_hash)
         | 
| 107 | 
            +
                    converted.links.should == page.links
         | 
| 108 | 
            +
                    converted.depth.should == page.depth
         | 
| 109 | 
            +
                  end
         | 
| 110 | 
            +
             | 
| 111 | 
            +
                  it 'handles a from_hash with a nil redirect_to' do
         | 
| 112 | 
            +
                    page_hash = @page.to_hash
         | 
| 113 | 
            +
                    page_hash['redirect_to'] = nil
         | 
| 114 | 
            +
                    lambda{Page.from_hash(page_hash)}.should_not raise_error(URI::InvalidURIError)
         | 
| 115 | 
            +
                    Page.from_hash(page_hash).redirect_to.should be_nil
         | 
| 116 | 
            +
                  end
         | 
| 117 | 
            +
                end
         | 
| 118 | 
            +
             | 
| 119 | 
            +
                describe "#redirect_to" do
         | 
| 120 | 
            +
                  context "when the page was a redirect" do
         | 
| 121 | 
            +
                    it "returns a URI of the page it redirects to" do
         | 
| 122 | 
            +
                      new_page = Page.new(URI(SPEC_DOMAIN), {:redirect_to => URI(SPEC_DOMAIN + '1')})
         | 
| 123 | 
            +
                      redirect = new_page.redirect_to
         | 
| 124 | 
            +
                      redirect.should be_a(URI)
         | 
| 125 | 
            +
                      redirect.to_s.should == SPEC_DOMAIN + '1'
         | 
| 126 | 
            +
                    end
         | 
| 127 | 
            +
                  end
         | 
| 128 | 
            +
                end
         | 
| 129 | 
            +
             | 
| 130 | 
            +
                describe "#links" do
         | 
| 131 | 
            +
                  it "should not convert anchors to %23" do
         | 
| 132 | 
            +
                    page = @http.fetch_page(FakePage.new('', :body => '<a href="#top">Top</a>').url)
         | 
| 133 | 
            +
                    page.links.should have(1).link
         | 
| 134 | 
            +
                    page.links.first.to_s.should == SPEC_DOMAIN
         | 
| 135 | 
            +
                  end
         | 
| 136 | 
            +
                end
         | 
| 137 | 
            +
             | 
| 138 | 
            +
                it "should detect, store and expose the base url for the page head" do
         | 
| 139 | 
            +
                  base = "#{SPEC_DOMAIN}path/to/base_url/"
         | 
| 140 | 
            +
                  page = @http.fetch_page(FakePage.new('body_test', {:base => base}).url)
         | 
| 141 | 
            +
                  page.base.should == URI(base)
         | 
| 142 | 
            +
                  @page.base.should be_nil
         | 
| 143 | 
            +
                end
         | 
| 144 | 
            +
             | 
| 145 | 
            +
                it "should have a method to convert a relative url to an absolute one" do
         | 
| 146 | 
            +
                  @page.should respond_to(:to_absolute)
         | 
| 147 | 
            +
                  
         | 
| 148 | 
            +
                  # Identity
         | 
| 149 | 
            +
                  @page.to_absolute(@page.url).should == @page.url
         | 
| 150 | 
            +
                  @page.to_absolute("").should == @page.url
         | 
| 151 | 
            +
                  
         | 
| 152 | 
            +
                  # Root-ness
         | 
| 153 | 
            +
                  @page.to_absolute("/").should == URI("#{SPEC_DOMAIN}")
         | 
| 154 | 
            +
                  
         | 
| 155 | 
            +
                  # Relativeness
         | 
| 156 | 
            +
                  relative_path = "a/relative/path"
         | 
| 157 | 
            +
                  @page.to_absolute(relative_path).should == URI("#{SPEC_DOMAIN}#{relative_path}")
         | 
| 158 | 
            +
                  
         | 
| 159 | 
            +
                  deep_page = @http.fetch_page(FakePage.new('home/deep', :links => '1').url)
         | 
| 160 | 
            +
                  upward_relative_path = "../a/relative/path"
         | 
| 161 | 
            +
                  deep_page.to_absolute(upward_relative_path).should == URI("#{SPEC_DOMAIN}#{relative_path}")
         | 
| 162 | 
            +
                  
         | 
| 163 | 
            +
                  # The base URL case
         | 
| 164 | 
            +
                  base_path = "path/to/base_url/"
         | 
| 165 | 
            +
                  base = "#{SPEC_DOMAIN}#{base_path}"
         | 
| 166 | 
            +
                  page = @http.fetch_page(FakePage.new('home', {:base => base}).url)
         | 
| 167 | 
            +
                  
         | 
| 168 | 
            +
                  # Identity
         | 
| 169 | 
            +
                  page.to_absolute(page.url).should == page.url
         | 
| 170 | 
            +
                  # It should revert to the base url
         | 
| 171 | 
            +
                  page.to_absolute("").should_not == page.url
         | 
| 172 | 
            +
             | 
| 173 | 
            +
                  # Root-ness
         | 
| 174 | 
            +
                  page.to_absolute("/").should == URI("#{SPEC_DOMAIN}")
         | 
| 175 | 
            +
                  
         | 
| 176 | 
            +
                  # Relativeness
         | 
| 177 | 
            +
                  relative_path = "a/relative/path"
         | 
| 178 | 
            +
                  page.to_absolute(relative_path).should == URI("#{base}#{relative_path}")
         | 
| 179 | 
            +
                  
         | 
| 180 | 
            +
                  upward_relative_path = "../a/relative/path"
         | 
| 181 | 
            +
                  upward_base = "#{SPEC_DOMAIN}path/to/"
         | 
| 182 | 
            +
                  page.to_absolute(upward_relative_path).should == URI("#{upward_base}#{relative_path}")      
         | 
| 183 | 
            +
                end
         | 
| 184 | 
            +
             | 
| 185 | 
            +
              end
         | 
| 186 | 
            +
            end
         | 
| @@ -0,0 +1,171 @@ | |
| 1 | 
            +
            $:.unshift(File.dirname(__FILE__))
         | 
| 2 | 
            +
            require 'spec_helper'
         | 
| 3 | 
            +
            %w[pstore tokyo_cabinet sqlite3 mongodb redis].each { |file| require "anemone/storage/#{file}.rb" }
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            module Anemone
         | 
| 6 | 
            +
              describe PageStore do
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                before(:all) do
         | 
| 9 | 
            +
                  FakeWeb.clean_registry
         | 
| 10 | 
            +
                end
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                shared_examples_for "page storage" do
         | 
| 13 | 
            +
                  it "should be able to compute single-source shortest paths in-place" do
         | 
| 14 | 
            +
                    pages = []
         | 
| 15 | 
            +
                    pages << FakePage.new('0', :links => ['1', '3'])
         | 
| 16 | 
            +
                    pages << FakePage.new('1', :redirect => '2')
         | 
| 17 | 
            +
                    pages << FakePage.new('2', :links => ['4'])
         | 
| 18 | 
            +
                    pages << FakePage.new('3')
         | 
| 19 | 
            +
                    pages << FakePage.new('4')
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                    # crawl, then set depths to nil
         | 
| 22 | 
            +
                    page_store = Anemone.crawl(pages.first.url, @opts) do |a|
         | 
| 23 | 
            +
                      a.after_crawl do |ps|
         | 
| 24 | 
            +
                        ps.each { |url, page| page.depth = nil; ps[url] = page }
         | 
| 25 | 
            +
                      end
         | 
| 26 | 
            +
                    end.pages
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                    page_store.should respond_to(:shortest_paths!)
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                    page_store.shortest_paths!(pages[0].url)
         | 
| 31 | 
            +
                    page_store[pages[0].url].depth.should == 0
         | 
| 32 | 
            +
                    page_store[pages[1].url].depth.should == 1
         | 
| 33 | 
            +
                    page_store[pages[2].url].depth.should == 1
         | 
| 34 | 
            +
                    page_store[pages[3].url].depth.should == 1
         | 
| 35 | 
            +
                    page_store[pages[4].url].depth.should == 2
         | 
| 36 | 
            +
                  end
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                  it "should be able to remove all redirects in-place" do
         | 
| 39 | 
            +
                    pages = []
         | 
| 40 | 
            +
                    pages << FakePage.new('0', :links => ['1'])
         | 
| 41 | 
            +
                    pages << FakePage.new('1', :redirect => '2')
         | 
| 42 | 
            +
                    pages << FakePage.new('2')
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                    page_store = Anemone.crawl(pages[0].url, @opts).pages
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                    page_store.should respond_to(:uniq!)
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                    page_store.uniq!
         | 
| 49 | 
            +
                    page_store.has_key?(pages[1].url).should == false
         | 
| 50 | 
            +
                    page_store.has_key?(pages[0].url).should == true
         | 
| 51 | 
            +
                    page_store.has_key?(pages[2].url).should == true
         | 
| 52 | 
            +
                  end
         | 
| 53 | 
            +
             | 
| 54 | 
            +
                  it "should be able to find pages linking to a url" do
         | 
| 55 | 
            +
                    pages = []
         | 
| 56 | 
            +
                    pages << FakePage.new('0', :links => ['1'])
         | 
| 57 | 
            +
                    pages << FakePage.new('1', :redirect => '2')
         | 
| 58 | 
            +
                    pages << FakePage.new('2')
         | 
| 59 | 
            +
             | 
| 60 | 
            +
                    page_store = Anemone.crawl(pages[0].url, @opts).pages
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                    page_store.should respond_to(:pages_linking_to)
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                    page_store.pages_linking_to(pages[2].url).size.should == 0
         | 
| 65 | 
            +
                    links_to_1 = page_store.pages_linking_to(pages[1].url)
         | 
| 66 | 
            +
                    links_to_1.size.should == 1
         | 
| 67 | 
            +
                    links_to_1.first.should be_an_instance_of(Page)
         | 
| 68 | 
            +
                    links_to_1.first.url.to_s.should == pages[0].url
         | 
| 69 | 
            +
                  end
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                  it "should be able to find urls linking to a url" do
         | 
| 72 | 
            +
                    pages = []
         | 
| 73 | 
            +
                    pages << FakePage.new('0', :links => ['1'])
         | 
| 74 | 
            +
                    pages << FakePage.new('1', :redirect => '2')
         | 
| 75 | 
            +
                    pages << FakePage.new('2')
         | 
| 76 | 
            +
             | 
| 77 | 
            +
                    page_store = Anemone.crawl(pages[0].url, @opts).pages
         | 
| 78 | 
            +
             | 
| 79 | 
            +
                    page_store.should respond_to(:pages_linking_to)
         | 
| 80 | 
            +
             | 
| 81 | 
            +
                    page_store.urls_linking_to(pages[2].url).size.should == 0
         | 
| 82 | 
            +
                    links_to_1 = page_store.urls_linking_to(pages[1].url)
         | 
| 83 | 
            +
                    links_to_1.size.should == 1
         | 
| 84 | 
            +
                    links_to_1.first.to_s.should == pages[0].url
         | 
| 85 | 
            +
                  end
         | 
| 86 | 
            +
                end
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                describe Hash do
         | 
| 89 | 
            +
                  it_should_behave_like "page storage"
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                  before(:all) do
         | 
| 92 | 
            +
                    @opts = {}
         | 
| 93 | 
            +
                  end
         | 
| 94 | 
            +
                end
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                describe Storage::PStore do
         | 
| 97 | 
            +
                  it_should_behave_like "page storage"
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                  before(:each) do
         | 
| 100 | 
            +
                    @test_file = 'test.pstore'
         | 
| 101 | 
            +
                    File.delete(@test_file) if File.exists?(@test_file)
         | 
| 102 | 
            +
                    @opts = {:storage => Storage.PStore(@test_file)}
         | 
| 103 | 
            +
                  end
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                  after(:each) do
         | 
| 106 | 
            +
                    File.delete(@test_file) if File.exists?(@test_file)
         | 
| 107 | 
            +
                  end
         | 
| 108 | 
            +
                end
         | 
| 109 | 
            +
             | 
| 110 | 
            +
                # Runs the shared "page storage" examples with Storage.TokyoCabinet as the
                # :storage option, using a scratch file in the current directory.
                describe Storage::TokyoCabinet do
                  it_should_behave_like "page storage"

                  before(:each) do
                    @test_file = 'test.tch'
                    # Start from a clean slate in case an earlier run left the file behind.
                    # File.exist? is used because File.exists? was removed in Ruby 3.2.
                    File.delete(@test_file) if File.exist?(@test_file)
                    @store = Storage.TokyoCabinet(@test_file)
                    @opts = {:storage => @store}
                  end

                  # Close the store before removing its file. A single hook makes that
                  # order explicit instead of depending on the order in which RSpec runs
                  # several after hooks; `ensure` still removes the file if close raises.
                  after(:each) do
                    begin
                      @store.close
                    ensure
                      File.delete(@test_file) if File.exist?(@test_file)
                    end
                  end
                end
         | 
| 127 | 
            +
             | 
| 128 | 
            +
                # Runs the shared "page storage" examples with Storage.SQLite3 as the
                # :storage option, using a scratch database file in the current directory.
                describe Storage::SQLite3 do
                  it_should_behave_like "page storage"

                  before(:each) do
                    @test_file = 'test.db'
                    # Start from a clean slate in case an earlier run left the file behind.
                    # File.exist? is used because File.exists? was removed in Ruby 3.2.
                    File.delete(@test_file) if File.exist?(@test_file)
                    @store = Storage.SQLite3(@test_file)
                    @opts = {:storage => @store}
                  end

                  # Close the store before removing its file. A single hook makes that
                  # order explicit instead of depending on the order in which RSpec runs
                  # several after hooks; `ensure` still removes the file if close raises.
                  after(:each) do
                    begin
                      @store.close
                    ensure
                      File.delete(@test_file) if File.exist?(@test_file)
                    end
                  end
                end
         | 
| 145 | 
            +
             | 
| 146 | 
            +
                # Runs the shared "page storage" examples with Storage.MongoDB (called
                # with no arguments) as the :storage option.
                describe Storage::MongoDB do
                  it_should_behave_like "page storage"

                  before(:each) do
                    # Keep a handle on the store so the teardown hook can close it.
                    @store = Storage.MongoDB
                    @opts = {:storage => @store}
                  end

                  after(:each) { @store.close }
                end
         | 
| 157 | 
            +
             | 
| 158 | 
            +
                # Runs the shared "page storage" examples with Storage.Redis (called
                # with no arguments) as the :storage option.
                describe Storage::Redis do
                  it_should_behave_like "page storage"

                  before(:each) do
                    # Keep a handle on the store so the teardown hook can close it.
                    @store = Storage.Redis
                    @opts = {:storage => @store}
                  end

                  after(:each) { @store.close }
                end
         | 
| 169 | 
            +
             | 
| 170 | 
            +
              end
         | 
| 171 | 
            +
            end
         |