sutch-anemone 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/CHANGELOG.rdoc +136 -0
- data/LICENSE.txt +19 -0
- data/README.rdoc +38 -0
- data/Rakefile +23 -0
- data/VERSION +1 -0
- data/bin/anemone +4 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/lib/anemone/cli/cron.rb +90 -0
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/cookie_store.rb +35 -0
- data/lib/anemone/core.rb +339 -0
- data/lib/anemone/exceptions.rb +5 -0
- data/lib/anemone/http.rb +187 -0
- data/lib/anemone/page.rb +217 -0
- data/lib/anemone/page_store.rb +161 -0
- data/lib/anemone/resource.rb +42 -0
- data/lib/anemone/storage.rb +44 -0
- data/lib/anemone/storage/base.rb +75 -0
- data/lib/anemone/storage/exceptions.rb +15 -0
- data/lib/anemone/storage/kyoto_cabinet.rb +72 -0
- data/lib/anemone/storage/mongodb.rb +89 -0
- data/lib/anemone/storage/pstore.rb +50 -0
- data/lib/anemone/storage/redis.rb +90 -0
- data/lib/anemone/storage/sqlite3.rb +90 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +60 -0
- data/lib/anemone/tentacle.rb +39 -0
- data/spec/anemone_spec.rb +16 -0
- data/spec/cookie_store_spec.rb +28 -0
- data/spec/core_spec.rb +344 -0
- data/spec/fakeweb_helper.rb +77 -0
- data/spec/http_spec.rb +19 -0
- data/spec/page_spec.rb +186 -0
- data/spec/page_store_spec.rb +171 -0
- data/spec/resource_spec.rb +91 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/storage_spec.rb +252 -0
- metadata +281 -0

data/lib/anemone/storage/tokyo_cabinet.rb
ADDED

@@ -0,0 +1,60 @@
+begin
+  require 'tokyocabinet'
+rescue LoadError
+  puts $!
+  puts "You need the tokyocabinet gem to use Anemone::Storage::TokyoCabinet"
+  exit
+end
+
+require 'forwardable'
+
+module Anemone
+  module Storage
+    class TokyoCabinet
+      extend Forwardable
+
+      def_delegators :@db, :close, :size, :keys, :has_key?
+
+      def initialize(file)
+        raise "TokyoCabinet filename must have .tch extension" if File.extname(file) != '.tch'
+        @db = ::TokyoCabinet::HDB::new
+        @db.open(file, ::TokyoCabinet::HDB::OWRITER | ::TokyoCabinet::HDB::OCREAT)
+        @db.clear
+      end
+
+      def [](key)
+        if value = @db[key]
+          load_value(value)
+        end
+      end
+
+      def []=(key, value)
+        @db[key] = [Marshal.dump(value)].pack("m")
+      end
+
+      def delete(key)
+        value = self[key]
+        @db.delete(key)
+        value
+      end
+
+      def each
+        @db.keys.each do |k|
+          yield(k, self[k])
+        end
+      end
+
+      def merge!(hash)
+        hash.each { |key, value| self[key] = value }
+        self
+      end
+
+      private
+
+      def load_value(value)
+        Marshal.load(value.unpack("m")[0])
+      end
+
+    end
+  end
+end
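
(This usage sketch is not part of the package diff. The URL and filename are placeholders; the Anemone::Storage.TokyoCabinet factory call mirrors the one exercised in data/spec/core_spec.rb below.)

require 'anemone'
require 'anemone/storage/tokyo_cabinet'

# The adapter insists on a .tch filename, per the initializer above.
Anemone.crawl('http://example.com/',
              :storage => Anemone::Storage.TokyoCabinet('crawl.tch')) do |anemone|
  anemone.on_every_page { |page| puts page.url }
end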

data/lib/anemone/tentacle.rb
ADDED

@@ -0,0 +1,39 @@
+require 'anemone/http'
+
+module Anemone
+  class Tentacle
+
+    #
+    # Create a new Tentacle
+    #
+    def initialize(link_queue, page_queue, opts = {})
+      @link_queue = link_queue
+      @page_queue = page_queue
+      @http = Anemone::HTTP.new(opts)
+      @opts = opts
+    end
+
+    #
+    # Gets links from @link_queue, and returns the fetched
+    # Page objects into @page_queue
+    #
+    def run
+      loop do
+        link, referer, depth = @link_queue.deq
+
+        break if link == :END
+
+        @http.fetch_pages(link, referer, depth).each { |page| @page_queue << page }
+
+        delay
+      end
+    end
+
+    private
+
+    def delay
+      sleep @opts[:delay] if @opts[:delay] > 0
+    end
+
+  end
+end
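
(Also not from the diff: a minimal sketch of the queue protocol a Tentacle expects. Assumptions: plain Thread::Queue objects, the :END sentinel used in #run above, and that Anemone::HTTP's internal defaults suffice for the opts hash; in the real crawler, Core supplies full default options, and the fetch below performs an actual network request.)

require 'thread'
require 'uri'
require 'anemone/tentacle'

link_queue, page_queue = Queue.new, Queue.new
tentacle = Anemone::Tentacle.new(link_queue, page_queue, :delay => 0)

worker = Thread.new { tentacle.run }
link_queue << [URI('http://example.com/'), nil, 0]  # destructured as link, referer, depth
link_queue << :END                                  # breaks the loop in #run
worker.join
puts page_queue.size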

data/spec/anemone_spec.rb
ADDED

@@ -0,0 +1,16 @@
+$:.unshift(File.dirname(__FILE__))
+require 'spec_helper'
+
+describe Anemone do
+
+  it "should have a version" do
+    Anemone.const_defined?('VERSION').should == true
+  end
+
+  it "should return a Anemone::Core from the crawl, which has a PageStore" do
+    result = Anemone.crawl(SPEC_DOMAIN)
+    result.should be_an_instance_of(Anemone::Core)
+    result.pages.should be_an_instance_of(Anemone::PageStore)
+  end
+
+end

data/spec/cookie_store_spec.rb
ADDED

@@ -0,0 +1,28 @@
+$:.unshift(File.dirname(__FILE__))
+require 'spec_helper'
+
+module Anemone
+  describe CookieStore do
+
+    it "should start out empty if no cookies are specified" do
+      CookieStore.new.empty?.should be true
+    end
+
+    it "should accept a Hash of cookies in the constructor" do
+      CookieStore.new({'test' => 'cookie'})['test'].value.should == 'cookie'
+    end
+
+    it "should be able to merge an HTTP cookie string" do
+      cs = CookieStore.new({'a' => 'a', 'b' => 'b'})
+      cs.merge! "a=A; path=/, c=C; path=/"
+      cs['a'].value.should == 'A'
+      cs['b'].value.should == 'b'
+      cs['c'].value.should == 'C'
+    end
+
+    it "should have a to_s method to turn the cookies into a string for the HTTP Cookie header" do
+      CookieStore.new({'a' => 'a', 'b' => 'b'}).to_s.should == 'a=a;b=b'
+    end
+
+  end
+end
    
data/spec/core_spec.rb
ADDED
    
@@ -0,0 +1,344 @@
+$:.unshift(File.dirname(__FILE__))
+require 'spec_helper'
+%w[pstore tokyo_cabinet sqlite3].each { |file| require "anemone/storage/#{file}.rb" }
+
+module Anemone
+  describe Core do
+
+    before(:each) do
+      FakeWeb.clean_registry
+    end
+
+    shared_examples_for "crawl" do
+      it "should crawl all the html pages in a domain by following <a> href's" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1', :links => ['3'])
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+
+        Anemone.crawl(pages[0].url, @opts).should have(4).pages
+      end
+
+      it "should not follow links that leave the original domain" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include('http://www.other.com/')
+      end
+
+      it "should not follow redirects that leave the original domain" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'], :redirect => 'http://www.other.com/')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include('http://www.other.com/')
+      end
+
+      it "should follow http redirects" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1', :redirect => '2')
+        pages << FakePage.new('2')
+
+        Anemone.crawl(pages[0].url, @opts).should have(3).pages
+      end
+
+      it "should follow with HTTP basic authentication" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'], :auth => true)
+        pages << FakePage.new('1', :links => ['3'], :auth => true)
+
+        Anemone.crawl(pages.first.auth_url, @opts).should have(3).pages
+      end
+
+      it "should accept multiple starting URLs" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2', :links => ['3'])
+        pages << FakePage.new('3')
+
+        Anemone.crawl([pages[0].url, pages[2].url], @opts).should have(4).pages
+      end
+
+      it "should include the query string when following links" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1?foo=1'])
+        pages << FakePage.new('1?foo=1')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[2].url)
+      end
+
+      it "should be able to skip links with query strings" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1?foo=1', '2'])
+        pages << FakePage.new('1?foo=1')
+        pages << FakePage.new('2')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.skip_query_strings = true
+        end
+
+        core.should have(2).pages
+      end
+
+      it "should be able to skip links based on a RegEx" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.skip_links_like /1/, /3/
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+        core.pages.keys.should_not include(pages[3].url)
+      end
+
+      it "should be able to call a block on every page" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+
+        count = 0
+        Anemone.crawl(pages[0].url, @opts) do |a|
+          a.on_every_page { count += 1 }
+        end
+
+        count.should == 3
+      end
+
+      it "should not discard page bodies by default" do
+        Anemone.crawl(FakePage.new('0').url, @opts).pages.values#.first.doc.should_not be_nil
+      end
+
+      it "should optionally discard page bodies to conserve memory" do
+       # core = Anemone.crawl(FakePage.new('0').url, @opts.merge({:discard_page_bodies => true}))
+       # core.pages.values.first.doc.should be_nil
+      end
+
+      it "should provide a focus_crawl method to select the links on each page to follow" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+      end
+
+      it "should optionally delay between page requests" do
+        delay = 0.25
+
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+
+        start = Time.now
+        Anemone.crawl(pages[0].url, @opts.merge({:delay => delay}))
+        finish = Time.now
+
+        (finish - start).should satisfy {|t| t > delay * 2}
+      end
+
+      it "should optionally obey the robots exclusion protocol" do
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+        pages << FakePage.new('robots.txt',
+                              :body => "User-agent: *\nDisallow: /1",
+                              :content_type => 'text/plain')
+
+        core = Anemone.crawl(pages[0].url, @opts.merge({:obey_robots_txt => true}))
+        urls = core.pages.keys
+
+        urls.should include(pages[0].url)
+        urls.should_not include(pages[1].url)
+      end
+
+      it "should be able to set cookies to send with HTTP requests" do
+        cookies = {:a => '1', :b => '2'}
+        core = Anemone.crawl(FakePage.new('0').url) do |anemone|
+          anemone.cookies = cookies
+        end
+        core.opts[:cookies].should == cookies
+      end
+
+      it "should freeze the options once the crawl begins" do
+        core = Anemone.crawl(FakePage.new('0').url) do |anemone|
+          anemone.threads = 4
+          anemone.on_every_page do
+            lambda {anemone.threads = 2}.should raise_error
+          end
+        end
+        core.opts[:threads].should == 4
+      end
+
+      describe "many pages" do
+        before(:each) do
+          @pages, size = [], 5
+
+          size.times do |n|
+            # register this page with a link to the next page
+            link = (n + 1).to_s if n + 1 < size
+            @pages << FakePage.new(n.to_s, :links => Array(link))
+          end
+        end
+
+        it "should track the page depth and referer" do
+          core = Anemone.crawl(@pages[0].url, @opts)
+          previous_page = nil
+
+          @pages.each_with_index do |page, i|
+            page = core.pages[page.url]
+            page.should be
+            page.depth.should == i
+
+            if previous_page
+              page.referer.should == previous_page.url
+            else
+              page.referer.should be_nil
+            end
+            previous_page = page
+          end
+        end
+
+        it "should optionally limit the depth of the crawl" do
+          core = Anemone.crawl(@pages[0].url, @opts.merge({:depth_limit => 3}))
+          core.should have(4).pages
+        end
+      end
+
+    end
+
+    describe Hash do
+      it_should_behave_like "crawl"
+
+      before(:all) do
+        @opts = {}
+      end
+    end
+
+    describe Storage::PStore do
+      it_should_behave_like "crawl"
+
+      before(:all) do
+        @test_file = 'test.pstore'
+      end
+
+      before(:each) do
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => Storage.PStore(@test_file)}
+      end
+
+      after(:each) do
+        File.delete(@test_file) if File.exists?(@test_file)
+      end
+    end
+
+    describe Storage::TokyoCabinet do
+      it_should_behave_like "crawl"
+
+      before(:all) do
+        @test_file = 'test.tch'
+      end
+
+      before(:each) do
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
+      end
+
+      after(:each) do
+        @store.close
+      end
+
+      after(:each) do
+        File.delete(@test_file) if File.exists?(@test_file)
+      end
+    end
+
+    describe Storage::SQLite3 do
+      it_should_behave_like "crawl"
+
+      before(:all) do
+        @test_file = 'test.db'
+      end
+
+      before(:each) do
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => @store = Storage.SQLite3(@test_file)}
+      end
+
+      after(:each) do
+        @store.close
+      end
+
+      after(:each) do
+        File.delete(@test_file) if File.exists?(@test_file)
+      end
+    end
+
+    describe "options" do
+      it "should accept options for the crawl" do
+        core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
+                                          :threads => 2,
+                                          :discard_page_bodies => true,
+                                          :user_agent => 'test',
+                                          :obey_robots_txt => true,
+                                          :depth_limit => 3)
+
+        core.opts[:verbose].should == false
+        core.opts[:threads].should == 2
+        core.opts[:discard_page_bodies].should == true
+        core.opts[:delay].should == 0
+        core.opts[:user_agent].should == 'test'
+        core.opts[:obey_robots_txt].should == true
+        core.opts[:depth_limit].should == 3
+      end
+
+      it "should accept options via setter methods in the crawl block" do
+        core = Anemone.crawl(SPEC_DOMAIN) do |a|
+          a.verbose = false
+          a.threads = 2
+          a.discard_page_bodies = true
+          a.user_agent = 'test'
+          a.obey_robots_txt = true
+          a.depth_limit = 3
+        end
+
+        core.opts[:verbose].should == false
+        core.opts[:threads].should == 2
+        core.opts[:discard_page_bodies].should == true
+        core.opts[:delay].should == 0
+        core.opts[:user_agent].should == 'test'
+        core.opts[:obey_robots_txt].should == true
+        core.opts[:depth_limit].should == 3
+      end
+
+      it "should use 1 thread if a delay is requested" do
+        Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
+      end
+    end
+
+  end
+end
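
(Read together, these specs double as API documentation. A condensed, hypothetical crawl combining options they exercise; the URL and the /logout/ pattern are placeholders.)

require 'anemone'

Anemone.crawl('http://example.com/', :threads => 2, :depth_limit => 3) do |anemone|
  anemone.obey_robots_txt = true
  anemone.skip_links_like /logout/
  anemone.on_every_page { |page| puts "#{page.depth} #{page.url}" }
end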