spidr 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
data/lib/spidr/version.rb
@@ -1,4 +1,4 @@
 module Spidr
   # Spidr version
-  VERSION = '0.5.0'
+  VERSION = '0.6.0'
 end
data/spec/agent/actions_spec.rb
@@ -1,60 +1,186 @@
-require 'spidr/agent'
-
 require 'spec_helper'
+require 'example_app'
+
+require 'spidr/agent'
 
 describe Agent do
-  describe "actions" do
-    let(:url) { URI('http://spidr.rubyforge.org/') }
+  describe "#continue!" do
+    before { subject.pause = true }
+    before { subject.continue! }
+
+    it "should un-pause the Agent" do
+      expect(subject.paused?).to be false
+    end
+  end
+
+  describe "#pause=" do
+    it "should change the paused state" do
+      subject.pause = true
+
+      expect(subject.paused?).to be true
+    end
+  end
+
+  describe "#pause!" do
+    it "should raise Action::Paused" do
+      expect {
+        subject.pause!
+      }.to raise_error(described_class::Actions::Paused)
+    end
+  end
 
-    it "should be able to pause spidering" do
-      count = 0
-      agent = Agent.host('spidr.rubyforge.org') do |spider|
-        spider.every_page do |page|
-          count += 1
-          spider.pause! if count >= 2
+  describe "#paused?" do
+    context "when the agent is paused" do
+      before do
+        begin
+          subject.pause!
+        rescue described_class::Actions::Paused
         end
       end
 
-      expect(agent).to be_paused
-      expect(agent.history.length).to eq(2)
+      it { expect(subject.paused?).to be true }
+    end
+
+    context "when the agent is not paused" do
+      it { expect(subject.paused?).to be false }
     end
+  end
+
+  describe "#skip_link!" do
+    it "should raise Actions::SkipLink" do
+      expect {
+        subject.skip_link!
+      }.to raise_error(described_class::Actions::SkipLink)
+    end
+  end
+
+  describe "#skip_page!" do
+    it "should raise Actions::SkipPage" do
+      expect {
+        subject.skip_page!
+      }.to raise_error(described_class::Actions::SkipPage)
+    end
+  end
+
+  context "when spidering" do
+    include_context "example App"
+
+    context "when pause! is called" do
+      app do
+        get '/' do
+          %{<html><body><a href="/link">link</a></body></html>}
+        end
 
-    it "should be able to continue spidering after being paused" do
-      agent = Agent.new do |spider|
-        spider.every_page do |page|
-          spider.pause!
+        get '/link' do
+          %{<html><body>should not get here</body></html>}
         end
       end
 
-      agent.enqueue(url)
-      agent.continue!
+      subject do
+        described_class.new(host: host) do |agent|
+          agent.every_page do |page|
+            if page.url.path == '/'
+              agent.pause!
+            end
+          end
+        end
+      end
+
+      it "should pause spidering" do
+        expect(subject).to be_paused
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/")
+        ]
+      end
 
-      expect(agent.visited?(url)).to eq(true)
+      context "and continue! is called afterwards" do
+        before do
+          subject.enqueue "http://#{host}/link"
+          subject.continue!
+        end
+
+        it "should continue spidering" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/link")
+          ]
+        end
+      end
     end
 
-    it "should allow skipping of enqueued links" do
-      agent = Agent.new do |spider|
-        spider.every_url do |url|
-          spider.skip_link!
+    context "when skip_link! is called" do
+      app do
+        get '/' do
+          %{<html><body><a href="/link1">link1</a> <a href="/link2">link2</a> <a href="/link3">link3</a></body></html>}
+        end
+
+        get '/link1' do
+          %{<html><body>link1</body></html>}
+        end
+
+        get '/link2' do
+          %{<html><body>link2</body></html>}
+        end
+
+        get '/link3' do
+          %{<html><body>link3</body></html>}
         end
       end
 
-      agent.enqueue(url)
+      subject do
+        described_class.new(host: host) do |agent|
+          agent.every_url do |url|
+            if url.path == '/link2'
+              agent.skip_link!
+            end
+          end
+        end
+      end
 
-      expect(agent.queue).to be_empty
+      it "should skip all links on the page" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/link1"),
+          URI("http://#{host}/link3")
+        ]
+      end
     end
 
-    it "should allow skipping of visited pages" do
-      agent = Agent.new do |spider|
-        spider.every_page do |url|
-          spider.skip_page!
+    context "when skip_page! is called" do
+      app do
+        get '/' do
+          %{<html><body><a href="/link">entry link</a></body></html>}
+        end
+
+        get '/link' do
+          %{<html><body><a href="/link1">link1</a> <a href="/link2">link2</a></body></html>}
+        end
+
+        get '/link1' do
+          %{<html><body>should not get here</body></html>}
+        end
+
+        get '/link2' do
+          %{<html><body>should not get here</body></html>}
         end
       end
 
-      agent.visit_page(url)
+      subject do
+        described_class.new(host: host) do |agent|
+          agent.every_page do |page|
+            if page.url.path == '/link'
+              agent.skip_page!
+            end
+          end
+        end
+      end
 
-      expect(agent.history).to eq(Set[url])
-      expect(agent.queue).to be_empty
+      it "should skip all links on the page" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/link")
+        ]
+      end
     end
   end
 end
data/spec/agent/filters_spec.rb
@@ -3,60 +3,77 @@ require 'spidr/agent'
 require 'spec_helper'
 
 describe Agent do
-  describe "filters" do
-    it "should allow setting the acceptable schemes" do
-      agent = Agent.new
+  describe "#initialize_filters" do
+    describe ":schemes" do
+      it "should override the default schemes" do
+        agent = described_class.new(schemes: [:https])
 
-      agent.schemes = [:http]
-      expect(agent.schemes).to eq(['http'])
+        expect(agent.schemes).to be == ['https']
+      end
     end
 
-    it "should provide the hosts that will be visited" do
-      agent = Agent.new(hosts: ['spidr.rubyforge.org'])
+    describe ":hosts" do
+      it "should set the hosts that will be visited" do
+        agent = described_class.new(hosts: ['spidr.rubyforge.org'])
 
-      expect(agent.visit_hosts).to eq(['spidr.rubyforge.org'])
+        expect(agent.visit_hosts).to be == ['spidr.rubyforge.org']
+      end
     end
 
-    it "should provide the hosts that will not be visited" do
-      agent = Agent.new(ignore_hosts: ['example.com'])
+    describe ":ignore_hosts" do
+      it "should set the hosts that will not be visited" do
+        agent = described_class.new(ignore_hosts: ['example.com'])
 
-      expect(agent.ignore_hosts).to eq(['example.com'])
+        expect(agent.ignore_hosts).to be == ['example.com']
+      end
    end
 
-    it "should provide the ports that will be visited" do
-      agent = Agent.new(ports: [80, 443, 8000])
+    describe ":ports" do
+      it "should set the ports that will be visited" do
+        agent = described_class.new(ports: [80, 443, 8000])
 
-      expect(agent.visit_ports).to eq([80, 443, 8000])
+        expect(agent.visit_ports).to be == [80, 443, 8000]
+      end
    end
 
-    it "should provide the ports that will not be visited" do
-      agent = Agent.new(ignore_ports: [8000, 8080])
+    describe ":ignore_ports" do
+      it "should set the ports that will not be visited" do
+        agent = described_class.new(ignore_ports: [8000, 8080])
 
-      expect(agent.ignore_ports).to eq([8000, 8080])
+        expect(agent.ignore_ports).to be == [8000, 8080]
+      end
    end
 
-    it "should provide the links that will be visited" do
-      agent = Agent.new(links: ['index.php'])
+    describe ":links" do
+      it "should set the links that will be visited" do
+        agent = described_class.new(links: ['index.php'])
 
-      expect(agent.visit_links).to eq(['index.php'])
+        expect(agent.visit_links).to be == ['index.php']
+      end
    end
 
-    it "should provide the links that will not be visited" do
-      agent = Agent.new(ignore_links: [/login/])
+    describe ":ignore_links" do
+      it "should set the links that will not be visited" do
+        agent = described_class.new(ignore_links: [/login/])
 
-      expect(agent.ignore_links).to eq([/login/])
+        expect(agent.ignore_links).to be == [/login/]
+      end
    end
 
-    it "should provide the exts that will be visited" do
-      agent = Agent.new(exts: ['htm'])
+    describe ":exts" do
+      it "should set the exts that will be visited" do
+        agent = described_class.new(exts: ['htm'])
 
-      expect(agent.visit_exts).to eq(['htm'])
+        expect(agent.visit_exts).to be == ['htm']
+      end
    end
 
-    it "should provide the exts that will not be visited" do
-      agent = Agent.new(ignore_exts: ['cfm'])
+    describe ":ignore_exts" do
+      it "should set the exts that will not be visited" do
+        agent = described_class.new(ignore_exts: ['cfm'])
 
-      expect(agent.ignore_exts).to eq(['cfm'])
+        expect(agent.ignore_exts).to be == ['cfm']
+      end
    end
   end
 end
data/spec/agent/sanitizers_spec.rb
@@ -4,58 +4,52 @@ require 'spec_helper'
 
 describe Agent do
   describe "sanitizers" do
-    describe "sanitize_url" do
-      let(:url) { 'http://host.com' }
-      before(:all) { @agent = Agent.new }
+    describe "#sanitize_url" do
+      let(:url) { 'http://example.com/page?q=1#fragment' }
+      let(:uri) { URI(url) }
 
-      it "should sanitize URLs" do
-        agent = Agent.new
-        clean_url = agent.sanitize_url(URI(url))
+      it "should sanitize URIs" do
+        clean_url = subject.sanitize_url(uri)
 
-        expect(clean_url.host).to eq('host.com')
+        expect(clean_url.host).to eq('example.com')
       end
 
       it "should sanitize URLs given as Strings" do
-        agent = Agent.new
-        clean_url = agent.sanitize_url(url)
+        clean_url = subject.sanitize_url(url)
 
-        expect(clean_url.host).to eq('host.com')
+        expect(clean_url.host).to eq('example.com')
       end
-    end
-
-    describe "strip_fragments" do
-      let(:url) { URI("http://host.com/page#lol") }
 
       it "should strip fragment components by default" do
-        agent = Agent.new
-        clean_url = agent.sanitize_url(url)
+        clean_url = subject.sanitize_url(url)
 
         expect(clean_url.fragment).to be_nil
       end
 
-      it "should allow perserving fragment components" do
-        agent = Agent.new(strip_fragments: false)
-        clean_url = agent.sanitize_url(url)
+      it "should not strip query components by default" do
+        clean_url = subject.sanitize_url(uri)
 
-        expect(clean_url.fragment).to eq('lol')
+        expect(clean_url.query).to eq('q=1')
      end
-    end
 
-    describe "strip_query" do
-      let(:url) { URI("http://host.com/page?x=1") }
+      context "when strip_fragments is disabled" do
+        subject { described_class.new(strip_fragments: false) }
 
-      it "should not strip query components by default" do
-        agent = Agent.new
-        clean_url = agent.sanitize_url(url)
+        it "should perserve the fragment components" do
+          clean_url = subject.sanitize_url(uri)
 
-        expect(clean_url.query).to eq('x=1')
+          expect(clean_url.fragment).to eq('fragment')
+        end
       end
 
-      it "should allow stripping of query components" do
-        agent = Agent.new(strip_query: true)
-        clean_url = agent.sanitize_url(url)
+      context "when strip_query is enabled" do
+        subject { described_class.new(strip_query: true) }
+
+        it "should allow stripping of query components" do
+          clean_url = subject.sanitize_url(uri)
 
-        expect(clean_url.query).to be_nil
+          expect(clean_url.query).to be_nil
+        end
      end
    end
  end
data/spec/agent_spec.rb
@@ -1,81 +1,803 @@
-require 'spidr/agent'
-
 require 'spec_helper'
-require 'helpers/wsoc'
+require 'example_app'
+require 'settings/user_agent_examples'
+
+require 'spidr/agent'
 
 describe Agent do
-  include Helpers::WSOC
+  it_should_behave_like "includes Spidr::Settings::UserAgent"
+
+  describe "#initialize" do
+    it "should not be running" do
+      expect(subject).to_not be_running
+    end
+
+    it "should default :delay to 0" do
+      expect(subject.delay).to be 0
+    end
+
+    it "should initialize #history" do
+      expect(subject.history).to be_empty
+    end
+
+    it "should initialize #failures" do
+      expect(subject.failures).to be_empty
+    end
+
+    it "should initialize #queue" do
+      expect(subject.queue).to be_empty
+    end
 
-  before(:all) do
-    @agent = run_course
+    it "should initialize the #session_cache" do
+      expect(subject.sessions).to be_kind_of(SessionCache)
+    end
+
+    it "should initialize the #cookie_jar" do
+      expect(subject.cookies).to be_kind_of(CookieJar)
+    end
+
+    it "should initialize the #auth_store" do
+      expect(subject.authorized).to be_kind_of(AuthStore)
+    end
   end
 
-  it "should provide the history" do
-    expect(@agent.history).not_to be_empty
+  describe "#history=" do
+    let(:previous_history) { Set[URI('http://example.com')] }
+
+    before { subject.history = previous_history }
+
+    it "should be able to restore the history" do
+      expect(subject.history).to eq(previous_history)
+    end
+
+    context "when given an Array of URIs" do
+      let(:previous_history) { [URI('http://example.com')] }
+      let(:converted_history) { Set.new(previous_history) }
+
+      it "should convert the Array to a Set" do
+        expect(subject.history).to eq(converted_history)
+      end
+    end
+
+    context "when given an Set of Strings" do
+      let(:previous_history) { Set['http://example.com'] }
+      let(:converted_history) do
+        previous_history.map { |url| URI(url) }.to_set
+      end
+
+      it "should convert the Strings to URIs" do
+        expect(subject.history).to eq(converted_history)
+      end
+    end
   end
 
-  it "should provide the queue" do
-    expect(@agent.queue).to be_empty
+  describe "#failures=" do
+    let(:previous_failures) { Set[URI('http://example.com')] }
+
+    before { subject.failures = previous_failures }
+
+    it "should be able to restore the failures" do
+      expect(subject.failures).to eq(previous_failures)
+    end
+
+    context "when given an Array of URIs" do
+      let(:previous_failures) { [URI('http://example.com')] }
+      let(:converted_failures) { Set.new(previous_failures) }
+
+      it "should convert the Array to a Set" do
+        expect(subject.failures).to eq(converted_failures)
+      end
+    end
+
+    context "when given an Set of Strings" do
+      let(:previous_failures) { Set['http://example.com'] }
+      let(:converted_failures) do
+        previous_failures.map { |url| URI(url) }.to_set
+      end
+
+      it "should convert the Strings to URIs" do
+        expect(subject.failures).to eq(converted_failures)
+      end
+    end
   end
 
-  it "should be able to restore the history" do
-    agent = Agent.new
-    previous_history = Set[URI('http://www.example.com')]
+  describe "#queue=" do
+    let(:previous_queue) { [URI('http://example.com')] }
+
+    before { subject.queue = previous_queue }
+
+    it "should be able to restore the queue" do
+      expect(subject.queue).to eq(previous_queue)
+    end
 
-    agent.history = previous_history
-    expect(agent.history).to eq(previous_history)
+    context "when given an Set of URIs" do
+      let(:previous_queue) { Set[URI('http://example.com')] }
+      let(:converted_queue) { previous_queue.to_a }
+
+      it "should convert the Set to an Array" do
+        expect(subject.queue).to eq(converted_queue)
+      end
+    end
+
+    context "when given an Array of Strings" do
+      let(:previous_queue) { Set['http://example.com'] }
+      let(:converted_queue) { previous_queue.map { |url| URI(url) } }
+
+      it "should convert the Strings to URIs" do
+        expect(subject.queue).to eq(converted_queue)
+      end
+    end
   end
 
-  it "should convert new histories to an Set of URIs" do
-    agent = Agent.new
-    previous_history = ['http://www.example.com']
-    expected_history = Set[URI('http://www.example.com')]
+  describe "#to_hash" do
+    let(:queue) { [URI("http://example.com/link")] }
+    let(:history) { Set[URI("http://example.com/")] }
+
+    subject do
+      described_class.new do |agent|
+        agent.queue = queue
+        agent.history = history
+      end
+    end
 
-    agent.history = previous_history
-    expect(agent.history).not_to eq(previous_history)
-    expect(agent.history).to eq(expected_history)
+    it "should return the queue and history" do
+      expect(subject.to_hash).to be == {
+        history: history,
+        queue: queue
+      }
+    end
   end
 
-  it "should be able to restore the failures" do
-    agent = Agent.new
-    previous_failures = Set[URI('http://localhost/')]
+  context "when spidering" do
+    include_context "example App"
+
+    context "local links" do
+      context "relative paths" do
+        app do
+          get '/' do
+            %{<html><body><a href="link">relative link</a></body></html>}
+          end
+
+          get '/link' do
+            '<html><body>got here</body></html>'
+          end
+        end
+
+        it "should expand relative paths of links" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/link")
+          ]
+        end
+
+        context "that contain directory escapes" do
+          app do
+            get '/' do
+              %{<html><body><a href="foo/./../../../../link">link</a></body></html>}
+            end
+
+            get '/link' do
+              '<html><body>got here</body></html>'
+            end
+          end
+
+          it "should expand relative paths before visiting them" do
+            expect(subject.history).to be == Set[
+              URI("http://#{host}/"),
+              URI("http://#{host}/link")
+            ]
+          end
+        end
+      end
+
+      context "absolute paths" do
+        app do
+          get '/' do
+            %{<html><body><a href="/link">absolute path</a></body></html>}
+          end
+
+          get '/link' do
+            '<html><body>got here</body></html>'
+          end
+        end
+
+        it "should visit links with absolute paths" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/link")
+          ]
+        end
+
+        context "that contain directory escapes" do
+          app do
+            get '/' do
+              %{<html><body><a href="/foo/./../../../../link">link</a></body></html>}
+            end
+
+            get '/link' do
+              '<html><body>got here</body></html>'
+            end
+          end
+
+          it "should expand absolute links before visiting them" do
+            expect(subject.history).to be == Set[
+              URI("http://#{host}/"),
+              URI("http://#{host}/link")
+            ]
+          end
+        end
+
+      end
+    end
+
+    context "remote links" do
+      app do
+        get '/' do
+          %{<html><body><a href="http://#{settings.host}/link">absolute link</a></body></html>}
+        end
+
+        get '/link' do
+          '<html><body>got here</body></html>'
+        end
+      end
+
+      it "should visit absolute links" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/link")
+        ]
+      end
+
+      context "that contain directory escapes" do
+        app do
+          get '/' do
+            %{<html><body><a href="http://#{settings.host}/foo/./../../../../link">link</a></body></html>}
+          end
+
+          get '/link' do
+            '<html><body>got here</body></html>'
+          end
+        end
+
+        it "should expand absolute links before visiting them" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/link")
+          ]
+        end
+      end
+    end
+
+    context "self-referential links" do
+      app do
+        get '/' do
+          %{<html><body><a href="/">same page</a></body></html>}
+        end
+      end
+
+      it "should ignore self-referential links" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/")
+        ]
+      end
+    end
+
+    context "circular links" do
+      app do
+        get '/' do
+          %{<html><body><a href="/link">link</a></body></html>}
+        end
+
+        get '/link' do
+          %{<html><body><a href="/">previous page</a></body></html>}
+        end
+      end
+
+      it "should ignore links that have been previous visited" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/link")
+        ]
+      end
+    end
+
+    context "link cycles" do
+      app do
+        get '/' do
+          %{<html><body><a href="/link1">first link</a></body></html>}
+        end
+
+        get '/link1' do
+          %{<html><body><a href="/link2">next link</a></body></html>}
+        end
+
+        get '/link2' do
+          %{<html><body><a href="/">back to the beginning</a></body></html>}
+        end
+      end
+
+      it "should ignore links that have been previous visited" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/link1"),
+          URI("http://#{host}/link2"),
+        ]
+      end
+    end
+
+    context "fragment links" do
+      app do
+        get '/' do
+          %{<html><body><a href="#fragment">fragment link</a></body></html>}
+        end
+      end
+
+      it "should ignore fragment links" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/")
+        ]
+      end
+    end
+
+    context "empty links" do
+      context "empty href" do
+        app do
+          get '/' do
+            %{<html><body><a href="">empty link</a> <a href=" ">blank link</a> <a>no href</a></body></html>}
+          end
+        end
+
+        it "should ignore links with empty hrefs" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/")
+          ]
+        end
+      end
+
+      context "whitespace href" do
+        app do
+          get '/' do
+            %{<html><body><a href=" ">blank link</a></body></html>}
+          end
+        end
+
+        it "should ignore links containing only whitespace" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/")
+          ]
+        end
+      end
+
+      context "missing href" do
+        app do
+          get '/' do
+            %{<html><body><a>no href</a></body></html>}
+          end
+        end
+
+        it "should ignore links with no href" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/")
+          ]
+        end
+      end
+    end
+
+    context "frames" do
+      app do
+        get '/' do
+          %{<html><body><frameset><frame src="/frame" /></frameset></body></html>}
+        end
 
-    agent.failures = previous_failures
-    expect(agent.failures).to eq(previous_failures)
+        get '/frame' do
+          %{<html><body><a href="/link">link</a></body></html>}
+        end
+
+        get '/link' do
+          %{<html><body>got here</body></html>}
+        end
+      end
+
+      it "should visit the frame and links within the frame" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/frame"),
+          URI("http://#{host}/link")
+        ]
+      end
+    end
+
+    context "iframes" do
+      app do
+        get '/' do
+          %{<html><body><iframe src="/iframe" /></body></html>}
+        end
+
+        get '/iframe' do
+          %{<html><body><a href="/link">link</a></body></html>}
+        end
+
+        get '/link' do
+          %{<html><body>got here</body></html>}
+        end
+      end
+
+      it "should visit the iframe and links within the iframe" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/iframe"),
+          URI("http://#{host}/link")
+        ]
+      end
+    end
+
+    context "javascript links" do
+      app do
+        get '/' do
+          %{<html><body><a href="javascript:fail();">javascript link</a></body></html>}
+        end
+      end
+
+      it "should ignore javascript: links" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/")
+        ]
+      end
+
+      context "when the link has an onclick action" do
+        app do
+          get '/' do
+            %{<html><body><a href="#" onclick="javascript:fail();">onclick link</a></body></html>}
+          end
+        end
+
+        it "should ignore links with onclick actions" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/")
+          ]
+        end
+      end
+    end
+
+    context "cookies" do
+      app do
+        get '/' do
+          response.set_cookie 'visited', 'true'
+
+          %{<html><body><a href="/link">link</a></body></html>}
+        end
+
+        get '/link' do
+          if request.cookies['visited'] == 'true'
+            %{<html><body>got here</body></html>}
+          else
+            halt 401, "Cookie not set"
+          end
+        end
+      end
+
+      it "should record cookies and send them with each request" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/link"),
+        ]
+
+        expect(subject.cookies[host]).to be == {'visited' => 'true'}
+      end
+    end
+
+    context "redirects" do
+      context "300" do
+        app do
+          get '/' do
+            %{<html><body><a href="/redirect">redirect</a></body></html>}
+          end
+
+          get '/redirect' do
+            redirect to('/link'), 300
+          end
+
+          get '/link' do
+            %{<html><body>got here</body></html>}
+          end
+        end
+
+        it "should follow HTTP 300 redirects" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/redirect"),
+            URI("http://#{host}/link"),
+          ]
+        end
+      end
+
+      context "301" do
+        app do
+          get '/' do
+            %{<html><body><a href="/redirect">redirect</a></body></html>}
+          end
+
+          get '/redirect' do
+            redirect to('/link'), 301
+          end
+
+          get '/link' do
+            %{<html><body>got here</body></html>}
+          end
+        end
+
+        it "should follow HTTP 301 redirects" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/redirect"),
+            URI("http://#{host}/link"),
+          ]
+        end
+      end
+
+      context "302" do
+        app do
+          get '/' do
+            %{<html><body><a href="/redirect">redirect</a></body></html>}
+          end
+
+          get '/redirect' do
+            redirect to('/link'), 302
+          end
+
+          get '/link' do
+            %{<html><body>got here</body></html>}
+          end
+        end
+
+        it "should follow HTTP 302 redirects" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/redirect"),
+            URI("http://#{host}/link"),
+          ]
+        end
+      end
+
+      context "303" do
+        app do
+          get '/' do
+            %{<html><body><a href="/redirect">redirect</a></body></html>}
+          end
+
+          get '/redirect' do
+            redirect to('/link'), 303
+          end
+
+          get '/link' do
+            %{<html><body>got here</body></html>}
+          end
+        end
+
+        it "should follow HTTP 303 redirects" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/redirect"),
+            URI("http://#{host}/link"),
+          ]
+        end
+      end
+
+      context "307" do
+        app do
+          get '/' do
+            %{<html><body><a href="/redirect">redirect</a></body></html>}
+          end
+
+          get '/redirect' do
+            redirect to('/link'), 307
+          end
+
+          get '/link' do
+            %{<html><body>got here</body></html>}
+          end
+        end
+
+        it "should follow HTTP 307 redirects" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/redirect"),
+            URI("http://#{host}/link"),
+          ]
+        end
+      end
+
+      context "meta-refresh" do
+        app do
+          get '/' do
+            %{<html><body><a href="/redirect">redirect</a></body></html>}
+          end
+
+          get '/redirect' do
+            %{<html><head><meta http-equiv="refresh" content="0; url=http://#{settings.host}/link" /></head><body>Redirecting...</body></html>}
+          end
+
+          get '/link' do
+            %{<html><body>got here</body></html>}
+          end
+        end
+
+        it "should follow meta-refresh redirects" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/redirect"),
+            URI("http://#{host}/link"),
+          ]
+        end
+      end
+    end
+
+    context "Basic-Auth" do
+      app do
+        set :user, 'admin'
+        set :password, 'swordfish'
+
+        get '/' do
+          %{<html><body><a href="/private">private link</a></body></html>}
+        end
+
+        get '/private' do
+          auth = Rack::Auth::Basic::Request.new(request.env)
+
+          if auth.provided? && auth.basic? && auth.credentials && \
+             auth.credentials == [settings.user, settings.password]
+            %{<html><body>got here</body></html>}
+          else
+            headers['WWW-Authenticate'] = %{Basic realm="Restricted Area"}
+            halt 401, "<html><body><h1>Not authorized</h1></body></html>"
+          end
+        end
+      end
+
+      before do
+        subject.authorized.add("http://#{host}/private", app.user, app.password)
+      end
+
+      it "should send HTTP Basic-Auth credentials for protected URLs" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/private")
+        ]
+      end
+    end
   end
 
-  it "should convert new histories to a Set of URIs" do
-    agent = Agent.new
-    previous_failures = ['http://localhost/']
-    expected_failures = Set[URI('http://localhost/')]
+  context "when :host is specified" do
+    include_context "example App"
 
-    agent.failures = previous_failures
-    expect(agent.failures).not_to eq(previous_failures)
-    expect(agent.failures).to eq(expected_failures)
+    subject { described_class.new(host: host) }
+
+    app do
+      get '/' do
+        %{<html><body><a href="http://google.com/">external link</a> <a href="/link">local link</a></body></html>}
+      end
+
+      get '/link' do
+        %{<html><body>got here</body></html>}
+      end
+    end
+
+    it "should only visit links on the host" do
+      expect(subject.history).to be == Set[
+        URI("http://#{host}/"),
+        URI("http://#{host}/link")
+      ]
+    end
   end
 
-  it "should be able to restore the queue" do
-    agent = Agent.new
-    previous_queue = [URI('http://www.example.com')]
+  context "when :limit is set" do
+    include_context "example App"
+
+    let(:limit) { 10 }
 
-    agent.queue = previous_queue
-    expect(agent.queue).to eq(previous_queue)
+    subject { described_class.new(host: host, limit: limit) }
+
+    app do
+      get '/' do
+        i = Integer(params['i'] || 0)
+
+        %{<html><body><a href="/?i=#{i+1}">next link</a></body></html>}
+      end
+    end
+
+    it "must only visit the maximum number of links" do
+      expect(subject.history).to be == Set[
+        URI("http://#{host}/"),
+        URI("http://#{host}/?i=1"),
+        URI("http://#{host}/?i=2"),
+        URI("http://#{host}/?i=3"),
+        URI("http://#{host}/?i=4"),
+        URI("http://#{host}/?i=5"),
+        URI("http://#{host}/?i=6"),
+        URI("http://#{host}/?i=7"),
+        URI("http://#{host}/?i=8"),
+        URI("http://#{host}/?i=9"),
+      ]
+    end
   end
 
-  it "should convert new queues to an Array of URIs" do
-    agent = Agent.new
-    previous_queue = ['http://www.example.com']
-    expected_queue = [URI('http://www.example.com')]
+  context "when :depth is set" do
+    include_context "example App"
+
+    app do
+      get '/' do
+        %{<html><body><a href="/left?d=1">left</a><a href="/right?d=1">right</a></body></html>}
+      end
+
+      get %r{^/left|/right} do
+        d = Integer(params['d'])
+
+        %{<html><body><a href="/left?d=#{d+1}">left</a><a href="/right?d=#{d+1}">right</a></body></html>}
+      end
+    end
+
+    context "depth 0" do
+      subject { described_class.new(host: host, max_depth: 0) }
+
+      it "must only visit the first page" do
+        expect(subject.history).to be == Set[URI("http://#{host}/")]
+      end
+    end
 
-    agent.queue = previous_queue
-    expect(agent.queue).not_to eq(previous_queue)
-    expect(agent.queue).to eq(expected_queue)
+    context "depth > 0" do
+      subject { described_class.new(host: host, max_depth: 2) }
+
+      it "must visit links below the maximum depth" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/left?d=1"),
+          URI("http://#{host}/right?d=1"),
+          URI("http://#{host}/left?d=2"),
+          URI("http://#{host}/right?d=2")
+        ]
+      end
+    end
   end
 
-  it "should provide a to_hash method that returns the queue and history" do
-    hash = @agent.to_hash
+  context "when :robots is enabled" do
+    include_context "example App"
+
+    let(:user_agent) { 'Ruby' }
+
+    subject do
+      described_class.new(
+        host: host,
+        user_agent: user_agent,
+        robots: true
+      )
+    end
+
+    app do
+      get '/' do
+        %{<html><body><a href="/secret">don't follow this link</a> <a href="/pub">follow this link</a></body></html>}
+      end
+
+      get '/pub' do
+        %{<html><body>got here</body></html>}
+      end
+
+      get '/robots.txt' do
+        content_type 'text/plain'
+
+        [
+          "User-agent: *",
+          'Disallow: /',
+        ].join($/)
+      end
+    end
+
+    it "should not follow links Disallowed by robots.txt" do
+      pending "https://github.com/bblimke/webmock/issues/642"
 
-    expect(hash[:queue]).to be_empty
-    expect(hash[:history]).not_to be_empty
+      expect(subject.history).to be == Set[
+        URI("http://#{host}/"),
+        URI("http://#{host}/pub")
+      ]
+    end
   end
 end