spidr 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
data/lib/spidr/version.rb
@@ -1,4 +1,4 @@
 module Spidr
   # Spidr version
-  VERSION = '0.5.0'
+  VERSION = '0.6.0'
 end
data/spec/agent/actions_spec.rb
@@ -1,60 +1,186 @@
-require 'spidr/agent'
-
 require 'spec_helper'
+require 'example_app'
+
+require 'spidr/agent'
 
 describe Agent do
-  describe "actions" do
-    let(:url) { URI('http://spidr.rubyforge.org/') }
+  describe "#continue!" do
+    before { subject.pause = true }
+    before { subject.continue! }
+
+    it "should un-pause the Agent" do
+      expect(subject.paused?).to be false
+    end
+  end
+
+  describe "#pause=" do
+    it "should change the paused state" do
+      subject.pause = true
+
+      expect(subject.paused?).to be true
+    end
+  end
+
+  describe "#pause!" do
+    it "should raise Action::Paused" do
+      expect {
+        subject.pause!
+      }.to raise_error(described_class::Actions::Paused)
+    end
+  end
 
-    it "should be able to pause spidering" do
-      count = 0
-      agent = Agent.host('spidr.rubyforge.org') do |spider|
-        spider.every_page do |page|
-          count += 1
-          spider.pause! if count >= 2
+  describe "#paused?" do
+    context "when the agent is paused" do
+      before do
+        begin
+          subject.pause!
+        rescue described_class::Actions::Paused
         end
       end
 
-      expect(agent).to be_paused
-      expect(agent.history.length).to eq(2)
+      it { expect(subject.paused?).to be true }
+    end
+
+    context "when the agent is not paused" do
+      it { expect(subject.paused?).to be false }
     end
+  end
+
+  describe "#skip_link!" do
+    it "should raise Actions::SkipLink" do
+      expect {
+        subject.skip_link!
+      }.to raise_error(described_class::Actions::SkipLink)
+    end
+  end
+
+  describe "#skip_page!" do
+    it "should raise Actions::SkipPage" do
+      expect {
+        subject.skip_page!
+      }.to raise_error(described_class::Actions::SkipPage)
+    end
+  end
+
+  context "when spidering" do
+    include_context "example App"
+
+    context "when pause! is called" do
+      app do
+        get '/' do
+          %{<html><body><a href="/link">link</a></body></html>}
+        end
 
-    it "should be able to continue spidering after being paused" do
-      agent = Agent.new do |spider|
-        spider.every_page do |page|
-          spider.pause!
+        get '/link' do
+          %{<html><body>should not get here</body></html>}
         end
       end
 
-      agent.enqueue(url)
-      agent.continue!
+      subject do
+        described_class.new(host: host) do |agent|
+          agent.every_page do |page|
+            if page.url.path == '/'
+              agent.pause!
+            end
+          end
+        end
+      end
+
+      it "should pause spidering" do
+        expect(subject).to be_paused
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/")
+        ]
+      end
 
-      expect(agent.visited?(url)).to eq(true)
+      context "and continue! is called afterwards" do
+        before do
+          subject.enqueue "http://#{host}/link"
+          subject.continue!
+        end
+
+        it "should continue spidering" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/link")
+          ]
+        end
+      end
     end
 
-    it "should allow skipping of enqueued links" do
-      agent = Agent.new do |spider|
-        spider.every_url do |url|
-          spider.skip_link!
+    context "when skip_link! is called" do
+      app do
+        get '/' do
+          %{<html><body><a href="/link1">link1</a> <a href="/link2">link2</a> <a href="/link3">link3</a></body></html>}
+        end
+
+        get '/link1' do
+          %{<html><body>link1</body></html>}
+        end
+
+        get '/link2' do
+          %{<html><body>link2</body></html>}
+        end
+
+        get '/link3' do
+          %{<html><body>link3</body></html>}
         end
       end
 
-      agent.enqueue(url)
+      subject do
+        described_class.new(host: host) do |agent|
+          agent.every_url do |url|
+            if url.path == '/link2'
+              agent.skip_link!
+            end
+          end
+        end
+      end
 
-      expect(agent.queue).to be_empty
+      it "should skip all links on the page" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/link1"),
+          URI("http://#{host}/link3")
+        ]
+      end
     end
 
-    it "should allow skipping of visited pages" do
-      agent = Agent.new do |spider|
-        spider.every_page do |url|
-          spider.skip_page!
+    context "when skip_page! is called" do
+      app do
+        get '/' do
+          %{<html><body><a href="/link">entry link</a></body></html>}
+        end
+
+        get '/link' do
+          %{<html><body><a href="/link1">link1</a> <a href="/link2">link2</a></body></html>}
+        end
+
+        get '/link1' do
+          %{<html><body>should not get here</body></html>}
+        end
+
+        get '/link2' do
+          %{<html><body>should not get here</body></html>}
         end
       end
 
-      agent.visit_page(url)
+      subject do
+        described_class.new(host: host) do |agent|
+          agent.every_page do |page|
+            if page.url.path == '/link'
+              agent.skip_page!
+            end
+          end
+        end
+      end
 
-      expect(agent.history).to eq(Set[url])
-      expect(agent.queue).to be_empty
+      it "should skip all links on the page" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/link")
+        ]
+      end
     end
   end
 end
data/spec/agent/filters_spec.rb
@@ -3,60 +3,77 @@ require 'spidr/agent'
 require 'spec_helper'
 
 describe Agent do
-  describe "filters" do
-    it "should allow setting the acceptable schemes" do
-      agent = Agent.new
+  describe "#initialize_filters" do
+    describe ":schemes" do
+      it "should override the default schemes" do
+        agent = described_class.new(schemes: [:https])
 
-      agent.schemes = [:http]
-      expect(agent.schemes).to eq(['http'])
+        expect(agent.schemes).to be == ['https']
+      end
     end
 
-    it "should provide the hosts that will be visited" do
-      agent = Agent.new(hosts: ['spidr.rubyforge.org'])
+    describe ":hosts" do
+      it "should set the hosts that will be visited" do
+        agent = described_class.new(hosts: ['spidr.rubyforge.org'])
 
-      expect(agent.visit_hosts).to eq(['spidr.rubyforge.org'])
+        expect(agent.visit_hosts).to be == ['spidr.rubyforge.org']
+      end
     end
 
-    it "should provide the hosts that will not be visited" do
-      agent = Agent.new(ignore_hosts: ['example.com'])
+    describe ":ignore_hosts" do
+      it "should set the hosts that will not be visited" do
+        agent = described_class.new(ignore_hosts: ['example.com'])
 
-      expect(agent.ignore_hosts).to eq(['example.com'])
+        expect(agent.ignore_hosts).to be == ['example.com']
+      end
    end
 
-    it "should provide the ports that will be visited" do
-      agent = Agent.new(ports: [80, 443, 8000])
+    describe ":ports" do
+      it "should set the ports that will be visited" do
+        agent = described_class.new(ports: [80, 443, 8000])
 
-      expect(agent.visit_ports).to eq([80, 443, 8000])
+        expect(agent.visit_ports).to be == [80, 443, 8000]
+      end
    end
 
-    it "should provide the ports that will not be visited" do
-      agent = Agent.new(ignore_ports: [8000, 8080])
+    describe ":ignore_ports" do
+      it "should set the ports that will not be visited" do
+        agent = described_class.new(ignore_ports: [8000, 8080])
 
-      expect(agent.ignore_ports).to eq([8000, 8080])
+        expect(agent.ignore_ports).to be == [8000, 8080]
+      end
    end
 
-    it "should provide the links that will be visited" do
-      agent = Agent.new(links: ['index.php'])
+    describe ":links" do
+      it "should set the links that will be visited" do
+        agent = described_class.new(links: ['index.php'])
 
-      expect(agent.visit_links).to eq(['index.php'])
+        expect(agent.visit_links).to be == ['index.php']
+      end
    end
 
-    it "should provide the links that will not be visited" do
-      agent = Agent.new(ignore_links: [/login/])
+    describe ":ignore_links" do
+      it "should set the links that will not be visited" do
+        agent = described_class.new(ignore_links: [/login/])
 
-      expect(agent.ignore_links).to eq([/login/])
+        expect(agent.ignore_links).to be == [/login/]
+      end
    end
 
-    it "should provide the exts that will be visited" do
-      agent = Agent.new(exts: ['htm'])
+    describe ":exts" do
+      it "should set the exts that will be visited" do
+        agent = described_class.new(exts: ['htm'])
 
-      expect(agent.visit_exts).to eq(['htm'])
+        expect(agent.visit_exts).to be == ['htm']
+      end
    end
 
-    it "should provide the exts that will not be visited" do
-      agent = Agent.new(ignore_exts: ['cfm'])
+    describe ":ignore_exts" do
+      it "should set the exts that will not be visited" do
+        agent = described_class.new(ignore_exts: ['cfm'])
 
-      expect(agent.ignore_exts).to eq(['cfm'])
+        expect(agent.ignore_exts).to be == ['cfm']
+      end
    end
   end
 end
data/spec/agent/sanitizers_spec.rb
@@ -4,58 +4,52 @@ require 'spec_helper'
 
 describe Agent do
   describe "sanitizers" do
-    describe "sanitize_url" do
-      let(:url) { 'http://host.com' }
-      before(:all) { @agent = Agent.new }
+    describe "#sanitize_url" do
+      let(:url) { 'http://example.com/page?q=1#fragment' }
+      let(:uri) { URI(url) }
 
-      it "should sanitize URLs" do
-        agent = Agent.new
-        clean_url = agent.sanitize_url(URI(url))
+      it "should sanitize URIs" do
+        clean_url = subject.sanitize_url(uri)
 
-        expect(clean_url.host).to eq('host.com')
+        expect(clean_url.host).to eq('example.com')
       end
 
       it "should sanitize URLs given as Strings" do
-        agent = Agent.new
-        clean_url = agent.sanitize_url(url)
+        clean_url = subject.sanitize_url(url)
 
-        expect(clean_url.host).to eq('host.com')
+        expect(clean_url.host).to eq('example.com')
       end
-    end
-
-    describe "strip_fragments" do
-      let(:url) { URI("http://host.com/page#lol") }
 
       it "should strip fragment components by default" do
-        agent = Agent.new
-        clean_url = agent.sanitize_url(url)
+        clean_url = subject.sanitize_url(url)
 
         expect(clean_url.fragment).to be_nil
       end
 
-      it "should allow perserving fragment components" do
-        agent = Agent.new(strip_fragments: false)
-        clean_url = agent.sanitize_url(url)
+      it "should not strip query components by default" do
+        clean_url = subject.sanitize_url(uri)
 
-        expect(clean_url.fragment).to eq('lol')
+        expect(clean_url.query).to eq('q=1')
      end
-    end
 
-    describe "strip_query" do
-      let(:url) { URI("http://host.com/page?x=1") }
+      context "when strip_fragments is disabled" do
+        subject { described_class.new(strip_fragments: false) }
 
-      it "should not strip query components by default" do
-        agent = Agent.new
-        clean_url = agent.sanitize_url(url)
+        it "should perserve the fragment components" do
+          clean_url = subject.sanitize_url(uri)
 
-        expect(clean_url.query).to eq('x=1')
+          expect(clean_url.fragment).to eq('fragment')
+        end
       end
 
-      it "should allow stripping of query components" do
-        agent = Agent.new(strip_query: true)
-        clean_url = agent.sanitize_url(url)
+      context "when strip_query is enabled" do
+        subject { described_class.new(strip_query: true) }
+
+        it "should allow stripping of query components" do
+          clean_url = subject.sanitize_url(uri)
 
-        expect(clean_url.query).to be_nil
+          expect(clean_url.query).to be_nil
+        end
      end
    end
  end
data/spec/agent_spec.rb
@@ -1,81 +1,803 @@
-require 'spidr/agent'
-
 require 'spec_helper'
-require 'helpers/wsoc'
+require 'example_app'
+require 'settings/user_agent_examples'
+
+require 'spidr/agent'
 
 describe Agent do
-  include Helpers::WSOC
+  it_should_behave_like "includes Spidr::Settings::UserAgent"
+
+  describe "#initialize" do
+    it "should not be running" do
+      expect(subject).to_not be_running
+    end
+
+    it "should default :delay to 0" do
+      expect(subject.delay).to be 0
+    end
+
+    it "should initialize #history" do
+      expect(subject.history).to be_empty
+    end
+
+    it "should initialize #failures" do
+      expect(subject.failures).to be_empty
+    end
+
+    it "should initialize #queue" do
+      expect(subject.queue).to be_empty
+    end
 
-  before(:all) do
-    @agent = run_course
+    it "should initialize the #session_cache" do
+      expect(subject.sessions).to be_kind_of(SessionCache)
+    end
+
+    it "should initialize the #cookie_jar" do
+      expect(subject.cookies).to be_kind_of(CookieJar)
+    end
+
+    it "should initialize the #auth_store" do
+      expect(subject.authorized).to be_kind_of(AuthStore)
+    end
   end
 
-  it "should provide the history" do
-    expect(@agent.history).not_to be_empty
+  describe "#history=" do
+    let(:previous_history) { Set[URI('http://example.com')] }
+
+    before { subject.history = previous_history }
+
+    it "should be able to restore the history" do
+      expect(subject.history).to eq(previous_history)
+    end
+
+    context "when given an Array of URIs" do
+      let(:previous_history) { [URI('http://example.com')] }
+      let(:converted_history) { Set.new(previous_history) }
+
+      it "should convert the Array to a Set" do
+        expect(subject.history).to eq(converted_history)
+      end
+    end
+
+    context "when given an Set of Strings" do
+      let(:previous_history) { Set['http://example.com'] }
+      let(:converted_history) do
+        previous_history.map { |url| URI(url) }.to_set
+      end
+
+      it "should convert the Strings to URIs" do
+        expect(subject.history).to eq(converted_history)
+      end
+    end
   end
 
-  it "should provide the queue" do
-    expect(@agent.queue).to be_empty
+  describe "#failures=" do
+    let(:previous_failures) { Set[URI('http://example.com')] }
+
+    before { subject.failures = previous_failures }
+
+    it "should be able to restore the failures" do
+      expect(subject.failures).to eq(previous_failures)
+    end
+
+    context "when given an Array of URIs" do
+      let(:previous_failures) { [URI('http://example.com')] }
+      let(:converted_failures) { Set.new(previous_failures) }
+
+      it "should convert the Array to a Set" do
+        expect(subject.failures).to eq(converted_failures)
+      end
+    end
+
+    context "when given an Set of Strings" do
+      let(:previous_failures) { Set['http://example.com'] }
+      let(:converted_failures) do
+        previous_failures.map { |url| URI(url) }.to_set
+      end
+
+      it "should convert the Strings to URIs" do
+        expect(subject.failures).to eq(converted_failures)
+      end
+    end
   end
 
-  it "should be able to restore the history" do
-    agent = Agent.new
-    previous_history = Set[URI('http://www.example.com')]
+  describe "#queue=" do
+    let(:previous_queue) { [URI('http://example.com')] }
+
+    before { subject.queue = previous_queue }
+
+    it "should be able to restore the queue" do
+      expect(subject.queue).to eq(previous_queue)
+    end
 
-    agent.history = previous_history
-    expect(agent.history).to eq(previous_history)
+    context "when given an Set of URIs" do
+      let(:previous_queue) { Set[URI('http://example.com')] }
+      let(:converted_queue) { previous_queue.to_a }
+
+      it "should convert the Set to an Array" do
+        expect(subject.queue).to eq(converted_queue)
+      end
+    end
+
+    context "when given an Array of Strings" do
+      let(:previous_queue) { Set['http://example.com'] }
+      let(:converted_queue) { previous_queue.map { |url| URI(url) } }
+
+      it "should convert the Strings to URIs" do
+        expect(subject.queue).to eq(converted_queue)
+      end
+    end
   end
 
-  it "should convert new histories to an Set of URIs" do
-    agent = Agent.new
-    previous_history = ['http://www.example.com']
-    expected_history = Set[URI('http://www.example.com')]
+  describe "#to_hash" do
+    let(:queue) { [URI("http://example.com/link")] }
+    let(:history) { Set[URI("http://example.com/")] }
+
+    subject do
+      described_class.new do |agent|
+        agent.queue = queue
+        agent.history = history
+      end
+    end
 
-    agent.history = previous_history
-    expect(agent.history).not_to eq(previous_history)
-    expect(agent.history).to eq(expected_history)
+    it "should return the queue and history" do
+      expect(subject.to_hash).to be == {
+        history: history,
+        queue: queue
+      }
+    end
   end
 
-  it "should be able to restore the failures" do
-    agent = Agent.new
-    previous_failures = Set[URI('http://localhost/')]
+  context "when spidering" do
+    include_context "example App"
+
+    context "local links" do
+      context "relative paths" do
+        app do
+          get '/' do
+            %{<html><body><a href="link">relative link</a></body></html>}
+          end
+
+          get '/link' do
+            '<html><body>got here</body></html>'
+          end
+        end
+
+        it "should expand relative paths of links" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/link")
+          ]
+        end
+
+        context "that contain directory escapes" do
+          app do
+            get '/' do
+              %{<html><body><a href="foo/./../../../../link">link</a></body></html>}
+            end
+
+            get '/link' do
+              '<html><body>got here</body></html>'
+            end
+          end
+
+          it "should expand relative paths before visiting them" do
+            expect(subject.history).to be == Set[
+              URI("http://#{host}/"),
+              URI("http://#{host}/link")
+            ]
+          end
+        end
+      end
+
+      context "absolute paths" do
+        app do
+          get '/' do
+            %{<html><body><a href="/link">absolute path</a></body></html>}
+          end
+
+          get '/link' do
+            '<html><body>got here</body></html>'
+          end
+        end
+
+        it "should visit links with absolute paths" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/link")
+          ]
+        end
+
+        context "that contain directory escapes" do
+          app do
+            get '/' do
+              %{<html><body><a href="/foo/./../../../../link">link</a></body></html>}
+            end
+
+            get '/link' do
+              '<html><body>got here</body></html>'
+            end
+          end
+
+          it "should expand absolute links before visiting them" do
+            expect(subject.history).to be == Set[
+              URI("http://#{host}/"),
+              URI("http://#{host}/link")
+            ]
+          end
+        end
+
+      end
+    end
+
+    context "remote links" do
+      app do
+        get '/' do
+          %{<html><body><a href="http://#{settings.host}/link">absolute link</a></body></html>}
+        end
+
+        get '/link' do
+          '<html><body>got here</body></html>'
+        end
+      end
+
+      it "should visit absolute links" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/link")
+        ]
+      end
+
+      context "that contain directory escapes" do
+        app do
+          get '/' do
+            %{<html><body><a href="http://#{settings.host}/foo/./../../../../link">link</a></body></html>}
+          end
+
+          get '/link' do
+            '<html><body>got here</body></html>'
+          end
+        end
+
+        it "should expand absolute links before visiting them" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/link")
+          ]
+        end
+      end
+    end
+
+    context "self-referential links" do
+      app do
+        get '/' do
+          %{<html><body><a href="/">same page</a></body></html>}
+        end
+      end
+
+      it "should ignore self-referential links" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/")
+        ]
+      end
+    end
+
+    context "circular links" do
+      app do
+        get '/' do
+          %{<html><body><a href="/link">link</a></body></html>}
+        end
+
+        get '/link' do
+          %{<html><body><a href="/">previous page</a></body></html>}
+        end
+      end
+
+      it "should ignore links that have been previous visited" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/link")
+        ]
+      end
+    end
+
+    context "link cycles" do
+      app do
+        get '/' do
+          %{<html><body><a href="/link1">first link</a></body></html>}
+        end
+
+        get '/link1' do
+          %{<html><body><a href="/link2">next link</a></body></html>}
+        end
+
+        get '/link2' do
+          %{<html><body><a href="/">back to the beginning</a></body></html>}
+        end
+      end
+
+      it "should ignore links that have been previous visited" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/link1"),
+          URI("http://#{host}/link2"),
+        ]
+      end
+    end
+
+    context "fragment links" do
+      app do
+        get '/' do
+          %{<html><body><a href="#fragment">fragment link</a></body></html>}
+        end
+      end
+
+      it "should ignore fragment links" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/")
+        ]
+      end
+    end
+
+    context "empty links" do
+      context "empty href" do
+        app do
+          get '/' do
+            %{<html><body><a href="">empty link</a> <a href=" ">blank link</a> <a>no href</a></body></html>}
+          end
+        end
+
+        it "should ignore links with empty hrefs" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/")
+          ]
+        end
+      end
+
+      context "whitespace href" do
+        app do
+          get '/' do
+            %{<html><body><a href=" ">blank link</a></body></html>}
+          end
+        end
+
+        it "should ignore links containing only whitespace" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/")
+          ]
+        end
+      end
+
+      context "missing href" do
+        app do
+          get '/' do
+            %{<html><body><a>no href</a></body></html>}
+          end
+        end
+
+        it "should ignore links with no href" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/")
+          ]
+        end
+      end
+    end
+
+    context "frames" do
+      app do
+        get '/' do
+          %{<html><body><frameset><frame src="/frame" /></frameset></body></html>}
+        end
 
-    agent.failures = previous_failures
-    expect(agent.failures).to eq(previous_failures)
+        get '/frame' do
+          %{<html><body><a href="/link">link</a></body></html>}
+        end
+
+        get '/link' do
+          %{<html><body>got here</body></html>}
+        end
+      end
+
+      it "should visit the frame and links within the frame" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/frame"),
+          URI("http://#{host}/link")
+        ]
+      end
+    end
+
+    context "iframes" do
+      app do
+        get '/' do
+          %{<html><body><iframe src="/iframe" /></body></html>}
+        end
+
+        get '/iframe' do
+          %{<html><body><a href="/link">link</a></body></html>}
+        end
+
+        get '/link' do
+          %{<html><body>got here</body></html>}
+        end
+      end
+
+      it "should visit the iframe and links within the iframe" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/iframe"),
+          URI("http://#{host}/link")
+        ]
+      end
+    end
+
+    context "javascript links" do
+      app do
+        get '/' do
+          %{<html><body><a href="javascript:fail();">javascript link</a></body></html>}
+        end
+      end
+
+      it "should ignore javascript: links" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/")
+        ]
+      end
+
+      context "when the link has an onclick action" do
+        app do
+          get '/' do
+            %{<html><body><a href="#" onclick="javascript:fail();">onclick link</a></body></html>}
+          end
+        end
+
+        it "should ignore links with onclick actions" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/")
+          ]
+        end
+      end
+    end
+
+    context "cookies" do
+      app do
+        get '/' do
+          response.set_cookie 'visited', 'true'
+
+          %{<html><body><a href="/link">link</a></body></html>}
+        end
+
+        get '/link' do
+          if request.cookies['visited'] == 'true'
+            %{<html><body>got here</body></html>}
+          else
+            halt 401, "Cookie not set"
+          end
+        end
+      end
+
+      it "should record cookies and send them with each request" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/link"),
+        ]
+
+        expect(subject.cookies[host]).to be == {'visited' => 'true'}
+      end
+    end
+
+    context "redirects" do
+      context "300" do
+        app do
+          get '/' do
+            %{<html><body><a href="/redirect">redirect</a></body></html>}
+          end
+
+          get '/redirect' do
+            redirect to('/link'), 300
+          end
+
+          get '/link' do
+            %{<html><body>got here</body></html>}
+          end
+        end
+
+        it "should follow HTTP 300 redirects" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/redirect"),
+            URI("http://#{host}/link"),
+          ]
+        end
+      end
+
+      context "301" do
+        app do
+          get '/' do
+            %{<html><body><a href="/redirect">redirect</a></body></html>}
+          end
+
+          get '/redirect' do
+            redirect to('/link'), 301
+          end
+
+          get '/link' do
+            %{<html><body>got here</body></html>}
+          end
+        end
+
+        it "should follow HTTP 301 redirects" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/redirect"),
+            URI("http://#{host}/link"),
+          ]
+        end
+      end
+
+      context "302" do
+        app do
+          get '/' do
+            %{<html><body><a href="/redirect">redirect</a></body></html>}
+          end
+
+          get '/redirect' do
+            redirect to('/link'), 302
+          end
+
+          get '/link' do
+            %{<html><body>got here</body></html>}
+          end
+        end
+
+        it "should follow HTTP 302 redirects" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/redirect"),
+            URI("http://#{host}/link"),
+          ]
+        end
+      end
+
+      context "303" do
+        app do
+          get '/' do
+            %{<html><body><a href="/redirect">redirect</a></body></html>}
+          end
+
+          get '/redirect' do
+            redirect to('/link'), 303
+          end
+
+          get '/link' do
+            %{<html><body>got here</body></html>}
+          end
+        end
+
+        it "should follow HTTP 303 redirects" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/redirect"),
+            URI("http://#{host}/link"),
+          ]
+        end
+      end
+
+      context "307" do
+        app do
+          get '/' do
+            %{<html><body><a href="/redirect">redirect</a></body></html>}
+          end
+
+          get '/redirect' do
+            redirect to('/link'), 307
+          end
+
+          get '/link' do
+            %{<html><body>got here</body></html>}
+          end
+        end
+
+        it "should follow HTTP 307 redirects" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/redirect"),
+            URI("http://#{host}/link"),
+          ]
+        end
+      end
+
+      context "meta-refresh" do
+        app do
+          get '/' do
+            %{<html><body><a href="/redirect">redirect</a></body></html>}
+          end
+
+          get '/redirect' do
+            %{<html><head><meta http-equiv="refresh" content="0; url=http://#{settings.host}/link" /></head><body>Redirecting...</body></html>}
+          end
+
+          get '/link' do
+            %{<html><body>got here</body></html>}
+          end
+        end
+
+        it "should follow meta-refresh redirects" do
+          expect(subject.history).to be == Set[
+            URI("http://#{host}/"),
+            URI("http://#{host}/redirect"),
+            URI("http://#{host}/link"),
+          ]
+        end
+      end
+    end
+
+    context "Basic-Auth" do
+      app do
+        set :user, 'admin'
+        set :password, 'swordfish'
+
+        get '/' do
+          %{<html><body><a href="/private">private link</a></body></html>}
+        end
+
+        get '/private' do
+          auth = Rack::Auth::Basic::Request.new(request.env)
+
+          if auth.provided? && auth.basic? && auth.credentials && \
+             auth.credentials == [settings.user, settings.password]
+            %{<html><body>got here</body></html>}
+          else
+            headers['WWW-Authenticate'] = %{Basic realm="Restricted Area"}
+            halt 401, "<html><body><h1>Not authorized</h1></body></html>"
+          end
+        end
+      end
+
+      before do
+        subject.authorized.add("http://#{host}/private", app.user, app.password)
+      end
+
+      it "should send HTTP Basic-Auth credentials for protected URLs" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/private")
+        ]
+      end
+    end
   end
 
-  it "should convert new histories to a Set of URIs" do
-    agent = Agent.new
-    previous_failures = ['http://localhost/']
-    expected_failures = Set[URI('http://localhost/')]
+  context "when :host is specified" do
+    include_context "example App"
 
-    agent.failures = previous_failures
-    expect(agent.failures).not_to eq(previous_failures)
-    expect(agent.failures).to eq(expected_failures)
+    subject { described_class.new(host: host) }
+
+    app do
+      get '/' do
+        %{<html><body><a href="http://google.com/">external link</a> <a href="/link">local link</a></body></html>}
+      end
+
+      get '/link' do
+        %{<html><body>got here</body></html>}
+      end
+    end
+
+    it "should only visit links on the host" do
+      expect(subject.history).to be == Set[
+        URI("http://#{host}/"),
+        URI("http://#{host}/link")
+      ]
+    end
   end
 
-  it "should be able to restore the queue" do
-    agent = Agent.new
-    previous_queue = [URI('http://www.example.com')]
+  context "when :limit is set" do
+    include_context "example App"
+
+    let(:limit) { 10 }
 
-    agent.queue = previous_queue
-    expect(agent.queue).to eq(previous_queue)
+    subject { described_class.new(host: host, limit: limit) }
+
+    app do
+      get '/' do
+        i = Integer(params['i'] || 0)
+
+        %{<html><body><a href="/?i=#{i+1}">next link</a></body></html>}
+      end
+    end
+
+    it "must only visit the maximum number of links" do
+      expect(subject.history).to be == Set[
+        URI("http://#{host}/"),
+        URI("http://#{host}/?i=1"),
+        URI("http://#{host}/?i=2"),
+        URI("http://#{host}/?i=3"),
+        URI("http://#{host}/?i=4"),
+        URI("http://#{host}/?i=5"),
+        URI("http://#{host}/?i=6"),
+        URI("http://#{host}/?i=7"),
+        URI("http://#{host}/?i=8"),
+        URI("http://#{host}/?i=9"),
+      ]
+    end
   end
 
-  it "should convert new queues to an Array of URIs" do
-    agent = Agent.new
-    previous_queue = ['http://www.example.com']
-    expected_queue = [URI('http://www.example.com')]
+  context "when :depth is set" do
+    include_context "example App"
+
+    app do
+      get '/' do
+        %{<html><body><a href="/left?d=1">left</a><a href="/right?d=1">right</a></body></html>}
+      end
+
+      get %r{^/left|/right} do
+        d = Integer(params['d'])
+
+        %{<html><body><a href="/left?d=#{d+1}">left</a><a href="/right?d=#{d+1}">right</a></body></html>}
+      end
+    end
+
+    context "depth 0" do
+      subject { described_class.new(host: host, max_depth: 0) }
+
+      it "must only visit the first page" do
+        expect(subject.history).to be == Set[URI("http://#{host}/")]
+      end
+    end
 
-    agent.queue = previous_queue
-    expect(agent.queue).not_to eq(previous_queue)
-    expect(agent.queue).to eq(expected_queue)
+    context "depth > 0" do
+      subject { described_class.new(host: host, max_depth: 2) }
+
+      it "must visit links below the maximum depth" do
+        expect(subject.history).to be == Set[
+          URI("http://#{host}/"),
+          URI("http://#{host}/left?d=1"),
+          URI("http://#{host}/right?d=1"),
+          URI("http://#{host}/left?d=2"),
+          URI("http://#{host}/right?d=2")
+        ]
+      end
+    end
   end
 
-  it "should provide a to_hash method that returns the queue and history" do
-    hash = @agent.to_hash
+  context "when :robots is enabled" do
+    include_context "example App"
+
+    let(:user_agent) { 'Ruby' }
+
+    subject do
+      described_class.new(
+        host: host,
+        user_agent: user_agent,
+        robots: true
+      )
+    end
+
+    app do
+      get '/' do
+        %{<html><body><a href="/secret">don't follow this link</a> <a href="/pub">follow this link</a></body></html>}
+      end
+
+      get '/pub' do
+        %{<html><body>got here</body></html>}
+      end
+
+      get '/robots.txt' do
+        content_type 'text/plain'
+
+        [
+          "User-agent: *",
+          'Disallow: /',
+        ].join($/)
+      end
+    end
+
+    it "should not follow links Disallowed by robots.txt" do
+      pending "https://github.com/bblimke/webmock/issues/642"
 
-    expect(hash[:queue]).to be_empty
-    expect(hash[:history]).not_to be_empty
+      expect(subject.history).to be == Set[
+        URI("http://#{host}/"),
+        URI("http://#{host}/pub")
+      ]
+    end
   end
 end