spidr 0.5.0 → 0.6.0

Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21

data/lib/spidr/version.rb
@@ -1,4 +1,4 @@
  module Spidr
  # Spidr version
- VERSION = '0.5.0'
+ VERSION = '0.6.0'
  end

data/spec/agent/actions_spec.rb
@@ -1,60 +1,186 @@
- require 'spidr/agent'
-
  require 'spec_helper'
+ require 'example_app'
+
+ require 'spidr/agent'

  describe Agent do
- describe "actions" do
- let(:url) { URI('http://spidr.rubyforge.org/') }
+ describe "#continue!" do
+ before { subject.pause = true }
+ before { subject.continue! }
+
+ it "should un-pause the Agent" do
+ expect(subject.paused?).to be false
+ end
+ end
+
+ describe "#pause=" do
+ it "should change the paused state" do
+ subject.pause = true
+
+ expect(subject.paused?).to be true
+ end
+ end
+
+ describe "#pause!" do
+ it "should raise Action::Paused" do
+ expect {
+ subject.pause!
+ }.to raise_error(described_class::Actions::Paused)
+ end
+ end

- it "should be able to pause spidering" do
- count = 0
- agent = Agent.host('spidr.rubyforge.org') do |spider|
- spider.every_page do |page|
- count += 1
- spider.pause! if count >= 2
+ describe "#paused?" do
+ context "when the agent is paused" do
+ before do
+ begin
+ subject.pause!
+ rescue described_class::Actions::Paused
  end
  end

- expect(agent).to be_paused
- expect(agent.history.length).to eq(2)
+ it { expect(subject.paused?).to be true }
+ end
+
+ context "when the agent is not paused" do
+ it { expect(subject.paused?).to be false }
  end
+ end
+
+ describe "#skip_link!" do
+ it "should raise Actions::SkipLink" do
+ expect {
+ subject.skip_link!
+ }.to raise_error(described_class::Actions::SkipLink)
+ end
+ end
+
+ describe "#skip_page!" do
+ it "should raise Actions::SkipPage" do
+ expect {
+ subject.skip_page!
+ }.to raise_error(described_class::Actions::SkipPage)
+ end
+ end
+
+ context "when spidering" do
+ include_context "example App"
+
+ context "when pause! is called" do
+ app do
+ get '/' do
+ %{<html><body><a href="/link">link</a></body></html>}
+ end

- it "should be able to continue spidering after being paused" do
- agent = Agent.new do |spider|
- spider.every_page do |page|
- spider.pause!
+ get '/link' do
+ %{<html><body>should not get here</body></html>}
  end
  end

- agent.enqueue(url)
- agent.continue!
+ subject do
+ described_class.new(host: host) do |agent|
+ agent.every_page do |page|
+ if page.url.path == '/'
+ agent.pause!
+ end
+ end
+ end
+ end
+
+ it "should pause spidering" do
+ expect(subject).to be_paused
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/")
+ ]
+ end

- expect(agent.visited?(url)).to eq(true)
+ context "and continue! is called afterwards" do
+ before do
+ subject.enqueue "http://#{host}/link"
+ subject.continue!
+ end
+
+ it "should continue spidering" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/link")
+ ]
+ end
+ end
  end

- it "should allow skipping of enqueued links" do
- agent = Agent.new do |spider|
- spider.every_url do |url|
- spider.skip_link!
+ context "when skip_link! is called" do
+ app do
+ get '/' do
+ %{<html><body><a href="/link1">link1</a> <a href="/link2">link2</a> <a href="/link3">link3</a></body></html>}
+ end
+
+ get '/link1' do
+ %{<html><body>link1</body></html>}
+ end
+
+ get '/link2' do
+ %{<html><body>link2</body></html>}
+ end
+
+ get '/link3' do
+ %{<html><body>link3</body></html>}
  end
  end

- agent.enqueue(url)
+ subject do
+ described_class.new(host: host) do |agent|
+ agent.every_url do |url|
+ if url.path == '/link2'
+ agent.skip_link!
+ end
+ end
+ end
+ end

- expect(agent.queue).to be_empty
+ it "should skip all links on the page" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/link1"),
+ URI("http://#{host}/link3")
+ ]
+ end
  end

- it "should allow skipping of visited pages" do
- agent = Agent.new do |spider|
- spider.every_page do |url|
- spider.skip_page!
+ context "when skip_page! is called" do
+ app do
+ get '/' do
+ %{<html><body><a href="/link">entry link</a></body></html>}
+ end
+
+ get '/link' do
+ %{<html><body><a href="/link1">link1</a> <a href="/link2">link2</a></body></html>}
+ end
+
+ get '/link1' do
+ %{<html><body>should not get here</body></html>}
+ end
+
+ get '/link2' do
+ %{<html><body>should not get here</body></html>}
  end
  end

- agent.visit_page(url)
+ subject do
+ described_class.new(host: host) do |agent|
+ agent.every_page do |page|
+ if page.url.path == '/link'
+ agent.skip_page!
+ end
+ end
+ end
+ end

- expect(agent.history).to eq(Set[url])
- expect(agent.queue).to be_empty
+ it "should skip all links on the page" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/link")
+ ]
+ end
  end
  end
  end

data/spec/agent/filters_spec.rb
@@ -3,60 +3,77 @@ require 'spidr/agent'
  require 'spec_helper'

  describe Agent do
- describe "filters" do
- it "should allow setting the acceptable schemes" do
- agent = Agent.new
+ describe "#initialize_filters" do
+ describe ":schemes" do
+ it "should override the default schemes" do
+ agent = described_class.new(schemes: [:https])

- agent.schemes = [:http]
- expect(agent.schemes).to eq(['http'])
+ expect(agent.schemes).to be == ['https']
+ end
  end

- it "should provide the hosts that will be visited" do
- agent = Agent.new(hosts: ['spidr.rubyforge.org'])
+ describe ":hosts" do
+ it "should set the hosts that will be visited" do
+ agent = described_class.new(hosts: ['spidr.rubyforge.org'])

- expect(agent.visit_hosts).to eq(['spidr.rubyforge.org'])
+ expect(agent.visit_hosts).to be == ['spidr.rubyforge.org']
+ end
  end

- it "should provide the hosts that will not be visited" do
- agent = Agent.new(ignore_hosts: ['example.com'])
+ describe ":ignore_hosts" do
+ it "should set the hosts that will not be visited" do
+ agent = described_class.new(ignore_hosts: ['example.com'])

- expect(agent.ignore_hosts).to eq(['example.com'])
+ expect(agent.ignore_hosts).to be == ['example.com']
+ end
  end

- it "should provide the ports that will be visited" do
- agent = Agent.new(ports: [80, 443, 8000])
+ describe ":ports" do
+ it "should set the ports that will be visited" do
+ agent = described_class.new(ports: [80, 443, 8000])

- expect(agent.visit_ports).to eq([80, 443, 8000])
+ expect(agent.visit_ports).to be == [80, 443, 8000]
+ end
  end

- it "should provide the ports that will not be visited" do
- agent = Agent.new(ignore_ports: [8000, 8080])
+ describe ":ignore_ports" do
+ it "should set the ports that will not be visited" do
+ agent = described_class.new(ignore_ports: [8000, 8080])

- expect(agent.ignore_ports).to eq([8000, 8080])
+ expect(agent.ignore_ports).to be == [8000, 8080]
+ end
  end

- it "should provide the links that will be visited" do
- agent = Agent.new(links: ['index.php'])
+ describe ":links" do
+ it "should set the links that will be visited" do
+ agent = described_class.new(links: ['index.php'])

- expect(agent.visit_links).to eq(['index.php'])
+ expect(agent.visit_links).to be == ['index.php']
+ end
  end

- it "should provide the links that will not be visited" do
- agent = Agent.new(ignore_links: [/login/])
+ describe ":ignore_links" do
+ it "should set the links that will not be visited" do
+ agent = described_class.new(ignore_links: [/login/])

- expect(agent.ignore_links).to eq([/login/])
+ expect(agent.ignore_links).to be == [/login/]
+ end
  end

- it "should provide the exts that will be visited" do
- agent = Agent.new(exts: ['htm'])
+ describe ":exts" do
+ it "should set the exts that will be visited" do
+ agent = described_class.new(exts: ['htm'])

- expect(agent.visit_exts).to eq(['htm'])
+ expect(agent.visit_exts).to be == ['htm']
+ end
  end

- it "should provide the exts that will not be visited" do
- agent = Agent.new(ignore_exts: ['cfm'])
+ describe ":ignore_exts" do
+ it "should set the exts that will not be visited" do
+ agent = described_class.new(ignore_exts: ['cfm'])

- expect(agent.ignore_exts).to eq(['cfm'])
+ expect(agent.ignore_exts).to be == ['cfm']
+ end
  end
  end
  end

data/spec/agent/sanitizers_spec.rb
@@ -4,58 +4,52 @@ require 'spec_helper'

  describe Agent do
  describe "sanitizers" do
- describe "sanitize_url" do
- let(:url) { 'http://host.com' }
- before(:all) { @agent = Agent.new }
+ describe "#sanitize_url" do
+ let(:url) { 'http://example.com/page?q=1#fragment' }
+ let(:uri) { URI(url) }

- it "should sanitize URLs" do
- agent = Agent.new
- clean_url = agent.sanitize_url(URI(url))
+ it "should sanitize URIs" do
+ clean_url = subject.sanitize_url(uri)

- expect(clean_url.host).to eq('host.com')
+ expect(clean_url.host).to eq('example.com')
  end

  it "should sanitize URLs given as Strings" do
- agent = Agent.new
- clean_url = agent.sanitize_url(url)
+ clean_url = subject.sanitize_url(url)

- expect(clean_url.host).to eq('host.com')
+ expect(clean_url.host).to eq('example.com')
  end
- end
-
- describe "strip_fragments" do
- let(:url) { URI("http://host.com/page#lol") }

  it "should strip fragment components by default" do
- agent = Agent.new
- clean_url = agent.sanitize_url(url)
+ clean_url = subject.sanitize_url(url)

  expect(clean_url.fragment).to be_nil
  end

- it "should allow perserving fragment components" do
- agent = Agent.new(strip_fragments: false)
- clean_url = agent.sanitize_url(url)
+ it "should not strip query components by default" do
+ clean_url = subject.sanitize_url(uri)

- expect(clean_url.fragment).to eq('lol')
+ expect(clean_url.query).to eq('q=1')
  end
- end

- describe "strip_query" do
- let(:url) { URI("http://host.com/page?x=1") }
+ context "when strip_fragments is disabled" do
+ subject { described_class.new(strip_fragments: false) }

- it "should not strip query components by default" do
- agent = Agent.new
- clean_url = agent.sanitize_url(url)
+ it "should perserve the fragment components" do
+ clean_url = subject.sanitize_url(uri)

- expect(clean_url.query).to eq('x=1')
+ expect(clean_url.fragment).to eq('fragment')
+ end
  end

- it "should allow stripping of query components" do
- agent = Agent.new(strip_query: true)
- clean_url = agent.sanitize_url(url)
+ context "when strip_query is enabled" do
+ subject { described_class.new(strip_query: true) }
+
+ it "should allow stripping of query components" do
+ clean_url = subject.sanitize_url(uri)

- expect(clean_url.query).to be_nil
+ expect(clean_url.query).to be_nil
+ end
  end
  end
  end

data/spec/agent_spec.rb
@@ -1,81 +1,803 @@
- require 'spidr/agent'
-
  require 'spec_helper'
- require 'helpers/wsoc'
+ require 'example_app'
+ require 'settings/user_agent_examples'
+
+ require 'spidr/agent'

  describe Agent do
- include Helpers::WSOC
+ it_should_behave_like "includes Spidr::Settings::UserAgent"
+
+ describe "#initialize" do
+ it "should not be running" do
+ expect(subject).to_not be_running
+ end
+
+ it "should default :delay to 0" do
+ expect(subject.delay).to be 0
+ end
+
+ it "should initialize #history" do
+ expect(subject.history).to be_empty
+ end
+
+ it "should initialize #failures" do
+ expect(subject.failures).to be_empty
+ end
+
+ it "should initialize #queue" do
+ expect(subject.queue).to be_empty
+ end

- before(:all) do
- @agent = run_course
+ it "should initialize the #session_cache" do
+ expect(subject.sessions).to be_kind_of(SessionCache)
+ end
+
+ it "should initialize the #cookie_jar" do
+ expect(subject.cookies).to be_kind_of(CookieJar)
+ end
+
+ it "should initialize the #auth_store" do
+ expect(subject.authorized).to be_kind_of(AuthStore)
+ end
  end

- it "should provide the history" do
- expect(@agent.history).not_to be_empty
+ describe "#history=" do
+ let(:previous_history) { Set[URI('http://example.com')] }
+
+ before { subject.history = previous_history }
+
+ it "should be able to restore the history" do
+ expect(subject.history).to eq(previous_history)
+ end
+
+ context "when given an Array of URIs" do
+ let(:previous_history) { [URI('http://example.com')] }
+ let(:converted_history) { Set.new(previous_history) }
+
+ it "should convert the Array to a Set" do
+ expect(subject.history).to eq(converted_history)
+ end
+ end
+
+ context "when given an Set of Strings" do
+ let(:previous_history) { Set['http://example.com'] }
+ let(:converted_history) do
+ previous_history.map { |url| URI(url) }.to_set
+ end
+
+ it "should convert the Strings to URIs" do
+ expect(subject.history).to eq(converted_history)
+ end
+ end
  end

- it "should provide the queue" do
- expect(@agent.queue).to be_empty
+ describe "#failures=" do
+ let(:previous_failures) { Set[URI('http://example.com')] }
+
+ before { subject.failures = previous_failures }
+
+ it "should be able to restore the failures" do
+ expect(subject.failures).to eq(previous_failures)
+ end
+
+ context "when given an Array of URIs" do
+ let(:previous_failures) { [URI('http://example.com')] }
+ let(:converted_failures) { Set.new(previous_failures) }
+
+ it "should convert the Array to a Set" do
+ expect(subject.failures).to eq(converted_failures)
+ end
+ end
+
+ context "when given an Set of Strings" do
+ let(:previous_failures) { Set['http://example.com'] }
+ let(:converted_failures) do
+ previous_failures.map { |url| URI(url) }.to_set
+ end
+
+ it "should convert the Strings to URIs" do
+ expect(subject.failures).to eq(converted_failures)
+ end
+ end
  end

- it "should be able to restore the history" do
- agent = Agent.new
- previous_history = Set[URI('http://www.example.com')]
+ describe "#queue=" do
+ let(:previous_queue) { [URI('http://example.com')] }
+
+ before { subject.queue = previous_queue }
+
+ it "should be able to restore the queue" do
+ expect(subject.queue).to eq(previous_queue)
+ end

- agent.history = previous_history
- expect(agent.history).to eq(previous_history)
+ context "when given an Set of URIs" do
+ let(:previous_queue) { Set[URI('http://example.com')] }
+ let(:converted_queue) { previous_queue.to_a }
+
+ it "should convert the Set to an Array" do
+ expect(subject.queue).to eq(converted_queue)
+ end
+ end
+
+ context "when given an Array of Strings" do
+ let(:previous_queue) { Set['http://example.com'] }
+ let(:converted_queue) { previous_queue.map { |url| URI(url) } }
+
+ it "should convert the Strings to URIs" do
+ expect(subject.queue).to eq(converted_queue)
+ end
+ end
  end

- it "should convert new histories to an Set of URIs" do
- agent = Agent.new
- previous_history = ['http://www.example.com']
- expected_history = Set[URI('http://www.example.com')]
+ describe "#to_hash" do
+ let(:queue) { [URI("http://example.com/link")] }
+ let(:history) { Set[URI("http://example.com/")] }
+
+ subject do
+ described_class.new do |agent|
+ agent.queue = queue
+ agent.history = history
+ end
+ end

- agent.history = previous_history
- expect(agent.history).not_to eq(previous_history)
- expect(agent.history).to eq(expected_history)
+ it "should return the queue and history" do
+ expect(subject.to_hash).to be == {
+ history: history,
+ queue: queue
+ }
+ end
  end

- it "should be able to restore the failures" do
- agent = Agent.new
- previous_failures = Set[URI('http://localhost/')]
+ context "when spidering" do
+ include_context "example App"
+
+ context "local links" do
+ context "relative paths" do
+ app do
+ get '/' do
+ %{<html><body><a href="link">relative link</a></body></html>}
+ end
+
+ get '/link' do
+ '<html><body>got here</body></html>'
+ end
+ end
+
+ it "should expand relative paths of links" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/link")
+ ]
+ end
+
+ context "that contain directory escapes" do
+ app do
+ get '/' do
+ %{<html><body><a href="foo/./../../../../link">link</a></body></html>}
+ end
+
+ get '/link' do
+ '<html><body>got here</body></html>'
+ end
+ end
+
+ it "should expand relative paths before visiting them" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/link")
+ ]
+ end
+ end
+ end
+
+ context "absolute paths" do
+ app do
+ get '/' do
+ %{<html><body><a href="/link">absolute path</a></body></html>}
+ end
+
+ get '/link' do
+ '<html><body>got here</body></html>'
+ end
+ end
+
+ it "should visit links with absolute paths" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/link")
+ ]
+ end
+
+ context "that contain directory escapes" do
+ app do
+ get '/' do
+ %{<html><body><a href="/foo/./../../../../link">link</a></body></html>}
+ end
+
+ get '/link' do
+ '<html><body>got here</body></html>'
+ end
+ end
+
+ it "should expand absolute links before visiting them" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/link")
+ ]
+ end
+ end
+
+ end
+ end
+
+ context "remote links" do
+ app do
+ get '/' do
+ %{<html><body><a href="http://#{settings.host}/link">absolute link</a></body></html>}
+ end
+
+ get '/link' do
+ '<html><body>got here</body></html>'
+ end
+ end
+
+ it "should visit absolute links" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/link")
+ ]
+ end
+
+ context "that contain directory escapes" do
+ app do
+ get '/' do
+ %{<html><body><a href="http://#{settings.host}/foo/./../../../../link">link</a></body></html>}
+ end
+
+ get '/link' do
+ '<html><body>got here</body></html>'
+ end
+ end
+
+ it "should expand absolute links before visiting them" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/link")
+ ]
+ end
+ end
+ end
+
+ context "self-referential links" do
+ app do
+ get '/' do
+ %{<html><body><a href="/">same page</a></body></html>}
+ end
+ end
+
+ it "should ignore self-referential links" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/")
+ ]
+ end
+ end
+
+ context "circular links" do
+ app do
+ get '/' do
+ %{<html><body><a href="/link">link</a></body></html>}
+ end
+
+ get '/link' do
+ %{<html><body><a href="/">previous page</a></body></html>}
+ end
+ end
+
+ it "should ignore links that have been previous visited" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/link")
+ ]
+ end
+ end
+
+ context "link cycles" do
+ app do
+ get '/' do
+ %{<html><body><a href="/link1">first link</a></body></html>}
+ end
+
+ get '/link1' do
+ %{<html><body><a href="/link2">next link</a></body></html>}
+ end
+
+ get '/link2' do
+ %{<html><body><a href="/">back to the beginning</a></body></html>}
+ end
+ end
+
+ it "should ignore links that have been previous visited" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/link1"),
+ URI("http://#{host}/link2"),
+ ]
+ end
+ end
+
+ context "fragment links" do
+ app do
+ get '/' do
+ %{<html><body><a href="#fragment">fragment link</a></body></html>}
+ end
+ end
+
+ it "should ignore fragment links" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/")
+ ]
+ end
+ end
+
+ context "empty links" do
+ context "empty href" do
+ app do
+ get '/' do
+ %{<html><body><a href="">empty link</a> <a href=" ">blank link</a> <a>no href</a></body></html>}
+ end
+ end
+
+ it "should ignore links with empty hrefs" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/")
+ ]
+ end
+ end
+
+ context "whitespace href" do
+ app do
+ get '/' do
+ %{<html><body><a href=" ">blank link</a></body></html>}
+ end
+ end
+
+ it "should ignore links containing only whitespace" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/")
+ ]
+ end
+ end
+
+ context "missing href" do
+ app do
+ get '/' do
+ %{<html><body><a>no href</a></body></html>}
+ end
+ end
+
+ it "should ignore links with no href" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/")
+ ]
+ end
+ end
+ end
+
+ context "frames" do
+ app do
+ get '/' do
+ %{<html><body><frameset><frame src="/frame" /></frameset></body></html>}
+ end

- agent.failures = previous_failures
- expect(agent.failures).to eq(previous_failures)
+ get '/frame' do
+ %{<html><body><a href="/link">link</a></body></html>}
+ end
+
+ get '/link' do
+ %{<html><body>got here</body></html>}
+ end
+ end
+
+ it "should visit the frame and links within the frame" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/frame"),
+ URI("http://#{host}/link")
+ ]
+ end
+ end
+
+ context "iframes" do
+ app do
+ get '/' do
+ %{<html><body><iframe src="/iframe" /></body></html>}
+ end
+
+ get '/iframe' do
+ %{<html><body><a href="/link">link</a></body></html>}
+ end
+
+ get '/link' do
+ %{<html><body>got here</body></html>}
+ end
+ end
+
+ it "should visit the iframe and links within the iframe" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/iframe"),
+ URI("http://#{host}/link")
+ ]
+ end
+ end
+
+ context "javascript links" do
+ app do
+ get '/' do
+ %{<html><body><a href="javascript:fail();">javascript link</a></body></html>}
+ end
+ end
+
+ it "should ignore javascript: links" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/")
+ ]
+ end
+
+ context "when the link has an onclick action" do
+ app do
+ get '/' do
+ %{<html><body><a href="#" onclick="javascript:fail();">onclick link</a></body></html>}
+ end
+ end
+
+ it "should ignore links with onclick actions" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/")
+ ]
+ end
+ end
+ end
+
+ context "cookies" do
+ app do
+ get '/' do
+ response.set_cookie 'visited', 'true'
+
+ %{<html><body><a href="/link">link</a></body></html>}
+ end
+
+ get '/link' do
+ if request.cookies['visited'] == 'true'
+ %{<html><body>got here</body></html>}
+ else
+ halt 401, "Cookie not set"
+ end
+ end
+ end
+
+ it "should record cookies and send them with each request" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/link"),
+ ]
+
+ expect(subject.cookies[host]).to be == {'visited' => 'true'}
+ end
+ end
+
+ context "redirects" do
+ context "300" do
+ app do
+ get '/' do
+ %{<html><body><a href="/redirect">redirect</a></body></html>}
+ end
+
+ get '/redirect' do
+ redirect to('/link'), 300
+ end
+
+ get '/link' do
+ %{<html><body>got here</body></html>}
+ end
+ end
+
+ it "should follow HTTP 300 redirects" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/redirect"),
+ URI("http://#{host}/link"),
+ ]
+ end
+ end
+
+ context "301" do
+ app do
+ get '/' do
+ %{<html><body><a href="/redirect">redirect</a></body></html>}
+ end
+
+ get '/redirect' do
+ redirect to('/link'), 301
+ end
+
+ get '/link' do
+ %{<html><body>got here</body></html>}
+ end
+ end
+
+ it "should follow HTTP 301 redirects" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/redirect"),
+ URI("http://#{host}/link"),
+ ]
+ end
+ end
+
+ context "302" do
+ app do
+ get '/' do
+ %{<html><body><a href="/redirect">redirect</a></body></html>}
+ end
+
+ get '/redirect' do
+ redirect to('/link'), 302
+ end
+
+ get '/link' do
+ %{<html><body>got here</body></html>}
+ end
+ end
+
+ it "should follow HTTP 302 redirects" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/redirect"),
+ URI("http://#{host}/link"),
+ ]
+ end
+ end
+
+ context "303" do
+ app do
+ get '/' do
+ %{<html><body><a href="/redirect">redirect</a></body></html>}
+ end
+
+ get '/redirect' do
+ redirect to('/link'), 303
+ end
+
+ get '/link' do
+ %{<html><body>got here</body></html>}
+ end
+ end
+
+ it "should follow HTTP 303 redirects" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/redirect"),
+ URI("http://#{host}/link"),
+ ]
+ end
+ end
+
+ context "307" do
+ app do
+ get '/' do
+ %{<html><body><a href="/redirect">redirect</a></body></html>}
+ end
+
+ get '/redirect' do
+ redirect to('/link'), 307
+ end
+
+ get '/link' do
+ %{<html><body>got here</body></html>}
+ end
+ end
+
+ it "should follow HTTP 307 redirects" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/redirect"),
+ URI("http://#{host}/link"),
+ ]
+ end
+ end
+
+ context "meta-refresh" do
+ app do
+ get '/' do
+ %{<html><body><a href="/redirect">redirect</a></body></html>}
+ end
+
+ get '/redirect' do
+ %{<html><head><meta http-equiv="refresh" content="0; url=http://#{settings.host}/link" /></head><body>Redirecting...</body></html>}
+ end
+
+ get '/link' do
+ %{<html><body>got here</body></html>}
+ end
+ end
+
+ it "should follow meta-refresh redirects" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/redirect"),
+ URI("http://#{host}/link"),
+ ]
+ end
+ end
+ end
+
+ context "Basic-Auth" do
+ app do
+ set :user, 'admin'
+ set :password, 'swordfish'
+
+ get '/' do
+ %{<html><body><a href="/private">private link</a></body></html>}
+ end
+
+ get '/private' do
+ auth = Rack::Auth::Basic::Request.new(request.env)
+
+ if auth.provided? && auth.basic? && auth.credentials && \
+ auth.credentials == [settings.user, settings.password]
+ %{<html><body>got here</body></html>}
+ else
+ headers['WWW-Authenticate'] = %{Basic realm="Restricted Area"}
+ halt 401, "<html><body><h1>Not authorized</h1></body></html>"
+ end
+ end
+ end
+
+ before do
+ subject.authorized.add("http://#{host}/private", app.user, app.password)
+ end
+
+ it "should send HTTP Basic-Auth credentials for protected URLs" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/private")
+ ]
+ end
+ end
  end

- it "should convert new histories to a Set of URIs" do
- agent = Agent.new
- previous_failures = ['http://localhost/']
- expected_failures = Set[URI('http://localhost/')]
+ context "when :host is specified" do
+ include_context "example App"

- agent.failures = previous_failures
- expect(agent.failures).not_to eq(previous_failures)
- expect(agent.failures).to eq(expected_failures)
+ subject { described_class.new(host: host) }
+
+ app do
+ get '/' do
+ %{<html><body><a href="http://google.com/">external link</a> <a href="/link">local link</a></body></html>}
+ end
+
+ get '/link' do
+ %{<html><body>got here</body></html>}
+ end
+ end
+
+ it "should only visit links on the host" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/link")
+ ]
+ end
  end

- it "should be able to restore the queue" do
- agent = Agent.new
- previous_queue = [URI('http://www.example.com')]
+ context "when :limit is set" do
+ include_context "example App"
+
+ let(:limit) { 10 }

- agent.queue = previous_queue
- expect(agent.queue).to eq(previous_queue)
+ subject { described_class.new(host: host, limit: limit) }
+
+ app do
+ get '/' do
+ i = Integer(params['i'] || 0)
+
+ %{<html><body><a href="/?i=#{i+1}">next link</a></body></html>}
+ end
+ end
+
+ it "must only visit the maximum number of links" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/?i=1"),
+ URI("http://#{host}/?i=2"),
+ URI("http://#{host}/?i=3"),
+ URI("http://#{host}/?i=4"),
+ URI("http://#{host}/?i=5"),
+ URI("http://#{host}/?i=6"),
+ URI("http://#{host}/?i=7"),
+ URI("http://#{host}/?i=8"),
+ URI("http://#{host}/?i=9"),
+ ]
+ end
  end

- it "should convert new queues to an Array of URIs" do
- agent = Agent.new
- previous_queue = ['http://www.example.com']
- expected_queue = [URI('http://www.example.com')]
+ context "when :depth is set" do
+ include_context "example App"
+
+ app do
+ get '/' do
+ %{<html><body><a href="/left?d=1">left</a><a href="/right?d=1">right</a></body></html>}
+ end
+
+ get %r{^/left|/right} do
+ d = Integer(params['d'])
+
+ %{<html><body><a href="/left?d=#{d+1}">left</a><a href="/right?d=#{d+1}">right</a></body></html>}
+ end
+ end
+
+ context "depth 0" do
+ subject { described_class.new(host: host, max_depth: 0) }
+
+ it "must only visit the first page" do
+ expect(subject.history).to be == Set[URI("http://#{host}/")]
+ end
+ end

- agent.queue = previous_queue
- expect(agent.queue).not_to eq(previous_queue)
- expect(agent.queue).to eq(expected_queue)
+ context "depth > 0" do
+ subject { described_class.new(host: host, max_depth: 2) }
+
+ it "must visit links below the maximum depth" do
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/left?d=1"),
+ URI("http://#{host}/right?d=1"),
+ URI("http://#{host}/left?d=2"),
+ URI("http://#{host}/right?d=2")
+ ]
+ end
+ end
  end

- it "should provide a to_hash method that returns the queue and history" do
- hash = @agent.to_hash
+ context "when :robots is enabled" do
+ include_context "example App"
+
+ let(:user_agent) { 'Ruby' }
+
+ subject do
+ described_class.new(
+ host: host,
+ user_agent: user_agent,
+ robots: true
+ )
+ end
+
+ app do
+ get '/' do
+ %{<html><body><a href="/secret">don't follow this link</a> <a href="/pub">follow this link</a></body></html>}
+ end
+
+ get '/pub' do
+ %{<html><body>got here</body></html>}
+ end
+
+ get '/robots.txt' do
+ content_type 'text/plain'
+
+ [
+ "User-agent: *",
+ 'Disallow: /',
+ ].join($/)
+ end
+ end
+
+ it "should not follow links Disallowed by robots.txt" do
+ pending "https://github.com/bblimke/webmock/issues/642"

- expect(hash[:queue]).to be_empty
- expect(hash[:history]).not_to be_empty
+ expect(subject.history).to be == Set[
+ URI("http://#{host}/"),
+ URI("http://#{host}/pub")
+ ]
+ end
  end
  end
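
The new specs above exercise several Agent options that are new or reworked in 0.6.0 (host:, robots:, limit:, max_depth:, schemes:, strip_query:) along with the every_page, pause!, skip_link! and skip_page! actions. The following is only a usage sketch inferred from that spec code, not an excerpt from the gem's documentation; the option names come from the specs above, and start_at is assumed to behave as in earlier spidr releases.

require 'spidr'

# Crawl a single host, honoring robots.txt, visiting at most 100 pages
# and never following links more than two levels deep.
agent = Spidr::Agent.new(
  host:      'example.com',  # same keyword the specs pass to Agent.new
  robots:    true,           # robots.txt support added by lib/spidr/agent/robots.rb
  limit:     100,            # maximum number of pages to visit
  max_depth: 2               # maximum link depth to follow
) do |spider|
  spider.every_page do |page|
    puts page.url

    # skip_page! aborts processing of the current page, as in actions_spec.rb
    spider.skip_page! if page.url.path.start_with?('/private')
  end
end

agent.start_at('http://example.com/')  # assumed entry point from earlier releases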