spidr 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +17 -0
- data/Gemfile +8 -5
- data/LICENSE.txt +1 -1
- data/README.md +137 -78
- data/Rakefile +1 -0
- data/gemspec.yml +8 -1
- data/lib/spidr/agent/actions.rb +1 -1
- data/lib/spidr/agent/events.rb +1 -1
- data/lib/spidr/agent/filters.rb +55 -56
- data/lib/spidr/agent/sanitizers.rb +6 -9
- data/lib/spidr/agent.rb +230 -120
- data/lib/spidr/auth_store.rb +10 -6
- data/lib/spidr/page/content_types.rb +51 -0
- data/lib/spidr/page/html.rb +17 -19
- data/lib/spidr/page/status_codes.rb +12 -10
- data/lib/spidr/proxy.rb +6 -14
- data/lib/spidr/rules.rb +5 -8
- data/lib/spidr/session_cache.rb +23 -21
- data/lib/spidr/settings/proxy.rb +19 -5
- data/lib/spidr/spidr.rb +16 -6
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +357 -10
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- metadata +19 -19
- data/.travis.yml +0 -14
data/spec/agent_spec.rb
CHANGED
@@ -7,6 +7,250 @@ require 'spidr/agent'
|
|
7
7
|
describe Agent do
|
8
8
|
it_should_behave_like "includes Spidr::Settings::UserAgent"
|
9
9
|
|
10
|
+
describe ".start_at" do
|
11
|
+
module TestAgentStartAt
|
12
|
+
class ExampleApp < Sinatra::Base
|
13
|
+
|
14
|
+
set :host, 'example.com'
|
15
|
+
set :port, 80
|
16
|
+
|
17
|
+
get '/' do
|
18
|
+
'<html><body>should not get here</body></html>'
|
19
|
+
end
|
20
|
+
|
21
|
+
get '/entry-point' do
|
22
|
+
<<~HTML
|
23
|
+
<html>
|
24
|
+
<body>
|
25
|
+
<a href="/link1">link1</a>
|
26
|
+
<a href="http://other.com/offsite-link">offsite link</a>
|
27
|
+
<a href="/link2">link2</a>
|
28
|
+
</body>
|
29
|
+
</html>
|
30
|
+
HTML
|
31
|
+
end
|
32
|
+
|
33
|
+
get '/link1' do
|
34
|
+
'<html><body>got here</body></html>'
|
35
|
+
end
|
36
|
+
|
37
|
+
get '/link2' do
|
38
|
+
'<html><body>got here</body></html>'
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
class OtherApp < Sinatra::Base
|
43
|
+
|
44
|
+
set :host, 'other.com'
|
45
|
+
set :port, 80
|
46
|
+
|
47
|
+
get '/offsite-link' do
|
48
|
+
'<html><body>should not get here</body></html>'
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
subject { described_class }
|
55
|
+
|
56
|
+
let(:host) { 'example.com' }
|
57
|
+
let(:other_host) { 'other.com' }
|
58
|
+
let(:url) { URI("http://#{host}/entry-point") }
|
59
|
+
|
60
|
+
let(:app) { TestAgentStartAt::ExampleApp }
|
61
|
+
let(:other_app) { TestAgentStartAt::OtherApp }
|
62
|
+
|
63
|
+
before do
|
64
|
+
stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
|
65
|
+
stub_request(:any, /#{Regexp.escape(other_host)}/).to_rack(other_app)
|
66
|
+
end
|
67
|
+
|
68
|
+
it "must spider the website starting at the given URL" do
|
69
|
+
agent = subject.start_at(url)
|
70
|
+
|
71
|
+
expect(agent.history).to be == Set[
|
72
|
+
URI("http://#{host}/entry-point"),
|
73
|
+
URI("http://#{host}/link1"),
|
74
|
+
URI("http://#{other_host}/offsite-link"),
|
75
|
+
URI("http://#{host}/link2")
|
76
|
+
]
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
describe ".site" do
|
81
|
+
module TestAgentSite
|
82
|
+
class ExampleApp < Sinatra::Base
|
83
|
+
|
84
|
+
set :host, 'example.com'
|
85
|
+
set :port, 80
|
86
|
+
|
87
|
+
get '/' do
|
88
|
+
'<html><body>should not get here</body></html>'
|
89
|
+
end
|
90
|
+
|
91
|
+
get '/entry-point' do
|
92
|
+
<<~HTML
|
93
|
+
<html>
|
94
|
+
<body>
|
95
|
+
<a href="/link1">link1</a>
|
96
|
+
<a href="http://other.com/offsite-link">offsite link</a>
|
97
|
+
<a href="/link2">link2</a>
|
98
|
+
</body>
|
99
|
+
</html>
|
100
|
+
HTML
|
101
|
+
end
|
102
|
+
|
103
|
+
get '/link1' do
|
104
|
+
'<html><body>got here</body></html>'
|
105
|
+
end
|
106
|
+
|
107
|
+
get '/link2' do
|
108
|
+
'<html><body>got here</body></html>'
|
109
|
+
end
|
110
|
+
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
subject { described_class }
|
115
|
+
|
116
|
+
let(:host) { 'example.com' }
|
117
|
+
let(:url) { URI("http://#{host}/entry-point") }
|
118
|
+
|
119
|
+
let(:app) { TestAgentSite::ExampleApp }
|
120
|
+
|
121
|
+
before do
|
122
|
+
stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
|
123
|
+
end
|
124
|
+
|
125
|
+
it "must spider the website starting at the given URL" do
|
126
|
+
agent = subject.site(url)
|
127
|
+
|
128
|
+
expect(agent.history).to be == Set[
|
129
|
+
URI("http://#{host}/entry-point"),
|
130
|
+
URI("http://#{host}/link1"),
|
131
|
+
URI("http://#{host}/link2")
|
132
|
+
]
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
describe ".host" do
|
137
|
+
module TestAgentHost
|
138
|
+
class ExampleApp < Sinatra::Base
|
139
|
+
|
140
|
+
set :host, 'example.com'
|
141
|
+
set :port, 80
|
142
|
+
|
143
|
+
get '/' do
|
144
|
+
<<~HTML
|
145
|
+
<html>
|
146
|
+
<body>
|
147
|
+
<a href="/link1">link1</a>
|
148
|
+
<a href="http://other.com/offsite-link">offsite link</a>
|
149
|
+
<a href="/link2">link2</a>
|
150
|
+
</body>
|
151
|
+
</html>
|
152
|
+
HTML
|
153
|
+
end
|
154
|
+
|
155
|
+
get '/link1' do
|
156
|
+
'<html><body>got here</body></html>'
|
157
|
+
end
|
158
|
+
|
159
|
+
get '/link2' do
|
160
|
+
'<html><body>got here</body></html>'
|
161
|
+
end
|
162
|
+
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
subject { described_class }
|
167
|
+
|
168
|
+
let(:host) { 'example.com' }
|
169
|
+
let(:app) { TestAgentHost::ExampleApp }
|
170
|
+
|
171
|
+
before do
|
172
|
+
stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
|
173
|
+
end
|
174
|
+
|
175
|
+
it "must spider the website starting at the given URL" do
|
176
|
+
agent = subject.host(host)
|
177
|
+
|
178
|
+
# XXX: for some reason Set#== was returning false, so convert to an Array
|
179
|
+
expect(agent.history.to_a).to be == [
|
180
|
+
URI("http://#{host}/"),
|
181
|
+
URI("http://#{host}/link1"),
|
182
|
+
URI("http://#{host}/link2")
|
183
|
+
]
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
187
|
+
describe ".domain" do
|
188
|
+
module TestAgentDomain
|
189
|
+
class ExampleApp < Sinatra::Base
|
190
|
+
|
191
|
+
set :host, 'example.com'
|
192
|
+
set :port, 80
|
193
|
+
|
194
|
+
get '/' do
|
195
|
+
<<~HTML
|
196
|
+
<html>
|
197
|
+
<body>
|
198
|
+
<a href="/link1">link1</a>
|
199
|
+
<a href="http://sub.example.com/subdomain-link">subdomain link</a>
|
200
|
+
<a href="/link2">link2</a>
|
201
|
+
</body>
|
202
|
+
</html>
|
203
|
+
HTML
|
204
|
+
end
|
205
|
+
|
206
|
+
get '/link1' do
|
207
|
+
'<html><body>got here</body></html>'
|
208
|
+
end
|
209
|
+
|
210
|
+
get '/link2' do
|
211
|
+
'<html><body>got here</body></html>'
|
212
|
+
end
|
213
|
+
|
214
|
+
end
|
215
|
+
|
216
|
+
class SubDomainApp < Sinatra::Base
|
217
|
+
|
218
|
+
set :host, 'sub.example.com'
|
219
|
+
set :port, 80
|
220
|
+
|
221
|
+
get '/subdomain-link' do
|
222
|
+
'<html><body>should get here</body></html>'
|
223
|
+
end
|
224
|
+
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
subject { described_class }
|
229
|
+
|
230
|
+
let(:domain) { 'example.com' }
|
231
|
+
let(:domain_app) { TestAgentDomain::ExampleApp }
|
232
|
+
|
233
|
+
let(:subdomain) { 'sub.example.com' }
|
234
|
+
let(:subdomain_app) { TestAgentDomain::SubDomainApp }
|
235
|
+
|
236
|
+
before do
|
237
|
+
stub_request(:any, /#{Regexp.escape(subdomain)}/).to_rack(subdomain_app)
|
238
|
+
stub_request(:any, /#{Regexp.escape(domain)}/).to_rack(domain_app)
|
239
|
+
end
|
240
|
+
|
241
|
+
it "must spider the domain and subdomains starting at the given domain" do
|
242
|
+
agent = subject.domain(domain)
|
243
|
+
|
244
|
+
# XXX: for some reason Set#== was returning false, so convert to an Array
|
245
|
+
expect(agent.history.to_a).to be == [
|
246
|
+
URI("http://#{domain}/"),
|
247
|
+
URI("http://#{domain}/link1"),
|
248
|
+
URI("http://#{subdomain}/subdomain-link"),
|
249
|
+
URI("http://#{domain}/link2")
|
250
|
+
]
|
251
|
+
end
|
252
|
+
end
|
253
|
+
|
10
254
|
describe "#initialize" do
|
11
255
|
it "should not be running" do
|
12
256
|
expect(subject).to_not be_running
|
@@ -32,6 +276,68 @@ describe Agent do
|
|
32
276
|
expect(subject.sessions).to be_kind_of(SessionCache)
|
33
277
|
end
|
34
278
|
|
279
|
+
context "when the proxy: keyword argument is given" do
|
280
|
+
let(:proxy) do
|
281
|
+
Spidr::Proxy.new(host: 'example.com')
|
282
|
+
end
|
283
|
+
|
284
|
+
subject { described_class.new(proxy: proxy) }
|
285
|
+
|
286
|
+
it "must initialize the #proxy of #session_cache" do
|
287
|
+
expect(subject.sessions.proxy).to be(proxy)
|
288
|
+
end
|
289
|
+
end
|
290
|
+
|
291
|
+
context "when the open_timeout: keyword argument is given" do
|
292
|
+
let(:open_timeout) { 5 }
|
293
|
+
|
294
|
+
subject { described_class.new(open_timeout: open_timeout) }
|
295
|
+
|
296
|
+
it "must initialize the #open_timeout of #session_cache" do
|
297
|
+
expect(subject.sessions.open_timeout).to eq(open_timeout)
|
298
|
+
end
|
299
|
+
end
|
300
|
+
|
301
|
+
context "when the ssl_timeout: keyword argument is given" do
|
302
|
+
let(:ssl_timeout) { 5 }
|
303
|
+
|
304
|
+
subject { described_class.new(ssl_timeout: ssl_timeout) }
|
305
|
+
|
306
|
+
it "must initialize the #ssl_timeout of #session_cache" do
|
307
|
+
expect(subject.sessions.ssl_timeout).to eq(ssl_timeout)
|
308
|
+
end
|
309
|
+
end
|
310
|
+
|
311
|
+
context "when the read_timeout: keyword argument is given" do
|
312
|
+
let(:read_timeout) { 5 }
|
313
|
+
|
314
|
+
subject { described_class.new(read_timeout: read_timeout) }
|
315
|
+
|
316
|
+
it "must initialize the #read_timeout of #session_cache" do
|
317
|
+
expect(subject.sessions.read_timeout).to eq(read_timeout)
|
318
|
+
end
|
319
|
+
end
|
320
|
+
|
321
|
+
context "when the continue_timeout: keyword argument is given" do
|
322
|
+
let(:continue_timeout) { 5 }
|
323
|
+
|
324
|
+
subject { described_class.new(continue_timeout: continue_timeout) }
|
325
|
+
|
326
|
+
it "must initialize the #continue_timeout of #session_cache" do
|
327
|
+
expect(subject.sessions.continue_timeout).to eq(continue_timeout)
|
328
|
+
end
|
329
|
+
end
|
330
|
+
|
331
|
+
context "when the keep_alive_timeout: keyword argument is given" do
|
332
|
+
let(:keep_alive_timeout) { 5 }
|
333
|
+
|
334
|
+
subject { described_class.new(keep_alive_timeout: keep_alive_timeout) }
|
335
|
+
|
336
|
+
it "must initialize the #keep_alive_timeout of #session_cache" do
|
337
|
+
expect(subject.sessions.keep_alive_timeout).to eq(keep_alive_timeout)
|
338
|
+
end
|
339
|
+
end
|
340
|
+
|
35
341
|
it "should initialize the #cookie_jar" do
|
36
342
|
expect(subject.cookies).to be_kind_of(CookieJar)
|
37
343
|
end
|
@@ -386,7 +692,15 @@ describe Agent do
|
|
386
692
|
context "frames" do
|
387
693
|
app do
|
388
694
|
get '/' do
|
389
|
-
|
695
|
+
<<~HTML
|
696
|
+
<html>
|
697
|
+
<body>
|
698
|
+
<frameset>
|
699
|
+
<frame src="/frame" />
|
700
|
+
</frameset>
|
701
|
+
</body>
|
702
|
+
</html>
|
703
|
+
HTML
|
390
704
|
end
|
391
705
|
|
392
706
|
get '/frame' do
|
@@ -614,7 +928,14 @@ describe Agent do
|
|
614
928
|
end
|
615
929
|
|
616
930
|
get '/redirect' do
|
617
|
-
|
931
|
+
<<~HTML
|
932
|
+
<html>
|
933
|
+
<head>
|
934
|
+
<meta http-equiv="refresh" content="0; url=http://#{settings.host}/link" />
|
935
|
+
</head>
|
936
|
+
<body>Redirecting...</body>
|
937
|
+
</html>
|
938
|
+
HTML
|
618
939
|
end
|
619
940
|
|
620
941
|
get '/link' do
|
@@ -674,7 +995,14 @@ describe Agent do
|
|
674
995
|
|
675
996
|
app do
|
676
997
|
get '/' do
|
677
|
-
|
998
|
+
<<~HTML
|
999
|
+
<html>
|
1000
|
+
<body>
|
1001
|
+
<a href="http://google.com/">external link</a>
|
1002
|
+
<a href="/link">local link</a>
|
1003
|
+
</body>
|
1004
|
+
</html>
|
1005
|
+
HTML
|
678
1006
|
end
|
679
1007
|
|
680
1008
|
get '/link' do
|
@@ -726,13 +1054,27 @@ describe Agent do
|
|
726
1054
|
|
727
1055
|
app do
|
728
1056
|
get '/' do
|
729
|
-
|
1057
|
+
<<~HTML
|
1058
|
+
<html>
|
1059
|
+
<body>
|
1060
|
+
<a href="/left?d=1">left</a>
|
1061
|
+
<a href="/right?d=1">right</a>
|
1062
|
+
</body>
|
1063
|
+
</html>
|
1064
|
+
HTML
|
730
1065
|
end
|
731
1066
|
|
732
|
-
get %r{
|
1067
|
+
get %r{/left|/right} do
|
733
1068
|
d = Integer(params['d'])
|
734
1069
|
|
735
|
-
|
1070
|
+
<<~HTML
|
1071
|
+
<html>
|
1072
|
+
<body>
|
1073
|
+
<a href="/left?d=#{d+1}">left</a>
|
1074
|
+
<a href="/right?d=#{d+1}">right</a>
|
1075
|
+
</body>
|
1076
|
+
</html>
|
1077
|
+
HTML
|
736
1078
|
end
|
737
1079
|
end
|
738
1080
|
|
@@ -774,7 +1116,14 @@ describe Agent do
|
|
774
1116
|
|
775
1117
|
app do
|
776
1118
|
get '/' do
|
777
|
-
|
1119
|
+
<<~HTML
|
1120
|
+
<html>
|
1121
|
+
<body>
|
1122
|
+
<a href="/secret">don't follow this link</a>
|
1123
|
+
<a href="/pub">follow this link</a>
|
1124
|
+
</body>
|
1125
|
+
</html>
|
1126
|
+
HTML
|
778
1127
|
end
|
779
1128
|
|
780
1129
|
get '/pub' do
|
@@ -786,14 +1135,12 @@ describe Agent do
|
|
786
1135
|
|
787
1136
|
[
|
788
1137
|
"User-agent: *",
|
789
|
-
'Disallow: /',
|
1138
|
+
'Disallow: /secret',
|
790
1139
|
].join($/)
|
791
1140
|
end
|
792
1141
|
end
|
793
1142
|
|
794
1143
|
it "should not follow links Disallowed by robots.txt" do
|
795
|
-
pending "https://github.com/bblimke/webmock/issues/642"
|
796
|
-
|
797
1144
|
expect(subject.history).to be == Set[
|
798
1145
|
URI("http://#{host}/"),
|
799
1146
|
URI("http://#{host}/pub")
|
data/spec/page/content_types_spec.rb
CHANGED
@@ -147,4 +147,26 @@ describe Page do
|
|
147
147
|
describe "#zip?" do
|
148
148
|
include_examples "Content-Type method", :zip?, 'application/zip'
|
149
149
|
end
|
150
|
+
|
151
|
+
describe "#png?" do
|
152
|
+
include_examples "Content-Type method", :png?, 'image/png'
|
153
|
+
end
|
154
|
+
|
155
|
+
describe "#gif?" do
|
156
|
+
include_examples "Content-Type method", :gif?, 'image/gif'
|
157
|
+
end
|
158
|
+
|
159
|
+
describe "#jpeg?" do
|
160
|
+
include_examples "Content-Type method", :jpeg?, 'image/jpeg'
|
161
|
+
end
|
162
|
+
|
163
|
+
describe "#ico?" do
|
164
|
+
context "when 'Content-Type' is 'image/x-icon'" do
|
165
|
+
include_examples "Content-Type method", :ico?, 'image/x-icon'
|
166
|
+
end
|
167
|
+
|
168
|
+
context "when 'Content-Type' is 'image/vnd.microsoft.icon'" do
|
169
|
+
include_examples "Content-Type method", :ico?, 'image/vnd.microsoft.icon'
|
170
|
+
end
|
171
|
+
end
|
150
172
|
end
|