ronin-web-spider 0.1.0.beta1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.github/workflows/ruby.yml +31 -0
- data/.gitignore +13 -0
- data/.rspec +1 -0
- data/.ruby-version +1 -0
- data/.yardopts +1 -0
- data/COPYING.txt +165 -0
- data/ChangeLog.md +19 -0
- data/Gemfile +31 -0
- data/README.md +139 -0
- data/Rakefile +31 -0
- data/gemspec.yml +27 -0
- data/lib/ronin/web/spider/agent.rb +302 -0
- data/lib/ronin/web/spider/archive.rb +116 -0
- data/lib/ronin/web/spider/exceptions.rb +36 -0
- data/lib/ronin/web/spider/git_archive.rb +194 -0
- data/lib/ronin/web/spider/version.rb +27 -0
- data/lib/ronin/web/spider.rb +115 -0
- data/ronin-web-spider.gemspec +61 -0
- data/spec/agent_spec.rb +585 -0
- data/spec/archive_spec.rb +91 -0
- data/spec/example_app.rb +27 -0
- data/spec/git_archive_spec.rb +137 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/spider_spec.rb +252 -0
- metadata +122 -0
data/spec/agent_spec.rb
ADDED
@@ -0,0 +1,585 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'ronin/web/spider/agent'
|
3
|
+
|
4
|
+
require 'webmock/rspec'
|
5
|
+
require 'sinatra/base'
|
6
|
+
|
7
|
+
describe Ronin::Web::Spider::Agent do
|
8
|
+
describe "#initialize" do
  context "when Ronin::Support::Network::HTTP.proxy is set" do
    let(:proxy_host) { 'example.com' }
    let(:proxy_port) { 8080 }
    let(:proxy_uri) { URI::HTTP.build(host: proxy_host, port: proxy_port) }

    # Assign the library-wide proxy before the (lazily built) subject is
    # instantiated inside the example.
    before { Ronin::Support::Network::HTTP.proxy = proxy_uri }

    it "must parse Ronin::Support::Network::HTTP.proxy and set #proxy" do
      expect(subject.proxy).to be_kind_of(Spidr::Proxy)
      expect(subject.proxy.host).to eq(proxy_host)
      expect(subject.proxy.port).to eq(proxy_port)
    end

    # Reset the global setting so it does not leak into later examples.
    after { Ronin::Support::Network::HTTP.proxy = nil }
  end

  context "when Ronin::Support::Network::HTTP.user_agent is set" do
    let(:user_agent) { 'Foo Bar' }

    before { Ronin::Support::Network::HTTP.user_agent = user_agent }

    it "must default #user_agent to Ronin::Support::Network::HTTP.user_agent" do
      expect(subject.user_agent).to eq(user_agent)
    end

    # Reset the global setting so it does not leak into later examples.
    after { Ronin::Support::Network::HTTP.user_agent = nil }
  end

  # Each nested context passes the same host/port in a different
  # representation and checks the resulting #proxy object.
  context "when given the proxy: keyword argument" do
    let(:proxy_host) { 'example.com' }
    let(:proxy_port) { 8080 }

    context "and it's an Addressable::URI" do
      let(:proxy) { Addressable::URI.new(host: proxy_host, port: proxy_port) }

      subject { described_class.new(proxy: proxy) }

      it "must convert it to a Spidr::Proxy object" do
        expect(subject.proxy).to be_kind_of(Spidr::Proxy)
        expect(subject.proxy.host).to eq(proxy_host)
        expect(subject.proxy.port).to eq(proxy_port)
      end
    end

    context "and it's an URI::HTTP" do
      let(:proxy) { URI::HTTP.build(host: proxy_host, port: proxy_port) }

      subject { described_class.new(proxy: proxy) }

      it "must convert it to a Spidr::Proxy object" do
        expect(subject.proxy).to be_kind_of(Spidr::Proxy)
        expect(subject.proxy.host).to eq(proxy_host)
        expect(subject.proxy.port).to eq(proxy_port)
      end
    end

    context "and it's a Hash" do
      let(:proxy) do
        {host: proxy_host, port: proxy_port}
      end

      subject { described_class.new(proxy: proxy) }

      it "must convert it to a Spidr::Proxy object" do
        expect(subject.proxy).to be_kind_of(Spidr::Proxy)
        expect(subject.proxy.host).to eq(proxy_host)
        expect(subject.proxy.port).to eq(proxy_port)
      end
    end

    context "and it's a String" do
      let(:proxy) { "http://#{proxy_host}:#{proxy_port}" }

      subject { described_class.new(proxy: proxy) }

      it "must convert it to a Spidr::Proxy object" do
        expect(subject.proxy).to be_kind_of(Spidr::Proxy)
        expect(subject.proxy.host).to eq(proxy_host)
        expect(subject.proxy.port).to eq(proxy_port)
      end
    end
  end

  context "when given the user_agent: keyword argument" do
    context "and it's a String" do
      let(:user_agent) { "test user-agent" }

      subject { described_class.new(user_agent: user_agent) }

      it "must set the #user_agent" do
        expect(subject.user_agent).to eq(user_agent)
      end
    end

    context "and it's a Symbol" do
      let(:user_agent) { :chrome_linux }
      # The String that the Symbol is looked up as in the UserAgents table.
      let(:expected_user_agent) do
        Ronin::Support::Network::HTTP::UserAgents[user_agent]
      end

      subject { described_class.new(user_agent: user_agent) }

      it "must map the Symbol to one of Ronin::Support::Network::HTTP::UserAgents" do
        expect(subject.user_agent).to eq(expected_user_agent)
      end
    end
  end

  it "must default #visited_hosts to nil" do
    expect(subject.visited_hosts).to be(nil)
  end
end
|
121
|
+
|
122
|
+
describe "#every_host" do
  # Two stub Sinatra applications: Host1 is the start page and links to
  # Host2 through an absolute off-site URL.
  module TestAgentEveryHost
    class Host1 < Sinatra::Base

      set :host, 'host1.example.com'
      set :port, 80

      get '/' do
        <<~HTML
          <html>
          <body>
          <a href="/link1">link1</a>
          <a href="http://host2.example.com/offsite-link">offsite link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      get '/link1' do
        '<html><body>got here</body></html>'
      end

      get '/link2' do
        '<html><body>got here</body></html>'
      end
    end

    class Host2 < Sinatra::Base

      set :host, 'host2.example.com'
      set :port, 80

      get '/offsite-link' do
        '<html><body>should not get here</body></html>'
      end

    end
  end

  let(:host1) { 'host1.example.com' }
  let(:host2) { 'host2.example.com' }

  let(:host1_app) { TestAgentEveryHost::Host1 }
  let(:host2_app) { TestAgentEveryHost::Host2 }

  # Route every HTTP request for either host name to its stub app.
  before do
    stub_request(:any, /#{Regexp.escape(host1)}/).to_rack(host1_app)
    stub_request(:any, /#{Regexp.escape(host2)}/).to_rack(host2_app)
  end

  it "must yield every newly discovered hostname while spidering" do
    yielded_hosts = []

    subject.every_host do |host|
      yielded_hosts << host
    end

    subject.start_at("http://#{host1}/")

    expect(yielded_hosts).to eq([host1, host2])
  end

  it "must populate #visited_hosts" do
    # The callback body is irrelevant here; only registering it matters.
    subject.every_host { |host| }
    subject.start_at("http://#{host1}/")

    expect(subject.visited_hosts).to be_kind_of(Set)
    expect(subject.visited_hosts.entries).to eq([host1, host2])
  end
end
|
193
|
+
|
194
|
+
# TODO: need to figure out how to test #every_cert using webmock.
|
195
|
+
describe "#every_cert"
|
196
|
+
|
197
|
+
describe "#every_favicon" do
  # Stub application for this group. It lives in its own namespace so it
  # does not reopen the module that holds the #every_host fixtures.
  module TestAgentEveryFavicon
    class TestApp < Sinatra::Base

      set :host, 'example.com'
      set :port, 80

      get '/' do
        <<~HTML
          <html>
          <head>
          <link rel="favicon" href="/favicon1.ico" type="image/x-icon"/>
          </head>
          <body>
          <a href="/link1">link1</a>
          <a href="http://host2.example.com/offsite-link">offsite link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      get '/favicon1.ico' do
        content_type 'image/x-icon'

        "favicon1"
      end

      # Served with the other icon MIME type checked by the example below.
      get '/favicon2.ico' do
        content_type 'image/vnd.microsoft.icon'

        "favicon2"
      end

      get '/link1' do
        '<html><body>got here</body></html>'
      end

      get '/link2' do
        <<~HTML
          <html>
          <head>
          <link rel="favicon" href="/favicon2.ico" type="image/x-icon"/>
          </head>
          <body>got here</body>
          </html>
        HTML
      end
    end
  end

  let(:host) { 'example.com' }

  let(:test_app) { TestAgentEveryFavicon::TestApp }

  before do
    stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
  end

  it "must yield Spidr::Page objects for each encountered .ico file" do
    yielded_favicons = []

    subject.every_favicon do |favicon|
      yielded_favicons << favicon
    end

    subject.start_at("http://#{host}/")

    expect(yielded_favicons).to_not be_empty

    expect(yielded_favicons[0]).to be_kind_of(Spidr::Page)
    expect(yielded_favicons[0].content_type).to eq('image/x-icon')
    expect(yielded_favicons[0].url).to eq(URI("http://#{host}/favicon1.ico"))

    expect(yielded_favicons[1]).to be_kind_of(Spidr::Page)
    expect(yielded_favicons[1].content_type).to eq('image/vnd.microsoft.icon')
    expect(yielded_favicons[1].url).to eq(URI("http://#{host}/favicon2.ico"))
  end
end
|
276
|
+
|
277
|
+
describe "#every_html_comment" do
  # Stub application whose only page contains two HTML comments with text
  # and one comment that holds nothing but whitespace.
  module TestAgentEveryHTMLComment
    class TestApp < Sinatra::Base
      set :host, 'example.com'
      set :port, 80

      get '/' do
        <<~HTML
          <html>
          <head>
          <!-- comment 1 -->
          </head>
          <!-- -->
          <body>
          <!-- comment 2 -->
          </body>
          </html>
        HTML
      end
    end
  end

  let(:host) { 'example.com' }
  let(:rack_app) { TestAgentEveryHTMLComment::TestApp }

  before do
    host_pattern = /#{Regexp.escape(host)}/

    stub_request(:any, host_pattern).to_rack(rack_app)
  end

  it "must yield every non-empty/non-whitespace HTML comment String" do
    seen_comments = []

    subject.every_html_comment { |text| seen_comments << text }
    subject.start_at("http://#{host}/")

    # Order is not asserted, only the set of yielded comments.
    expect(seen_comments).to contain_exactly('comment 1', 'comment 2')
  end
end
|
325
|
+
|
326
|
+
describe "#every_javascript" do
  # Stub application: the index page references one external .js file and
  # embeds one inline <script>; the external file is served separately.
  module TestAgentEveryJavaScript
    class TestApp < Sinatra::Base
      set :host, 'example.com'
      set :port, 80

      get '/' do
        <<~HTML
          <html>
          <head>
          <script type="text/javascript" src="/javascript1.js"></script>
          <script type="text/javascript">javascript2</script>
          </head>
          <body>
          <a href="/link1">link1</a>
          <a href="http://host2.example.com/offsite-link">offsite link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      get '/javascript1.js' do
        content_type 'text/javascript'
        "javascript1"
      end
    end
  end

  let(:host) { 'example.com' }
  let(:rack_app) { TestAgentEveryJavaScript::TestApp }

  before do
    host_pattern = /#{Regexp.escape(host)}/

    stub_request(:any, host_pattern).to_rack(rack_app)
  end

  it "must yield both the contents of .js files and inline <script> tags" do
    seen_sources = []

    subject.every_javascript { |source| seen_sources << source }
    subject.start_at("http://#{host}/")

    # Order is not asserted, only the set of yielded sources.
    expect(seen_sources).to contain_exactly('javascript1', 'javascript2')
  end
end
|
376
|
+
|
377
|
+
describe "#every_javascript_string" do
  # Stub application: string literals #1 and #2 live in the external .js
  # file, #3 and #4 in the inline <script> of the index page.
  module TestAgentEveryJavaScriptString
    class TestApp < Sinatra::Base
      set :host, 'example.com'
      set :port, 80

      get '/' do
        <<~HTML
          <html>
          <head>
          <script type="text/javascript" src="/javascript1.js"></script>
          <script type="text/javascript">
          var str3 = "string #3";
          var str4 = 'string #4';
          </script>
          </head>
          <body>
          <a href="/link1">link1</a>
          <a href="http://host2.example.com/offsite-link">offsite link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      get '/javascript1.js' do
        content_type 'text/javascript'
        <<~JS
          var str1 = "string #1";
          var str2 = 'string #2';
        JS
      end
    end
  end

  let(:host) { 'example.com' }
  let(:rack_app) { TestAgentEveryJavaScriptString::TestApp }

  before do
    host_pattern = /#{Regexp.escape(host)}/

    stub_request(:any, host_pattern).to_rack(rack_app)
  end

  it "must yield every JavaScript string from any <script> tag" do
    seen_strings = []

    subject.every_javascript_string { |literal| seen_strings << literal }
    subject.start_at("http://#{host}/")

    # Order is not asserted, only the set of yielded strings.
    expect(seen_strings).to contain_exactly(
      'string #1',
      'string #2',
      'string #3',
      'string #4'
    )
  end
end
|
440
|
+
|
441
|
+
describe "#every_javascript_comment" do
  # Stub application: comments 1 and 2 live in the external .js file,
  # comments 3 and 4 in the inline <script> of the index page.
  module TestAgentEveryJavaScriptComment
    class TestApp < Sinatra::Base

      set :host, 'example.com'
      set :port, 80

      get '/' do
        # NOTE: the two lines inside the /* ... */ comment are deliberately
        # indented one column further than the rest of the heredoc, so the
        # served comment text is "/*\n comment 4\n */" -- the exact String
        # the example below expects.
        <<~HTML
          <html>
          <head>
          <script type="text/javascript" src="/javascript1.js"></script>
          <script type="text/javascript">
          // comment 3
          var str3 = "string #3";
          /*
           comment 4
           */
          var str4 = 'string #4';
          </script>
          </head>
          <body>
          <a href="/link1">link1</a>
          <a href="http://host2.example.com/offsite-link">offsite link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      get '/javascript1.js' do
        content_type 'text/javascript'
        <<~JS
          // comment 1
          var str1 = "string #1";
          /* comment 2 */
          var str2 = 'string #2';
        JS
      end
    end
  end

  let(:host) { 'example.com' }

  let(:test_app) { TestAgentEveryJavaScriptComment::TestApp }

  before do
    stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
  end

  it "must yield every JavaScript comment from any <script> tag" do
    yielded_javascript_comments = []

    subject.every_javascript_comment do |comment|
      yielded_javascript_comments << comment
    end

    subject.start_at("http://#{host}/")

    expect(yielded_javascript_comments).to match_array(
      [
        "// comment 1\n",
        "/* comment 2 */",
        "// comment 3\n",
        "/*\n comment 4\n */"
      ]
    )
  end
end
|
510
|
+
|
511
|
+
describe "#every_comment" do
  # Stub application mixing HTML comments with JavaScript comments, both
  # inline and in an external .js file.
  module TestAgentEveryComment
    class TestApp < Sinatra::Base

      set :host, 'example.com'
      set :port, 80

      get '/' do
        # NOTE: the two lines inside the /* ... */ comment are deliberately
        # indented one column further than the rest of the heredoc, so the
        # served comment text is "/*\n JavaScript comment 4\n */" -- the
        # exact String the example below expects.
        <<~HTML
          <html>
          <head>
          <!-- HTML comment 1 -->
          <script type="text/javascript" src="/javascript1.js"></script>
          <script type="text/javascript">
          // JavaScript comment 3
          var str3 = "string #3";
          /*
           JavaScript comment 4
           */
          var str4 = 'string #4';
          </script>
          </head>
          <!-- -->
          <body>
          <!-- HTML comment 2 -->
          <a href="/link1">link1</a>
          <a href="http://host2.example.com/offsite-link">offsite link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      get '/javascript1.js' do
        content_type 'text/javascript'
        <<~JS
          // JavaScript comment 1
          var str1 = "string #1";
          /* JavaScript comment 2 */
          var str2 = 'string #2';
        JS
      end
    end
  end

  let(:host) { 'example.com' }

  let(:test_app) { TestAgentEveryComment::TestApp }

  before do
    stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
  end

  it "must yield every HTML and JavaScript comment from any <script> tag" do
    yielded_comments = []

    subject.every_comment do |comment|
      yielded_comments << comment
    end

    subject.start_at("http://#{host}/")

    # The whitespace-only "<!-- -->" comment is not part of the expected set.
    expect(yielded_comments).to match_array(
      [
        "HTML comment 1",
        "// JavaScript comment 1\n",
        "/* JavaScript comment 2 */",
        "// JavaScript comment 3\n",
        "/*\n JavaScript comment 4\n */",
        "HTML comment 2"
      ]
    )
  end
end
|
585
|
+
end
|
data/spec/archive_spec.rb
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'ronin/web/spider/archive'
|
3
|
+
|
4
|
+
require 'tmpdir'
|
5
|
+
|
6
|
+
describe Ronin::Web::Spider::Archive do
  # Scratch directory, created lazily once per example.
  let(:tmpdir) { Dir.mktmpdir('ronin-web-spider') }
  # Archive root; nested contexts override this with a path inside tmpdir.
  let(:root) { tmpdir }

  subject { described_class.new(root) }

  # Remove the scratch directory (and anything written beneath it) so the
  # examples do not leave temporary directories behind.
  after { FileUtils.remove_entry(tmpdir) }

  describe "#initialize" do
    it "must set #root" do
      expect(subject.root).to eq(root)
    end
  end

  describe ".open" do
    subject { described_class.open(root) }

    it "must return a new #{described_class}" do
      expect(subject).to be_kind_of(described_class)
    end

    context "when given a block" do
      it "must yield the new #{described_class}" do
        expect { |b|
          described_class.open(root,&b)
        }.to yield_with_args(described_class)
      end
    end

    context "when the root directory does not exist" do
      let(:root) { File.join(super(),'does-not-exist-yet') }

      it "must create the given root directory" do
        described_class.open(root)

        expect(File.directory?(root)).to be(true)
      end
    end

    context "when the root directory does exist" do
      # Despite its name, this directory is created by the before hook
      # below, so it already exists when .open is called.
      let(:root) { File.join(super(),'does-not-exist-yet') }

      before { FileUtils.mkdir(root) }

      it "must not raise an error" do
        expect {
          described_class.open(root)
        }.to_not raise_error
      end
    end
  end

  describe "#write" do
    let(:url) { URI('https://example.com/foo/bar.html') }
    let(:body) { 'test file' }

    # Every example in this group inspects the result of a single write.
    before { subject.write(url,body) }

    it "must automatically create parent directory" do
      expect(File.directory?(File.join(root,'foo'))).to be(true)
    end

    it "must write the body into the file" do
      expect(File.read(File.join(root,'foo','bar.html'))).to eq(body)
    end

    context "when the URL has a query string" do
      let(:url) { URI('https://example.com/foo/bar.php?q=1') }

      it "must include the query string as part of the file name" do
        expect(File.read(File.join(root,'foo','bar.php?q=1'))).to eq(body)
      end
    end

    context "when the URL path ends with a '/'" do
      let(:url) { URI('https://example.com/foo/bar/') }

      it "must write the body to an index.html file within the URL's path" do
        expect(File.read(File.join(root,'foo','bar','index.html'))).to eq(body)
      end
    end
  end

  describe "#to_s" do
    it "must return the root directory" do
      expect(subject.to_s).to eq(root)
    end
  end
end
|
data/spec/example_app.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'sinatra/base'
|
3
|
+
require 'webmock/rspec'
|
4
|
+
|
5
|
+
require 'ronin/web/spider/agent'
|
6
|
+
|
7
|
+
RSpec.shared_context "example App" do
  let(:host) { 'example.com' }

  subject { Ronin::Web::Spider::Agent.new(host: host) }

  # Group-level macro: builds an anonymous Sinatra application from the
  # given block, exposes it as `app`, and before each example stubs all
  # requests for `host` to it and starts the spider at the site root.
  #
  # @yield
  #   The body of the Sinatra application (routes, settings, ...),
  #   evaluated with class_eval on the new application class.
  def self.app(&routes)
    let(:app) do
      Class.new(Sinatra::Base).tap do |sinatra_app|
        sinatra_app.set :host, host
        sinatra_app.set :port, 80
        sinatra_app.class_eval(&routes)
      end
    end

    before do
      host_pattern = /#{Regexp.escape(host)}/

      stub_request(:any, host_pattern).to_rack(app)

      subject.start_at("http://#{host}/")
    end
  end
end
|