ronin-web-spider 0.1.0.beta1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,585 @@
1
+ require 'spec_helper'
2
+ require 'ronin/web/spider/agent'
3
+
4
+ require 'webmock/rspec'
5
+ require 'sinatra/base'
6
+
7
+ describe Ronin::Web::Spider::Agent do
8
# Specs for Agent#initialize: defaults taken from the settings on
# Ronin::Support::Network::HTTP, and coercion of the proxy: and user_agent:
# keyword arguments.
describe "#initialize" do
  context "when Ronin::Support::Network::HTTP.proxy is set" do
    let(:proxy_host) { 'example.com' }
    let(:proxy_port) { 8080 }
    let(:proxy_uri)  { URI::HTTP.build(host: proxy_host, port: proxy_port) }

    # The setting lives on the HTTP module itself, so it is cleared again
    # after each example to avoid leaking into other specs.
    before { Ronin::Support::Network::HTTP.proxy = proxy_uri }
    after  { Ronin::Support::Network::HTTP.proxy = nil }

    it "must default #proxy to Ronin::Support::Network::HTTP.proxy" do
      expect(subject.proxy).to be_kind_of(Spidr::Proxy)
      expect(subject.proxy.host).to eq(proxy_host)
      expect(subject.proxy.port).to eq(proxy_port)
    end
  end

  context "when Ronin::Support::Network::HTTP.user_agent is set" do
    let(:user_agent) { 'Foo Bar' }

    before { Ronin::Support::Network::HTTP.user_agent = user_agent }
    after  { Ronin::Support::Network::HTTP.user_agent = nil }

    it "must default #user_agent to Ronin::Support::Network::HTTP.user_agent" do
      expect(subject.user_agent).to eq(user_agent)
    end
  end

  context "when given the proxy: keyword argument" do
    let(:proxy_host) { 'example.com' }
    let(:proxy_port) { 8080 }

    # Every accepted proxy representation must end up as the same
    # Spidr::Proxy; the including context only has to define `proxy`.
    shared_examples "a proxy: value converted to Spidr::Proxy" do
      subject { described_class.new(proxy: proxy) }

      it "must convert it to a Spidr::Proxy object" do
        expect(subject.proxy).to be_kind_of(Spidr::Proxy)
        expect(subject.proxy.host).to eq(proxy_host)
        expect(subject.proxy.port).to eq(proxy_port)
      end
    end

    context "and it's an Addressable::URI" do
      let(:proxy) { Addressable::URI.new(host: proxy_host, port: proxy_port) }

      include_examples "a proxy: value converted to Spidr::Proxy"
    end

    context "and it's an URI::HTTP" do
      let(:proxy) { URI::HTTP.build(host: proxy_host, port: proxy_port) }

      include_examples "a proxy: value converted to Spidr::Proxy"
    end

    context "and it's a Hash" do
      let(:proxy) do
        {host: proxy_host, port: proxy_port}
      end

      include_examples "a proxy: value converted to Spidr::Proxy"
    end

    context "and it's a String" do
      let(:proxy) { "http://#{proxy_host}:#{proxy_port}" }

      include_examples "a proxy: value converted to Spidr::Proxy"
    end
  end

  context "when given the user_agent: keyword argument" do
    context "and it's a String" do
      let(:user_agent) { "test user-agent" }

      subject { described_class.new(user_agent: user_agent) }

      it "must set the #user_agent" do
        expect(subject.user_agent).to eq(user_agent)
      end
    end

    context "and it's a Symbol" do
      let(:user_agent) { :chrome_linux }
      let(:expected_user_agent) do
        Ronin::Support::Network::HTTP::UserAgents[user_agent]
      end

      subject { described_class.new(user_agent: user_agent) }

      it "must map the Symbol to one of Ronin::Support::Network::HTTP::UserAgents" do
        expect(subject.user_agent).to eq(expected_user_agent)
      end
    end
  end

  it "must default #visited_hosts to nil" do
    expect(subject.visited_hosts).to be(nil)
  end
end
121
+
122
# Specs for Agent#every_host, driven by two stubbed Sinatra apps so that no
# real network traffic takes place.
describe "#every_host" do
  # Fixture apps: Host1's index page links to two local pages and to one
  # page served by Host2.
  module TestAgentEveryHost
    class Host1 < Sinatra::Base

      set :host, 'host1.example.com'
      set :port, 80

      get '/' do
        <<~HTML
          <html>
          <body>
          <a href="/link1">link1</a>
          <a href="http://host2.example.com/offsite-link">offsite link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      get '/link1' do
        '<html><body>got here</body></html>'
      end

      get '/link2' do
        '<html><body>got here</body></html>'
      end
    end

    class Host2 < Sinatra::Base

      set :host, 'host2.example.com'
      set :port, 80

      get '/offsite-link' do
        '<html><body>should not get here</body></html>'
      end

    end
  end

  let(:host1) { 'host1.example.com' }
  let(:host2) { 'host2.example.com' }

  let(:host1_app) { TestAgentEveryHost::Host1 }
  let(:host2_app) { TestAgentEveryHost::Host2 }

  # Route any request whose URL mentions a host name to the matching app.
  before do
    stub_request(:any, /#{Regexp.escape(host1)}/).to_rack(host1_app)
    stub_request(:any, /#{Regexp.escape(host2)}/).to_rack(host2_app)
  end

  it "must yield every newly discovered hostname while spidering" do
    yielded_hosts = []

    subject.every_host do |host|
      yielded_hosts << host
    end

    subject.start_at("http://#{host1}/")

    expect(yielded_hosts).to eq([host1, host2])
  end

  it "must populate #visited_hosts" do
    # The callback body is irrelevant here; this example only inspects
    # #visited_hosts after an #every_host callback has been registered.
    subject.every_host { |_host| }
    subject.start_at("http://#{host1}/")

    expect(subject.visited_hosts).to be_kind_of(Set)
    expect(subject.visited_hosts.entries).to eq([host1, host2])
  end
end
193
+
194
# TODO: need to figure out how to test #every_cert using webmock.
#
# Until then a block-less example is declared, which RSpec reports as pending,
# so the missing coverage stays visible in every run instead of hiding in an
# empty example group.
describe "#every_cert" do
  it "needs a way to exercise #every_cert through webmock"
end
196
+
197
# Specs for Agent#every_favicon, using a stubbed Sinatra app whose pages
# reference two different .ico files.
describe "#every_favicon" do
  # Own namespace for this group's fixture app, so that it does not reopen
  # the TestAgentEveryHost module used by the #every_host specs.
  module TestAgentEveryFavicon
    class TestApp < Sinatra::Base

      set :host, 'example.com'
      set :port, 80

      get '/' do
        <<~HTML
          <html>
          <head>
          <link rel="favicon" href="/favicon1.ico" type="image/x-icon"/>
          </head>
          <body>
          <a href="/link1">link1</a>
          <a href="http://host2.example.com/offsite-link">offsite link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      get '/favicon1.ico' do
        content_type 'image/x-icon'

        "favicon1"
      end

      # Served with the alternative icon MIME type; the example below
      # asserts this Content-Type, not the type= attribute of the <link>.
      get '/favicon2.ico' do
        content_type 'image/vnd.microsoft.icon'

        "favicon2"
      end

      get '/link1' do
        '<html><body>got here</body></html>'
      end

      get '/link2' do
        <<~HTML
          <html>
          <head>
          <link rel="favicon" href="/favicon2.ico" type="image/x-icon"/>
          </head>
          <body>got here</body>
          </html>
        HTML
      end
    end
  end

  let(:host) { 'example.com' }

  let(:test_app) { TestAgentEveryFavicon::TestApp }

  before do
    stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
  end

  it "must yield Spidr::Page objects for each encountered .ico file" do
    yielded_favicons = []

    subject.every_favicon do |favicon|
      yielded_favicons << favicon
    end

    subject.start_at("http://#{host}/")

    expect(yielded_favicons).to_not be_empty

    first_favicon, second_favicon = yielded_favicons

    expect(first_favicon).to be_kind_of(Spidr::Page)
    expect(first_favicon.content_type).to eq('image/x-icon')
    expect(first_favicon.url).to eq(URI("http://#{host}/favicon1.ico"))

    expect(second_favicon).to be_kind_of(Spidr::Page)
    expect(second_favicon.content_type).to eq('image/vnd.microsoft.icon')
    expect(second_favicon.url).to eq(URI("http://#{host}/favicon2.ico"))
  end
end
276
+
277
# Specs for Agent#every_html_comment, run against a stubbed single-page app.
describe "#every_html_comment" do
  # Fixture page with two textual comments and one whitespace-only comment.
  module TestAgentEveryHTMLComment
    class TestApp < Sinatra::Base

      set :host, 'example.com'
      set :port, 80

      get '/' do
        <<~HTML
          <html>
          <head>
          <!-- comment 1 -->
          </head>
          <!-- -->
          <body>
          <!-- comment 2 -->
          </body>
          </html>
        HTML
      end
    end
  end

  let(:host)     { 'example.com' }
  let(:test_app) { TestAgentEveryHTMLComment::TestApp }

  before do
    host_pattern = %r{#{Regexp.escape(host)}}

    stub_request(:any, host_pattern).to_rack(test_app)
  end

  it "must yield every non-empty/non-whitespace HTML comment String" do
    html_comments = []

    subject.every_html_comment { |comment| html_comments << comment }
    subject.start_at("http://#{host}/")

    # Order-independent comparison, equivalent to match_array([...]).
    expect(html_comments).to contain_exactly('comment 1', 'comment 2')
  end
end
325
+
326
# Specs for Agent#every_javascript, run against a stubbed app that serves one
# external script and one inline <script> tag.
describe "#every_javascript" do
  module TestAgentEveryJavaScript
    class TestApp < Sinatra::Base

      set :host, 'example.com'
      set :port, 80

      get '/' do
        <<~HTML
          <html>
          <head>
          <script type="text/javascript" src="/javascript1.js"></script>
          <script type="text/javascript">javascript2</script>
          </head>
          <body>
          <a href="/link1">link1</a>
          <a href="http://host2.example.com/offsite-link">offsite link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      get '/javascript1.js' do
        content_type 'text/javascript'
        "javascript1"
      end
    end
  end

  let(:host)     { 'example.com' }
  let(:test_app) { TestAgentEveryJavaScript::TestApp }

  before do
    host_pattern = %r{#{Regexp.escape(host)}}

    stub_request(:any, host_pattern).to_rack(test_app)
  end

  it "must yield both the contents of .js files and inline <script> tags" do
    sources = []

    subject.every_javascript { |js| sources << js }
    subject.start_at("http://#{host}/")

    # Order-independent comparison, equivalent to match_array([...]).
    expect(sources).to contain_exactly('javascript1', 'javascript2')
  end
end
376
+
377
# Specs for Agent#every_javascript_string, run against a stubbed app whose
# external and inline scripts each contain one double-quoted and one
# single-quoted string literal.
describe "#every_javascript_string" do
  module TestAgentEveryJavaScriptString
    class TestApp < Sinatra::Base

      set :host, 'example.com'
      set :port, 80

      get '/' do
        <<~HTML
          <html>
          <head>
          <script type="text/javascript" src="/javascript1.js"></script>
          <script type="text/javascript">
          var str3 = "string #3";
          var str4 = 'string #4';
          </script>
          </head>
          <body>
          <a href="/link1">link1</a>
          <a href="http://host2.example.com/offsite-link">offsite link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      get '/javascript1.js' do
        content_type 'text/javascript'
        <<~JS
          var str1 = "string #1";
          var str2 = 'string #2';
        JS
      end
    end
  end

  let(:host)     { 'example.com' }
  let(:test_app) { TestAgentEveryJavaScriptString::TestApp }

  before do
    host_pattern = %r{#{Regexp.escape(host)}}

    stub_request(:any, host_pattern).to_rack(test_app)
  end

  it "must yield every JavaScript string from any <script> tag" do
    js_strings = []

    subject.every_javascript_string { |string| js_strings << string }
    subject.start_at("http://#{host}/")

    # Order-independent comparison, equivalent to match_array([...]).
    expect(js_strings).to contain_exactly(
      'string #1',
      'string #2',
      'string #3',
      'string #4'
    )
  end
end
440
+
441
# Specs for Agent#every_javascript_comment, run against a stubbed app whose
# external and inline scripts each contain one `//` and one `/* */` comment.
describe "#every_javascript_comment" do
  module TestAgentEveryJavaScriptComment
    class TestApp < Sinatra::Base

      set :host, 'example.com'
      set :port, 80

      get '/' do
        # NOTE: whitespace inside the multi-line comment is significant. The
        # expectation below compares against "/*\n comment 4\n */", so the
        # two continuation lines carry exactly one leading space relative to
        # the rest of the (deliberately flat) document.
        <<~HTML
          <html>
          <head>
          <script type="text/javascript" src="/javascript1.js"></script>
          <script type="text/javascript">
          // comment 3
          var str3 = "string #3";
          /*
           comment 4
           */
          var str4 = 'string #4';
          </script>
          </head>
          <body>
          <a href="/link1">link1</a>
          <a href="http://host2.example.com/offsite-link">offsite link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      get '/javascript1.js' do
        content_type 'text/javascript'
        <<~JS
          // comment 1
          var str1 = "string #1";
          /* comment 2 */
          var str2 = 'string #2';
        JS
      end
    end
  end

  let(:host) { 'example.com' }

  let(:test_app) { TestAgentEveryJavaScriptComment::TestApp }

  before do
    stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
  end

  it "must yield every JavaScript comment from any <script> tag" do
    yielded_javascript_comments = []

    subject.every_javascript_comment do |comment|
      yielded_javascript_comments << comment
    end

    subject.start_at("http://#{host}/")

    # `//` comments are expected with their trailing newline; `/* */`
    # comments are expected verbatim, including interior whitespace.
    expect(yielded_javascript_comments).to match_array(
      [
        "// comment 1\n",
        "/* comment 2 */",
        "// comment 3\n",
        "/*\n comment 4\n */"
      ]
    )
  end
end
510
+
511
# Specs for Agent#every_comment, run against a stubbed app that mixes HTML
# comments with `//` and `/* */` JavaScript comments.
describe "#every_comment" do
  module TestAgentEveryComment
    class TestApp < Sinatra::Base

      set :host, 'example.com'
      set :port, 80

      get '/' do
        # NOTE: whitespace inside the multi-line JavaScript comment is
        # significant. The expectation below compares against
        # "/*\n JavaScript comment 4\n */", so the two continuation lines
        # carry exactly one leading space relative to the rest of the
        # (deliberately flat) document.
        <<~HTML
          <html>
          <head>
          <!-- HTML comment 1 -->
          <script type="text/javascript" src="/javascript1.js"></script>
          <script type="text/javascript">
          // JavaScript comment 3
          var str3 = "string #3";
          /*
           JavaScript comment 4
           */
          var str4 = 'string #4';
          </script>
          </head>
          <!-- -->
          <body>
          <!-- HTML comment 2 -->
          <a href="/link1">link1</a>
          <a href="http://host2.example.com/offsite-link">offsite link</a>
          <a href="/link2">link2</a>
          </body>
          </html>
        HTML
      end

      get '/javascript1.js' do
        content_type 'text/javascript'
        <<~JS
          // JavaScript comment 1
          var str1 = "string #1";
          /* JavaScript comment 2 */
          var str2 = 'string #2';
        JS
      end
    end
  end

  let(:host) { 'example.com' }

  let(:test_app) { TestAgentEveryComment::TestApp }

  before do
    stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
  end

  it "must yield every HTML and JavaScript comment from any <script> tag" do
    yielded_comments = []

    subject.every_comment do |comment|
      yielded_comments << comment
    end

    subject.start_at("http://#{host}/")

    # The whitespace-only `<!-- -->` comment in the fixture is not expected
    # to be yielded.
    expect(yielded_comments).to match_array(
      [
        "HTML comment 1",
        "// JavaScript comment 1\n",
        "/* JavaScript comment 2 */",
        "// JavaScript comment 3\n",
        "/*\n JavaScript comment 4\n */",
        "HTML comment 2"
      ]
    )
  end
end
585
+ end
@@ -0,0 +1,91 @@
1
+ require 'spec_helper'
2
+ require 'ronin/web/spider/archive'
3
+
4
+ require 'tmpdir'
5
+
6
# Specs for Ronin::Web::Spider::Archive. Every example works inside its own
# temporary directory, which is removed again once the example finishes.
describe Ronin::Web::Spider::Archive do
  # Scratch directory created on first use by Dir.mktmpdir.
  let(:tmpdir) { Dir.mktmpdir('ronin-web-spider') }
  # Archive root under test; nested contexts override it with sub-paths.
  let(:root)   { tmpdir }

  subject { described_class.new(root) }

  # Without a block Dir.mktmpdir leaves cleanup to the caller, so delete the
  # scratch directory (and anything the archive wrote into it) explicitly.
  after { FileUtils.rm_rf(tmpdir) }

  describe "#initialize" do
    it "must set #root" do
      expect(subject.root).to eq(root)
    end
  end

  describe ".open" do
    subject { described_class.open(root) }

    it "must return a new #{described_class}" do
      expect(subject).to be_kind_of(described_class)
    end

    context "when given a block" do
      it "must yield the new #{described_class}" do
        expect { |b|
          described_class.open(root,&b)
        }.to yield_with_args(described_class)
      end
    end

    context "when the root directory does not exist" do
      let(:root) { File.join(tmpdir,'does-not-exist-yet') }

      it "must create the given root directory" do
        described_class.open(root)

        expect(File.directory?(root)).to be(true)
      end
    end

    context "when the root directory does exist" do
      let(:root) { File.join(tmpdir,'already-exists') }

      before { FileUtils.mkdir(root) }

      it "must not raise an error" do
        expect {
          described_class.open(root)
        }.to_not raise_error
      end
    end
  end

  describe "#write" do
    let(:url)  { URI('https://example.com/foo/bar.html') }
    let(:body) { 'test file' }

    # Nested contexts override `url`; the write itself happens before each
    # example.
    before { subject.write(url,body) }

    it "must automatically create parent directory" do
      expect(File.directory?(File.join(root,'foo'))).to be(true)
    end

    it "must write the body into the file" do
      expect(File.read(File.join(root,'foo','bar.html'))).to eq(body)
    end

    context "when the URL has a query string" do
      let(:url) { URI('https://example.com/foo/bar.php?q=1') }

      it "must include the query string as part of the file name" do
        expect(File.read(File.join(root,'foo','bar.php?q=1'))).to eq(body)
      end
    end

    context "when the URL path ends with a '/'" do
      let(:url) { URI('https://example.com/foo/bar/') }

      it "must write the body to an index.html file within the URL's path" do
        expect(File.read(File.join(root,'foo','bar','index.html'))).to eq(body)
      end
    end
  end

  describe "#to_s" do
    it "must return the root directory" do
      expect(subject.to_s).to eq(root)
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'rspec'
2
+ require 'sinatra/base'
3
+ require 'webmock/rspec'
4
+
5
+ require 'ronin/web/spider/agent'
6
+
7
# Shared context that gives an example group an Agent restricted to `host`
# plus an `app` class-level macro for declaring the Sinatra app it spiders.
RSpec.shared_context "example App" do
  let(:host) { 'example.com' }

  subject { Ronin::Web::Spider::Agent.new(host: host) }

  # Defines `app` as an anonymous Sinatra::Base subclass configured for
  # `host` on port 80 whose routes come from the given block, then registers
  # a `before` hook that routes matching requests to it and starts spidering
  # at the host's root URL.
  def self.app(&block)
    let(:app) do
      Class.new(Sinatra::Base).tap do |sinatra_app|
        sinatra_app.set :host, host
        sinatra_app.set :port, 80
        sinatra_app.class_eval(&block)
      end
    end

    before do
      host_pattern = %r{#{Regexp.escape(host)}}

      stub_request(:any, host_pattern).to_rack(app)
      subject.start_at("http://#{host}/")
    end
  end
end