ronin-web-spider 0.1.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,585 @@
1
+ require 'spec_helper'
2
+ require 'ronin/web/spider/agent'
3
+
4
+ require 'webmock/rspec'
5
+ require 'sinatra/base'
6
+
7
+ describe Ronin::Web::Spider::Agent do
8
+ describe "#initialize" do
9
+ context "when Ronin::Support::Network::HTTP.proxy is set" do
10
+ let(:proxy_host) { 'example.com' }
11
+ let(:proxy_port) { 8080 }
12
+ let(:proxy_uri) { URI::HTTP.build(host: proxy_host, port: proxy_port) }
13
+
14
+ before { Ronin::Support::Network::HTTP.proxy = proxy_uri }
15
+
16
+ it "must convert Ronin::Support::Network::HTTP.proxy into a Spidr::Proxy and set #proxy" do
17
+ expect(subject.proxy).to be_kind_of(Spidr::Proxy)
18
+ expect(subject.proxy.host).to eq(proxy_host)
19
+ expect(subject.proxy.port).to eq(proxy_port)
20
+ end
21
+
22
+ after { Ronin::Support::Network::HTTP.proxy = nil }
23
+ end
24
+
25
+ context "when Ronin::Support::Network::HTTP.user_agent is set" do
26
+ let(:user_agent) { 'Foo Bar' }
27
+
28
+ before { Ronin::Support::Network::HTTP.user_agent = user_agent }
29
+
30
+ it "must default #user_agent to Ronin::Support::Network::HTTP.user_agent" do
31
+ expect(subject.user_agent).to eq(user_agent)
32
+ end
33
+
34
+ after { Ronin::Support::Network::HTTP.user_agent = nil }
35
+ end
36
+
37
+ context "when given the proxy: keyword argument" do
38
+ let(:proxy_host) { 'example.com' }
39
+ let(:proxy_port) { 8080 }
40
+
41
+ context "and it's an Addressable::URI" do
42
+ let(:proxy) { Addressable::URI.new(host: proxy_host, port: proxy_port) }
43
+
44
+ subject { described_class.new(proxy: proxy) }
45
+
46
+ it "must convert it to a Spidr::Proxy object" do
47
+ expect(subject.proxy).to be_kind_of(Spidr::Proxy)
48
+ expect(subject.proxy.host).to eq(proxy_host)
49
+ expect(subject.proxy.port).to eq(proxy_port)
50
+ end
51
+ end
52
+
53
+ context "and it's a URI::HTTP" do
54
+ let(:proxy) { URI::HTTP.build(host: proxy_host, port: proxy_port) }
55
+
56
+ subject { described_class.new(proxy: proxy) }
57
+
58
+ it "must convert it to a Spidr::Proxy object" do
59
+ expect(subject.proxy).to be_kind_of(Spidr::Proxy)
60
+ expect(subject.proxy.host).to eq(proxy_host)
61
+ expect(subject.proxy.port).to eq(proxy_port)
62
+ end
63
+ end
64
+
65
+ context "and it's a Hash" do
66
+ let(:proxy) do
67
+ {host: proxy_host, port: proxy_port}
68
+ end
69
+
70
+ subject { described_class.new(proxy: proxy) }
71
+
72
+ it "must convert it to a Spidr::Proxy object" do
73
+ expect(subject.proxy).to be_kind_of(Spidr::Proxy)
74
+ expect(subject.proxy.host).to eq(proxy_host)
75
+ expect(subject.proxy.port).to eq(proxy_port)
76
+ end
77
+ end
78
+
79
+ context "and it's a String" do
80
+ let(:proxy) { "http://#{proxy_host}:#{proxy_port}" }
81
+
82
+ subject { described_class.new(proxy: proxy) }
83
+
84
+ it "must convert it to a Spidr::Proxy object" do
85
+ expect(subject.proxy).to be_kind_of(Spidr::Proxy)
86
+ expect(subject.proxy.host).to eq(proxy_host)
87
+ expect(subject.proxy.port).to eq(proxy_port)
88
+ end
89
+ end
90
+ end
91
+
92
+ context "when given the user_agent: keyword argument" do
93
+ context "and it's a String" do
94
+ let(:user_agent) { "test user-agent" }
95
+
96
+ subject { described_class.new(user_agent: user_agent) }
97
+
98
+ it "must set the #user_agent" do
99
+ expect(subject.user_agent).to eq(user_agent)
100
+ end
101
+ end
102
+
103
+ context "and it's a Symbol" do
104
+ let(:user_agent) { :chrome_linux }
105
+ let(:expected_user_agent) do
106
+ Ronin::Support::Network::HTTP::UserAgents[user_agent]
107
+ end
108
+
109
+ subject { described_class.new(user_agent: user_agent) }
110
+
111
+ it "must map the Symbol to one of Ronin::Support::Network::HTTP::UserAgents" do
112
+ expect(subject.user_agent).to eq(expected_user_agent)
113
+ end
114
+ end
115
+ end
116
+
117
+ it "must default #visited_hosts to nil" do
118
+ expect(subject.visited_hosts).to be(nil)
119
+ end
120
+ end
121
+
122
+ describe "#every_host" do
123
+ module TestAgentEveryHost
124
+ class Host1 < Sinatra::Base
125
+
126
+ set :host, 'host1.example.com'
127
+ set :port, 80
128
+
129
+ get '/' do
130
+ <<~HTML
131
+ <html>
132
+ <body>
133
+ <a href="/link1">link1</a>
134
+ <a href="http://host2.example.com/offsite-link">offsite link</a>
135
+ <a href="/link2">link2</a>
136
+ </body>
137
+ </html>
138
+ HTML
139
+ end
140
+
141
+ get '/link1' do
142
+ '<html><body>got here</body></html>'
143
+ end
144
+
145
+ get '/link2' do
146
+ '<html><body>got here</body></html>'
147
+ end
148
+ end
149
+
150
+ class Host2 < Sinatra::Base
151
+
152
+ set :host, 'host2.example.com'
153
+ set :port, 80
154
+
155
+ get '/offsite-link' do
156
+ '<html><body>should not get here</body></html>'
157
+ end
158
+
159
+ end
160
+ end
161
+
162
+ let(:host1) { 'host1.example.com' }
163
+ let(:host2) { 'host2.example.com' }
164
+
165
+ let(:host1_app) { TestAgentEveryHost::Host1 }
166
+ let(:host2_app) { TestAgentEveryHost::Host2 }
167
+
168
+ before do
169
+ stub_request(:any, /#{Regexp.escape(host1)}/).to_rack(host1_app)
170
+ stub_request(:any, /#{Regexp.escape(host2)}/).to_rack(host2_app)
171
+ end
172
+
173
+ it "must yield every newly discovered hostname while spidering" do
174
+ yielded_hosts = []
175
+
176
+ subject.every_host do |host|
177
+ yielded_hosts << host
178
+ end
179
+
180
+ subject.start_at("http://#{host1}/")
181
+
182
+ expect(yielded_hosts).to eq([host1, host2])
183
+ end
184
+
185
+ it "must populate #visited_hosts" do
186
+ subject.every_host { |host| }
187
+ subject.start_at("http://#{host1}/")
188
+
189
+ expect(subject.visited_hosts).to be_kind_of(Set)
190
+ expect(subject.visited_hosts.entries).to eq([host1, host2])
191
+ end
192
+ end
193
+
194
+ # TODO: need to figure out how to test #every_cert using webmock.
195
+ describe "#every_cert"
196
+
197
+ describe "#every_favicon" do
198
+ module TestAgentEveryHost
199
+ class TestApp < Sinatra::Base
200
+
201
+ set :host, 'example.com'
202
+ set :port, 80
203
+
204
+ get '/' do
205
+ <<~HTML
206
+ <html>
207
+ <head>
208
+ <link rel="favicon" href="/favicon1.ico" type="image/x-icon"/>
209
+ </head>
210
+ <body>
211
+ <a href="/link1">link1</a>
212
+ <a href="http://host2.example.com/offsite-link">offsite link</a>
213
+ <a href="/link2">link2</a>
214
+ </body>
215
+ </html>
216
+ HTML
217
+ end
218
+
219
+ get '/favicon1.ico' do
220
+ content_type 'image/x-icon'
221
+
222
+ "favicon1"
223
+ end
224
+
225
+ get '/favicon2.ico' do
226
+ content_type 'image/vnd.microsoft.icon'
227
+
228
+ "favicon2"
229
+ end
230
+
231
+ get '/link1' do
232
+ '<html><body>got here</body></html>'
233
+ end
234
+
235
+ get '/link2' do
236
+ <<~HTML
237
+ <html>
238
+ <head>
239
+ <link rel="favicon" href="/favicon2.ico" type="image/x-icon"/>
240
+ </head>
241
+ <body>got here</body>
242
+ </html>
243
+ HTML
244
+ end
245
+ end
246
+ end
247
+
248
+ let(:host) { 'example.com' }
249
+
250
+ let(:test_app) { TestAgentEveryHost::TestApp }
251
+
252
+ before do
253
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
254
+ end
255
+
256
+ it "must yield Spidr::Page objects for each encountered .ico file" do
257
+ yielded_favicons = []
258
+
259
+ subject.every_favicon do |favicon|
260
+ yielded_favicons << favicon
261
+ end
262
+
263
+ subject.start_at("http://#{host}/")
264
+
265
+ expect(yielded_favicons).to_not be_empty
266
+
267
+ expect(yielded_favicons[0]).to be_kind_of(Spidr::Page)
268
+ expect(yielded_favicons[0].content_type).to eq('image/x-icon')
269
+ expect(yielded_favicons[0].url).to eq(URI("http://#{host}/favicon1.ico"))
270
+
271
+ expect(yielded_favicons[1]).to be_kind_of(Spidr::Page)
272
+ expect(yielded_favicons[1].content_type).to eq('image/vnd.microsoft.icon')
273
+ expect(yielded_favicons[1].url).to eq(URI("http://#{host}/favicon2.ico"))
274
+ end
275
+ end
276
+
277
+ describe "#every_html_comment" do
278
+ module TestAgentEveryHTMLComment
279
+ class TestApp < Sinatra::Base
280
+
281
+ set :host, 'example.com'
282
+ set :port, 80
283
+
284
+ get '/' do
285
+ <<~HTML
286
+ <html>
287
+ <head>
288
+ <!-- comment 1 -->
289
+ </head>
290
+ <!-- -->
291
+ <body>
292
+ <!-- comment 2 -->
293
+ </body>
294
+ </html>
295
+ HTML
296
+ end
297
+ end
298
+ end
299
+
300
+ let(:host) { 'example.com' }
301
+
302
+ let(:test_app) { TestAgentEveryHTMLComment::TestApp }
303
+
304
+ before do
305
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
306
+ end
307
+
308
+ it "must yield every non-empty/non-whitespace HTML comment String" do
309
+ yielded_comments = []
310
+
311
+ subject.every_html_comment do |comment|
312
+ yielded_comments << comment
313
+ end
314
+
315
+ subject.start_at("http://#{host}/")
316
+
317
+ expect(yielded_comments).to match_array(
318
+ [
319
+ 'comment 1',
320
+ 'comment 2'
321
+ ]
322
+ )
323
+ end
324
+ end
325
+
326
+ describe "#every_javascript" do
327
+ module TestAgentEveryJavaScript
328
+ class TestApp < Sinatra::Base
329
+
330
+ set :host, 'example.com'
331
+ set :port, 80
332
+
333
+ get '/' do
334
+ <<~HTML
335
+ <html>
336
+ <head>
337
+ <script type="text/javascript" src="/javascript1.js"></script>
338
+ <script type="text/javascript">javascript2</script>
339
+ </head>
340
+ <body>
341
+ <a href="/link1">link1</a>
342
+ <a href="http://host2.example.com/offsite-link">offsite link</a>
343
+ <a href="/link2">link2</a>
344
+ </body>
345
+ </html>
346
+ HTML
347
+ end
348
+
349
+ get '/javascript1.js' do
350
+ content_type 'text/javascript'
351
+ "javascript1"
352
+ end
353
+ end
354
+ end
355
+
356
+ let(:host) { 'example.com' }
357
+
358
+ let(:test_app) { TestAgentEveryJavaScript::TestApp }
359
+
360
+ before do
361
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
362
+ end
363
+
364
+ it "must yield both the contents of .js files and inline <script> tags" do
365
+ yielded_javascripts = []
366
+
367
+ subject.every_javascript do |js|
368
+ yielded_javascripts << js
369
+ end
370
+
371
+ subject.start_at("http://#{host}/")
372
+
373
+ expect(yielded_javascripts).to match_array(%w[javascript1 javascript2])
374
+ end
375
+ end
376
+
377
+ describe "#every_javascript_string" do
378
+ module TestAgentEveryJavaScriptString
379
+ class TestApp < Sinatra::Base
380
+
381
+ set :host, 'example.com'
382
+ set :port, 80
383
+
384
+ get '/' do
385
+ <<~HTML
386
+ <html>
387
+ <head>
388
+ <script type="text/javascript" src="/javascript1.js"></script>
389
+ <script type="text/javascript">
390
+ var str3 = "string #3";
391
+ var str4 = 'string #4';
392
+ </script>
393
+ </head>
394
+ <body>
395
+ <a href="/link1">link1</a>
396
+ <a href="http://host2.example.com/offsite-link">offsite link</a>
397
+ <a href="/link2">link2</a>
398
+ </body>
399
+ </html>
400
+ HTML
401
+ end
402
+
403
+ get '/javascript1.js' do
404
+ content_type 'text/javascript'
405
+ <<~JS
406
+ var str1 = "string #1";
407
+ var str2 = 'string #2';
408
+ JS
409
+ end
410
+ end
411
+ end
412
+
413
+ let(:host) { 'example.com' }
414
+
415
+ let(:test_app) { TestAgentEveryJavaScriptString::TestApp }
416
+
417
+ before do
418
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
419
+ end
420
+
421
+ it "must yield every JavaScript string from any <script> tag" do
422
+ yielded_javascript_strings = []
423
+
424
+ subject.every_javascript_string do |string|
425
+ yielded_javascript_strings << string
426
+ end
427
+
428
+ subject.start_at("http://#{host}/")
429
+
430
+ expect(yielded_javascript_strings).to match_array(
431
+ [
432
+ 'string #1',
433
+ 'string #2',
434
+ 'string #3',
435
+ 'string #4'
436
+ ]
437
+ )
438
+ end
439
+ end
440
+
441
+ describe "#every_javascript_comment" do
442
+ module TestAgentEveryJavaScriptComment
443
+ class TestApp < Sinatra::Base
444
+
445
+ set :host, 'example.com'
446
+ set :port, 80
447
+
448
+ get '/' do
449
+ <<~HTML
450
+ <html>
451
+ <head>
452
+ <script type="text/javascript" src="/javascript1.js"></script>
453
+ <script type="text/javascript">
454
+ // comment 3
455
+ var str3 = "string #3";
456
+ /*
457
+ comment 4
458
+ */
459
+ var str4 = 'string #4';
460
+ </script>
461
+ </head>
462
+ <body>
463
+ <a href="/link1">link1</a>
464
+ <a href="http://host2.example.com/offsite-link">offsite link</a>
465
+ <a href="/link2">link2</a>
466
+ </body>
467
+ </html>
468
+ HTML
469
+ end
470
+
471
+ get '/javascript1.js' do
472
+ content_type 'text/javascript'
473
+ <<~JS
474
+ // comment 1
475
+ var str1 = "string #1";
476
+ /* comment 2 */
477
+ var str2 = 'string #2';
478
+ JS
479
+ end
480
+ end
481
+ end
482
+
483
+ let(:host) { 'example.com' }
484
+
485
+ let(:test_app) { TestAgentEveryJavaScriptComment::TestApp }
486
+
487
+ before do
488
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
489
+ end
490
+
491
+ it "must yield every JavaScript comment from any <script> tag" do
492
+ yielded_javascript_comments = []
493
+
494
+ subject.every_javascript_comment do |comment|
495
+ yielded_javascript_comments << comment
496
+ end
497
+
498
+ subject.start_at("http://#{host}/")
499
+
500
+ expect(yielded_javascript_comments).to match_array(
501
+ [
502
+ "// comment 1\n",
503
+ "/* comment 2 */",
504
+ "// comment 3\n",
505
+ "/*\n comment 4\n */"
506
+ ]
507
+ )
508
+ end
509
+ end
510
+
511
+ describe "#every_comment" do
512
+ module TestAgentEveryComment
513
+ class TestApp < Sinatra::Base
514
+
515
+ set :host, 'example.com'
516
+ set :port, 80
517
+
518
+ get '/' do
519
+ <<~HTML
520
+ <html>
521
+ <head>
522
+ <!-- HTML comment 1 -->
523
+ <script type="text/javascript" src="/javascript1.js"></script>
524
+ <script type="text/javascript">
525
+ // JavaScript comment 3
526
+ var str3 = "string #3";
527
+ /*
528
+ JavaScript comment 4
529
+ */
530
+ var str4 = 'string #4';
531
+ </script>
532
+ </head>
533
+ <!-- -->
534
+ <body>
535
+ <!-- HTML comment 2 -->
536
+ <a href="/link1">link1</a>
537
+ <a href="http://host2.example.com/offsite-link">offsite link</a>
538
+ <a href="/link2">link2</a>
539
+ </body>
540
+ </html>
541
+ HTML
542
+ end
543
+
544
+ get '/javascript1.js' do
545
+ content_type 'text/javascript'
546
+ <<~JS
547
+ // JavaScript comment 1
548
+ var str1 = "string #1";
549
+ /* JavaScript comment 2 */
550
+ var str2 = 'string #2';
551
+ JS
552
+ end
553
+ end
554
+ end
555
+
556
+ let(:host) { 'example.com' }
557
+
558
+ let(:test_app) { TestAgentEveryComment::TestApp }
559
+
560
+ before do
561
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
562
+ end
563
+
564
+ it "must yield every HTML comment and every JavaScript comment from any <script> tag" do
565
+ yielded_comments = []
566
+
567
+ subject.every_comment do |comment|
568
+ yielded_comments << comment
569
+ end
570
+
571
+ subject.start_at("http://#{host}/")
572
+
573
+ expect(yielded_comments).to match_array(
574
+ [
575
+ "HTML comment 1",
576
+ "// JavaScript comment 1\n",
577
+ "/* JavaScript comment 2 */",
578
+ "// JavaScript comment 3\n",
579
+ "/*\n JavaScript comment 4\n */",
580
+ "HTML comment 2"
581
+ ]
582
+ )
583
+ end
584
+ end
585
+ end
@@ -0,0 +1,91 @@
1
+ require 'spec_helper'
2
+ require 'ronin/web/spider/archive'
3
+
4
+ require 'tmpdir'
5
+
6
+ describe Ronin::Web::Spider::Archive do
7
+ let(:root) { File.join(Dir.mktmpdir('ronin-web-spider')) }
8
+
9
+ subject { described_class.new(root) }
10
+
11
+ describe "#initialize" do
12
+ it "must set #root" do
13
+ expect(subject.root).to eq(root)
14
+ end
15
+ end
16
+
17
+ describe ".open" do
18
+ subject { described_class.open(root) }
19
+
20
+ it "must return a new #{described_class}" do
21
+ expect(subject).to be_kind_of(described_class)
22
+ end
23
+
24
+ context "when given a block" do
25
+ it "must yield the new #{described_class}" do
26
+ expect { |b|
27
+ described_class.open(root,&b)
28
+ }.to yield_with_args(described_class)
29
+ end
30
+ end
31
+
32
+ context "when the root directory does not exist" do
33
+ let(:root) { File.join(super(),'does-not-exist-yet') }
34
+
35
+ it "must create the given root directory" do
36
+ described_class.open(root)
37
+
38
+ expect(File.directory?(root)).to be(true)
39
+ end
40
+ end
41
+
42
+ context "when the root directory does exist" do
43
+ let(:root) { File.join(super(),'does-not-exist-yet') }
44
+
45
+ before { FileUtils.mkdir(root) }
46
+
47
+ it "must not raise an error" do
48
+ expect {
49
+ described_class.open(root)
50
+ }.to_not raise_error
51
+ end
52
+ end
53
+ end
54
+
55
+ describe "#write" do
56
+ let(:url) { URI('https://example.com/foo/bar.html') }
57
+ let(:body) { 'test file' }
58
+
59
+ before { subject.write(url,body) }
60
+
61
+ it "must automatically create parent directory" do
62
+ expect(File.directory?(File.join(root,'foo'))).to be(true)
63
+ end
64
+
65
+ it "must write the body into the file" do
66
+ expect(File.read(File.join(root,'foo','bar.html'))).to eq(body)
67
+ end
68
+
69
+ context "when the URL has a query string" do
70
+ let(:url) { URI('https://example.com/foo/bar.php?q=1') }
71
+
72
+ it "must include the query string as part of the file name" do
73
+ expect(File.read(File.join(root,'foo','bar.php?q=1'))).to eq(body)
74
+ end
75
+ end
76
+
77
+ context "when the URL path ends with a '/'" do
78
+ let(:url) { URI('https://example.com/foo/bar/') }
79
+
80
+ it "must write the body to an index.html file within the URL's path" do
81
+ expect(File.read(File.join(root,'foo','bar','index.html'))).to eq(body)
82
+ end
83
+ end
84
+ end
85
+
86
+ describe "#to_s" do
87
+ it "must return the root directory" do
88
+ expect(subject.to_s).to eq(root)
89
+ end
90
+ end
91
+ end
@@ -0,0 +1,27 @@
1
+ require 'rspec'
2
+ require 'sinatra/base'
3
+ require 'webmock/rspec'
4
+
5
+ require 'ronin/web/spider/agent'
6
+
7
+ RSpec.shared_context "example App" do
8
+ let(:host) { 'example.com' }
9
+
10
+ subject { Ronin::Web::Spider::Agent.new(host: host) }
11
+
12
+ def self.app(&block)
13
+ let(:app) do
14
+ klass = Class.new(Sinatra::Base)
15
+ klass.set :host, host
16
+ klass.set :port, 80
17
+ klass.class_eval(&block)
18
+ return klass
19
+ end
20
+
21
+ before do
22
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
23
+
24
+ subject.start_at("http://#{host}/")
25
+ end
26
+ end
27
+ end