ronin-web-spider 0.1.0.beta2 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/spec/agent_spec.rb DELETED
@@ -1,585 +0,0 @@
1
- require 'spec_helper'
2
- require 'ronin/web/spider/agent'
3
-
4
- require 'webmock/rspec'
5
- require 'sinatra/base'
6
-
7
- describe Ronin::Web::Spider::Agent do
8
- describe "#initialize" do
9
- context "when Ronin::Support::Network::HTTP.proxy is set" do
10
- let(:proxy_host) { 'example.com' }
11
- let(:proxy_port) { 8080 }
12
- let(:proxy_uri) { URI::HTTP.build(host: proxy_host, port: proxy_port) }
13
-
14
- before { Ronin::Support::Network::HTTP.proxy = proxy_uri }
15
-
16
- it "must parse Ronin::Support::Network::HTTP.proxy and set #proxy" do
17
- expect(subject.proxy).to be_kind_of(Spidr::Proxy)
18
- expect(subject.proxy.host).to eq(proxy_host)
19
- expect(subject.proxy.port).to eq(proxy_port)
20
- end
21
-
22
- after { Ronin::Support::Network::HTTP.proxy = nil }
23
- end
24
-
25
- context "when Ronin::Support::Network::HTTP.user_agent is set" do
26
- let(:user_agent) { 'Foo Bar' }
27
-
28
- before { Ronin::Support::Network::HTTP.user_agent = user_agent }
29
-
30
- it "must default #user_agent to Ronin::Support::Network::HTTP.user_agent" do
31
- expect(subject.user_agent).to eq(user_agent)
32
- end
33
-
34
- after { Ronin::Support::Network::HTTP.user_agent = nil }
35
- end
36
-
37
- context "when given the proxy: keyword argument" do
38
- let(:proxy_host) { 'example.com' }
39
- let(:proxy_port) { 8080 }
40
-
41
- context "and it's an Addressable::URI" do
42
- let(:proxy) { Addressable::URI.new(host: proxy_host, port: proxy_port) }
43
-
44
- subject { described_class.new(proxy: proxy) }
45
-
46
- it "must convert it to a Spidr::Proxy object" do
47
- expect(subject.proxy).to be_kind_of(Spidr::Proxy)
48
- expect(subject.proxy.host).to eq(proxy_host)
49
- expect(subject.proxy.port).to eq(proxy_port)
50
- end
51
- end
52
-
53
- context "and it's an URI::HTTP" do
54
- let(:proxy) { URI::HTTP.build(host: proxy_host, port: proxy_port) }
55
-
56
- subject { described_class.new(proxy: proxy) }
57
-
58
- it "must convert it to a Spidr::Proxy object" do
59
- expect(subject.proxy).to be_kind_of(Spidr::Proxy)
60
- expect(subject.proxy.host).to eq(proxy_host)
61
- expect(subject.proxy.port).to eq(proxy_port)
62
- end
63
- end
64
-
65
- context "and it's a Hash" do
66
- let(:proxy) do
67
- {host: proxy_host, port: proxy_port}
68
- end
69
-
70
- subject { described_class.new(proxy: proxy) }
71
-
72
- it "must convert it to a Spidr::Proxy object" do
73
- expect(subject.proxy).to be_kind_of(Spidr::Proxy)
74
- expect(subject.proxy.host).to eq(proxy_host)
75
- expect(subject.proxy.port).to eq(proxy_port)
76
- end
77
- end
78
-
79
- context "and it's a String" do
80
- let(:proxy) { "http://#{proxy_host}:#{proxy_port}" }
81
-
82
- subject { described_class.new(proxy: proxy) }
83
-
84
- it "must convert it to a Spidr::Proxy object" do
85
- expect(subject.proxy).to be_kind_of(Spidr::Proxy)
86
- expect(subject.proxy.host).to eq(proxy_host)
87
- expect(subject.proxy.port).to eq(proxy_port)
88
- end
89
- end
90
- end
91
-
92
- context "when given the user_agent: keyword argument" do
93
- context "and it's a String" do
94
- let(:user_agent) { "test user-agent" }
95
-
96
- subject { described_class.new(user_agent: user_agent) }
97
-
98
- it "must set the #user_agent" do
99
- expect(subject.user_agent).to eq(user_agent)
100
- end
101
- end
102
-
103
- context "and it's a Symbol" do
104
- let(:user_agent) { :chrome_linux }
105
- let(:expected_user_agent) do
106
- Ronin::Support::Network::HTTP::UserAgents[user_agent]
107
- end
108
-
109
- subject { described_class.new(user_agent: user_agent) }
110
-
111
- it "must map the Symbol to one of Ronin::Support::Network::HTTP::UserAgents" do
112
- expect(subject.user_agent).to eq(expected_user_agent)
113
- end
114
- end
115
- end
116
-
117
- it "must default #visited_hosts to nil" do
118
- expect(subject.visited_hosts).to be(nil)
119
- end
120
- end
121
-
122
- describe "#every_host" do
123
- module TestAgentEveryHost
124
- class Host1 < Sinatra::Base
125
-
126
- set :host, 'host1.example.com'
127
- set :port, 80
128
-
129
- get '/' do
130
- <<~HTML
131
- <html>
132
- <body>
133
- <a href="/link1">link1</a>
134
- <a href="http://host2.example.com/offsite-link">offsite link</a>
135
- <a href="/link2">link2</a>
136
- </body>
137
- </html>
138
- HTML
139
- end
140
-
141
- get '/link1' do
142
- '<html><body>got here</body></html>'
143
- end
144
-
145
- get '/link2' do
146
- '<html><body>got here</body></html>'
147
- end
148
- end
149
-
150
- class Host2 < Sinatra::Base
151
-
152
- set :host, 'host2.example.com'
153
- set :port, 80
154
-
155
- get '/offsite-link' do
156
- '<html><body>should not get here</body></html>'
157
- end
158
-
159
- end
160
- end
161
-
162
- let(:host1) { 'host1.example.com' }
163
- let(:host2) { 'host2.example.com' }
164
-
165
- let(:host1_app) { TestAgentEveryHost::Host1 }
166
- let(:host2_app) { TestAgentEveryHost::Host2 }
167
-
168
- before do
169
- stub_request(:any, /#{Regexp.escape(host1)}/).to_rack(host1_app)
170
- stub_request(:any, /#{Regexp.escape(host2)}/).to_rack(host2_app)
171
- end
172
-
173
- it "must yield every newly discovered hostname while spidering" do
174
- yielded_hosts = []
175
-
176
- subject.every_host do |host|
177
- yielded_hosts << host
178
- end
179
-
180
- subject.start_at("http://#{host1}/")
181
-
182
- expect(yielded_hosts).to eq([host1, host2])
183
- end
184
-
185
- it "must populate #visited_hosts" do
186
- subject.every_host { |host| }
187
- subject.start_at("http://#{host1}/")
188
-
189
- expect(subject.visited_hosts).to be_kind_of(Set)
190
- expect(subject.visited_hosts.entries).to eq([host1, host2])
191
- end
192
- end
193
-
194
- # TODO: need to figure out how to test #every_cert using webmock.
195
- describe "#every_cert"
196
-
197
- describe "#every_favicon" do
198
- module TestAgentEveryHost
199
- class TestApp < Sinatra::Base
200
-
201
- set :host, 'example.com'
202
- set :port, 80
203
-
204
- get '/' do
205
- <<~HTML
206
- <html>
207
- <head>
208
- <link rel="favicon" href="/favicon1.ico" type="image/x-icon"/>
209
- </head>
210
- <body>
211
- <a href="/link1">link1</a>
212
- <a href="http://host2.example.com/offsite-link">offsite link</a>
213
- <a href="/link2">link2</a>
214
- </body>
215
- </html>
216
- HTML
217
- end
218
-
219
- get '/favicon1.ico' do
220
- content_type 'image/x-icon'
221
-
222
- "favicon1"
223
- end
224
-
225
- get '/favicon2.ico' do
226
- content_type 'image/vnd.microsoft.icon'
227
-
228
- "favicon2"
229
- end
230
-
231
- get '/link1' do
232
- '<html><body>got here</body></html>'
233
- end
234
-
235
- get '/link2' do
236
- <<~HTML
237
- <html>
238
- <head>
239
- <link rel="favicon" href="/favicon2.ico" type="image/x-icon"/>
240
- </head>
241
- <body>got here</body>
242
- </html>
243
- HTML
244
- end
245
- end
246
- end
247
-
248
- let(:host) { 'example.com' }
249
-
250
- let(:test_app) { TestAgentEveryHost::TestApp }
251
-
252
- before do
253
- stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
254
- end
255
-
256
- it "must yield Spidr::Page objects for each encountered .ico file" do
257
- yielded_favicons = []
258
-
259
- subject.every_favicon do |favicon|
260
- yielded_favicons << favicon
261
- end
262
-
263
- subject.start_at("http://#{host}/")
264
-
265
- expect(yielded_favicons).to_not be_empty
266
-
267
- expect(yielded_favicons[0]).to be_kind_of(Spidr::Page)
268
- expect(yielded_favicons[0].content_type).to eq('image/x-icon')
269
- expect(yielded_favicons[0].url).to eq(URI("http://#{host}/favicon1.ico"))
270
-
271
- expect(yielded_favicons[1]).to be_kind_of(Spidr::Page)
272
- expect(yielded_favicons[1].content_type).to eq('image/vnd.microsoft.icon')
273
- expect(yielded_favicons[1].url).to eq(URI("http://#{host}/favicon2.ico"))
274
- end
275
- end
276
-
277
- describe "#every_html_comment" do
278
- module TestAgentEveryHTMLComment
279
- class TestApp < Sinatra::Base
280
-
281
- set :host, 'example.com'
282
- set :port, 80
283
-
284
- get '/' do
285
- <<~HTML
286
- <html>
287
- <head>
288
- <!-- comment 1 -->
289
- </head>
290
- <!-- -->
291
- <body>
292
- <!-- comment 2 -->
293
- </body>
294
- </html>
295
- HTML
296
- end
297
- end
298
- end
299
-
300
- let(:host) { 'example.com' }
301
-
302
- let(:test_app) { TestAgentEveryHTMLComment::TestApp }
303
-
304
- before do
305
- stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
306
- end
307
-
308
- it "must yield every non-empty/non-whitespace HTML comment String" do
309
- yielded_comments = []
310
-
311
- subject.every_html_comment do |comment|
312
- yielded_comments << comment
313
- end
314
-
315
- subject.start_at("http://#{host}/")
316
-
317
- expect(yielded_comments).to match_array(
318
- [
319
- 'comment 1',
320
- 'comment 2'
321
- ]
322
- )
323
- end
324
- end
325
-
326
- describe "#every_javascript" do
327
- module TestAgentEveryJavaScript
328
- class TestApp < Sinatra::Base
329
-
330
- set :host, 'example.com'
331
- set :port, 80
332
-
333
- get '/' do
334
- <<~HTML
335
- <html>
336
- <head>
337
- <script type="text/javascript" src="/javascript1.js"></script>
338
- <script type="text/javascript">javascript2</script>
339
- </head>
340
- <body>
341
- <a href="/link1">link1</a>
342
- <a href="http://host2.example.com/offsite-link">offsite link</a>
343
- <a href="/link2">link2</a>
344
- </body>
345
- </html>
346
- HTML
347
- end
348
-
349
- get '/javascript1.js' do
350
- content_type 'text/javascript'
351
- "javascript1"
352
- end
353
- end
354
- end
355
-
356
- let(:host) { 'example.com' }
357
-
358
- let(:test_app) { TestAgentEveryJavaScript::TestApp }
359
-
360
- before do
361
- stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
362
- end
363
-
364
- it "must yield both the contents of .js files and inline <script> tags" do
365
- yielded_javascripts = []
366
-
367
- subject.every_javascript do |js|
368
- yielded_javascripts << js
369
- end
370
-
371
- subject.start_at("http://#{host}/")
372
-
373
- expect(yielded_javascripts).to match_array(%w[javascript1 javascript2])
374
- end
375
- end
376
-
377
- describe "#every_javascript_string" do
378
- module TestAgentEveryJavaScriptString
379
- class TestApp < Sinatra::Base
380
-
381
- set :host, 'example.com'
382
- set :port, 80
383
-
384
- get '/' do
385
- <<~HTML
386
- <html>
387
- <head>
388
- <script type="text/javascript" src="/javascript1.js"></script>
389
- <script type="text/javascript">
390
- var str3 = "string #3";
391
- var str4 = 'string #4';
392
- </script>
393
- </head>
394
- <body>
395
- <a href="/link1">link1</a>
396
- <a href="http://host2.example.com/offsite-link">offsite link</a>
397
- <a href="/link2">link2</a>
398
- </body>
399
- </html>
400
- HTML
401
- end
402
-
403
- get '/javascript1.js' do
404
- content_type 'text/javascript'
405
- <<~JS
406
- var str1 = "string #1";
407
- var str2 = 'string #2';
408
- JS
409
- end
410
- end
411
- end
412
-
413
- let(:host) { 'example.com' }
414
-
415
- let(:test_app) { TestAgentEveryJavaScriptString::TestApp }
416
-
417
- before do
418
- stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
419
- end
420
-
421
- it "must yield every JavaScript string from any <script> tag" do
422
- yielded_javascript_strings = []
423
-
424
- subject.every_javascript_string do |string|
425
- yielded_javascript_strings << string
426
- end
427
-
428
- subject.start_at("http://#{host}/")
429
-
430
- expect(yielded_javascript_strings).to match_array(
431
- [
432
- 'string #1',
433
- 'string #2',
434
- 'string #3',
435
- 'string #4'
436
- ]
437
- )
438
- end
439
- end
440
-
441
- describe "#every_javascript_comment" do
442
- module TestAgentEveryJavaScriptComment
443
- class TestApp < Sinatra::Base
444
-
445
- set :host, 'example.com'
446
- set :port, 80
447
-
448
- get '/' do
449
- <<~HTML
450
- <html>
451
- <head>
452
- <script type="text/javascript" src="/javascript1.js"></script>
453
- <script type="text/javascript">
454
- // comment 3
455
- var str3 = "string #3";
456
- /*
457
- comment 4
458
- */
459
- var str4 = 'string #4';
460
- </script>
461
- </head>
462
- <body>
463
- <a href="/link1">link1</a>
464
- <a href="http://host2.example.com/offsite-link">offsite link</a>
465
- <a href="/link2">link2</a>
466
- </body>
467
- </html>
468
- HTML
469
- end
470
-
471
- get '/javascript1.js' do
472
- content_type 'text/javascript'
473
- <<~JS
474
- // comment 1
475
- var str1 = "string #1";
476
- /* comment 2 */
477
- var str2 = 'string #2';
478
- JS
479
- end
480
- end
481
- end
482
-
483
- let(:host) { 'example.com' }
484
-
485
- let(:test_app) { TestAgentEveryJavaScriptComment::TestApp }
486
-
487
- before do
488
- stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
489
- end
490
-
491
- it "must yield every JavaScript comment from any <script> tag" do
492
- yielded_javascript_comments = []
493
-
494
- subject.every_javascript_comment do |comment|
495
- yielded_javascript_comments << comment
496
- end
497
-
498
- subject.start_at("http://#{host}/")
499
-
500
- expect(yielded_javascript_comments).to match_array(
501
- [
502
- "// comment 1\n",
503
- "/* comment 2 */",
504
- "// comment 3\n",
505
- "/*\n comment 4\n */"
506
- ]
507
- )
508
- end
509
- end
510
-
511
- describe "#every_comment" do
512
- module TestAgentEveryComment
513
- class TestApp < Sinatra::Base
514
-
515
- set :host, 'example.com'
516
- set :port, 80
517
-
518
- get '/' do
519
- <<~HTML
520
- <html>
521
- <head>
522
- <!-- HTML comment 1 -->
523
- <script type="text/javascript" src="/javascript1.js"></script>
524
- <script type="text/javascript">
525
- // JavaScript comment 3
526
- var str3 = "string #3";
527
- /*
528
- JavaScript comment 4
529
- */
530
- var str4 = 'string #4';
531
- </script>
532
- </head>
533
- <!-- -->
534
- <body>
535
- <!-- HTML comment 2 -->
536
- <a href="/link1">link1</a>
537
- <a href="http://host2.example.com/offsite-link">offsite link</a>
538
- <a href="/link2">link2</a>
539
- </body>
540
- </html>
541
- HTML
542
- end
543
-
544
- get '/javascript1.js' do
545
- content_type 'text/javascript'
546
- <<~JS
547
- // JavaScript comment 1
548
- var str1 = "string #1";
549
- /* JavaScript comment 2 */
550
- var str2 = 'string #2';
551
- JS
552
- end
553
- end
554
- end
555
-
556
- let(:host) { 'example.com' }
557
-
558
- let(:test_app) { TestAgentEveryComment::TestApp }
559
-
560
- before do
561
- stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app)
562
- end
563
-
564
- it "must yield every HTML and JavaScript comment from any <script> tag" do
565
- yielded_comments = []
566
-
567
- subject.every_comment do |comment|
568
- yielded_comments << comment
569
- end
570
-
571
- subject.start_at("http://#{host}/")
572
-
573
- expect(yielded_comments).to match_array(
574
- [
575
- "HTML comment 1",
576
- "// JavaScript comment 1\n",
577
- "/* JavaScript comment 2 */",
578
- "// JavaScript comment 3\n",
579
- "/*\n JavaScript comment 4\n */",
580
- "HTML comment 2"
581
- ]
582
- )
583
- end
584
- end
585
- end
data/spec/archive_spec.rb DELETED
@@ -1,91 +0,0 @@
1
- require 'spec_helper'
2
- require 'ronin/web/spider/archive'
3
-
4
- require 'tmpdir'
5
-
6
- describe Ronin::Web::Spider::Archive do
7
- let(:root) { File.join(Dir.mktmpdir('ronin-web-spider')) }
8
-
9
- subject { described_class.new(root) }
10
-
11
- describe "#initialize" do
12
- it "must set #root" do
13
- expect(subject.root).to eq(root)
14
- end
15
- end
16
-
17
- describe ".open" do
18
- subject { described_class.open(root) }
19
-
20
- it "must return a new #{described_class}" do
21
- expect(subject).to be_kind_of(described_class)
22
- end
23
-
24
- context "when given a block" do
25
- it "must yield the new #{described_class}" do
26
- expect { |b|
27
- described_class.open(root,&b)
28
- }.to yield_with_args(described_class)
29
- end
30
- end
31
-
32
- context "when the root directory does not exist" do
33
- let(:root) { File.join(super(),'does-not-exist-yet') }
34
-
35
- it "must create the given root directory" do
36
- described_class.open(root)
37
-
38
- expect(File.directory?(root)).to be(true)
39
- end
40
- end
41
-
42
- context "when the root directory does exist" do
43
- let(:root) { File.join(super(),'does-not-exist-yet') }
44
-
45
- before { FileUtils.mkdir(root) }
46
-
47
- it "must not raise an error" do
48
- expect {
49
- described_class.open(root)
50
- }.to_not raise_error
51
- end
52
- end
53
- end
54
-
55
- describe "#write" do
56
- let(:url) { URI('https://example.com/foo/bar.html') }
57
- let(:body) { 'test file' }
58
-
59
- before { subject.write(url,body) }
60
-
61
- it "must automatically create parent directory" do
62
- expect(File.directory?(File.join(root,'foo'))).to be(true)
63
- end
64
-
65
- it "must write the body into the file" do
66
- expect(File.read(File.join(root,'foo','bar.html'))).to eq(body)
67
- end
68
-
69
- context "when the URL has a query string" do
70
- let(:url) { URI('https://example.com/foo/bar.php?q=1') }
71
-
72
- it "must include the query string as part of the file name" do
73
- expect(File.read(File.join(root,'foo','bar.php?q=1'))).to eq(body)
74
- end
75
- end
76
-
77
- context "when the URL path ends with a '/'" do
78
- let(:url) { URI('https://example.com/foo/bar/') }
79
-
80
- it "must write the body to an index.html file within the URL's path" do
81
- expect(File.read(File.join(root,'foo','bar','index.html'))).to eq(body)
82
- end
83
- end
84
- end
85
-
86
- describe "#to_s" do
87
- it "must return the root directory" do
88
- expect(subject.to_s).to eq(root)
89
- end
90
- end
91
- end
data/spec/example_app.rb DELETED
@@ -1,27 +0,0 @@
1
- require 'rspec'
2
- require 'sinatra/base'
3
- require 'webmock/rspec'
4
-
5
- require 'ronin/web/spider/agent'
6
-
7
- RSpec.shared_context "example App" do
8
- let(:host) { 'example.com' }
9
-
10
- subject { Ronin::Web::Spider::Agent.new(host: host) }
11
-
12
- def self.app(&block)
13
- let(:app) do
14
- klass = Class.new(Sinatra::Base)
15
- klass.set :host, host
16
- klass.set :port, 80
17
- klass.class_eval(&block)
18
- return klass
19
- end
20
-
21
- before do
22
- stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
23
-
24
- subject.start_at("http://#{host}/")
25
- end
26
- end
27
- end